diff --git a/.gitignore b/.gitignore
index 16389f34e..215b56e9a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -12,3 +12,7 @@ CMakeCache.txt
 doc
 
 apps/tensor_times_vector/tensor_times_vector
+
+.cache
+.vscode
+compile_commands.json
diff --git a/CMakeLists.txt b/CMakeLists.txt
index a6a80d9d1..5c405fb27 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -10,10 +10,12 @@ project(taco
   LANGUAGES C CXX
 )
 option(CUDA "Build for NVIDIA GPU (CUDA must be preinstalled)" OFF)
+option(ISPC "Build for Intel ISPC Compiler (ISPC Compiler must be preinstalled)" OFF)
 option(PYTHON "Build TACO for python environment" OFF)
-option(OPENMP "Build with OpenMP execution support" OFF)
+option(OPENMP "Build with OpenMP execution support" ON)
 option(COVERAGE "Build with code coverage analysis" OFF)
 set(TACO_FEATURE_CUDA 0)
+set(TACO_FEATURE_ISPC 0)
 set(TACO_FEATURE_OPENMP 0)
 set(TACO_FEATURE_PYTHON 0)
 if(CUDA)
@@ -22,6 +24,11 @@ if(CUDA)
   add_definitions(-DCUDA_BUILT)
   set(TACO_FEATURE_CUDA 1)
 endif(CUDA)
+if(ISPC)  # Opt-in ISPC backend: sets the ISPC_BUILT define and TACO_FEATURE_ISPC
+  message("-- Will use ISPC for code generation")
+  add_definitions(-DISPC_BUILT)
+  set(TACO_FEATURE_ISPC 1)
+endif(ISPC)
 if(OPENMP)
   message("-- Will use OpenMP for parallel execution")
   add_definitions(-DUSE_OPENMP)
diff --git a/include/taco/codegen/module.h b/include/taco/codegen/module.h
index 36eb34f1a..3df7c8e0f 100644
--- a/include/taco/codegen/module.h
+++ b/include/taco/codegen/module.h
@@ -68,6 +68,7 @@ class Module {
   
 private:
   std::stringstream source;
+  std::stringstream additional_source;
   std::stringstream header;
   std::string libname;
   std::string tmpdir;
diff --git a/include/taco/cuda.h b/include/taco/cuda.h
index aad6b5229..9c4a7aae9 100644
--- a/include/taco/cuda.h
+++ b/include/taco/cuda.h
@@ -9,7 +9,19 @@
   #define CUDA_BUILT false
 #endif
 
+#ifndef ISPC_BUILT
+  #define ISPC_BUILT false
+#endif
+
 namespace taco {
+
+/// Functions used by taco to interface with ISPC
+bool should_use_ISPC_codegen();
+void set_ISPC_codegen_enabled(bool enabled);
+bool is_ISPC_code_stream_enabled();
+void set_ISPC_code_stream_enabled(bool enabled);
+
+
 /// Functions used by taco to interface with CUDA (especially unified memory)
 /// Check if should use CUDA codegen
 bool should_use_CUDA_codegen();
diff --git a/include/taco/index_notation/transformations.h b/include/taco/index_notation/transformations.h
index 7aa2579ad..4d6ec6830 100644
--- a/include/taco/index_notation/transformations.h
+++ b/include/taco/index_notation/transformations.h
@@ -223,6 +223,9 @@ IndexStmt parallelizeOuterLoop(IndexStmt stmt);
  */
 IndexStmt reorderLoopsTopologically(IndexStmt stmt);
 
+IndexStmt loopFusionOverFission(IndexStmt stmt, Assignment assignment,
+  std::string side, int iters);
+
 /**
  * Performs scalar promotion so that reductions are done by accumulating into 
  * scalar temporaries whenever possible.
diff --git a/include/taco/ir/ir.h b/include/taco/ir/ir.h
index f852f26b1..96dc7d034 100644
--- a/include/taco/ir/ir.h
+++ b/include/taco/ir/ir.h
@@ -591,7 +591,7 @@ struct Switch : public StmtNode<Switch> {
   static const IRNodeType _type_info = IRNodeType::Switch;
 };
 
-enum class LoopKind {Serial, Static, Dynamic, Runtime, Vectorized, Static_Chunked};
+enum class LoopKind {Serial, Static, Dynamic, Runtime, Vectorized, Static_Chunked, Foreach, Mul_Thread, Init};
 
 /** A for loop from start to end by increment.
  * A vectorized loop will require the increment to be 1 and the
diff --git a/include/taco/ir/ir_printer.h b/include/taco/ir/ir_printer.h
index 4e50764e9..c2c505bf5 100644
--- a/include/taco/ir/ir_printer.h
+++ b/include/taco/ir/ir_printer.h
@@ -16,6 +16,7 @@ class IRPrinter : public IRVisitorStrict {
 public:
   IRPrinter(std::ostream& stream);
   IRPrinter(std::ostream& stream, bool color, bool simplify);
+  IRPrinter(std::ostream& stream, std::ostream& stream2, bool color, bool simplify);
   virtual ~IRPrinter();
 
   void setColor(bool color);
@@ -72,6 +73,7 @@ class IRPrinter : public IRVisitorStrict {
   virtual void visit(const Break*);
 
   std::ostream &stream;
+  std::ostream &stream2;
   int indent;
   bool color;
   bool simplify;
@@ -109,6 +111,7 @@ class IRPrinter : public IRVisitorStrict {
   void doIndent();
   void printBinOp(Expr a, Expr b, std::string op, Precedence precedence);
   bool needsParentheses(Precedence precedence);
+  void sendToStream(std::stringstream &stream);
 
   std::string keywordString(std::string);
   std::string commentString(std::string);
diff --git a/include/taco/ir_tags.h b/include/taco/ir_tags.h
index 5858a13e3..6a74be173 100644
--- a/include/taco/ir_tags.h
+++ b/include/taco/ir_tags.h
@@ -9,7 +9,7 @@ namespace taco {
 /// ParallelUnit::GPUWarp can be optionally used to allow for GPU warp-level primitives
 /// ParallelUnit::GPUThread causes for every iteration to be executed on a separate GPU thread
 enum class ParallelUnit {
-  NotParallel, DefaultUnit, GPUBlock, GPUWarp, GPUThread, CPUThread, CPUVector, CPUThreadGroupReduction, GPUBlockReduction, GPUWarpReduction
+  NotParallel, DefaultUnit, GPUBlock, GPUWarp, GPUThread, CPUThread, CPUVector, CPUThreadGroupReduction, GPUBlockReduction, GPUWarpReduction, CPUSimd, CPUSpmd // NOTE(review): ParallelUnit_NAMES presumably needs matching entries for CPUSimd and CPUSpmd in the same order -- confirm
 };
 extern const char *ParallelUnit_NAMES[];
 
diff --git a/include/taco/lower/lowerer_impl_imperative.h b/include/taco/lower/lowerer_impl_imperative.h
index 65f069fda..d743f5875 100644
--- a/include/taco/lower/lowerer_impl_imperative.h
+++ b/include/taco/lower/lowerer_impl_imperative.h
@@ -499,10 +499,13 @@ class LowererImplImperative : public LowererImpl {
 
   bool emitUnderivedGuards = true;
 
+  int loopDepth = 0;
   int inParallelLoopDepth = 0;
 
   std::map<ParallelUnit, ir::Expr> parallelUnitSizes;
   std::map<ParallelUnit, IndexVar> parallelUnitIndexVars;
+  std::map<int, ParallelUnit> forUnits; // key: loop depth, value: the ParallelUnit recorded for that depth
+  std::map<TensorVar,int> whereTempsWithLoopDepth; // NOTE(review): the int is presumably a loop depth (going by the name) -- confirm
 
   /// Keep track of what IndexVars have already been defined
   std::set<IndexVar> definedIndexVars;
diff --git a/include/taco/util/strings.h b/include/taco/util/strings.h
index 5dfb2f174..a3c3d863f 100644
--- a/include/taco/util/strings.h
+++ b/include/taco/util/strings.h
@@ -1,6 +1,7 @@
 #ifndef TACO_UTIL_STRINGS_H
 #define TACO_UTIL_STRINGS_H
 
+#include "taco/cuda.h"
 #include <string>
 #include <sstream>
 #include <vector>
@@ -8,6 +9,8 @@
 #include <iomanip>
 #include <limits>
 
+#include "taco/type.h"
+
 // To get the value of a compiler macro variable
 #define STRINGIFY(x) #x
 #define TO_STRING(x) STRINGIFY(x)
@@ -15,6 +18,25 @@
 namespace taco {
 namespace util {
 
+// NOTE(review): disabled draft of toStringISPC, which would print Int32/Int64
+// as "int32"/"int64"; enable it or drop this block before merging.
+// template <class T>
+// typename std::enable_if<!std::is_floating_point<T>::value, std::string>::type
+// toStringISPC(const T &val) {
+
+//   std::stringstream sstream;
+//   if (val == Int32) {
+//     sstream << "int32";
+//   }
+//   else if (val == Int64) {
+//     sstream << "int64";
+//   }
+//   else {
+//     sstream << val;
+//   }
+//   return sstream.str();
+// }
+
 /// Turn anything except floating points that can be written to a stream
 /// into a string.
 template <class T>
diff --git a/include/taco/version.h.in b/include/taco/version.h.in
index bc5559d7d..8ef507598 100644
--- a/include/taco/version.h.in
+++ b/include/taco/version.h.in
@@ -20,5 +20,6 @@
 #define TACO_FEATURE_OPENMP @TACO_FEATURE_OPENMP@
 #define TACO_FEATURE_PYTHON @TACO_FEATURE_PYTHON@
 #define TACO_FEATURE_CUDA   @TACO_FEATURE_CUDA@
+#define TACO_FEATURE_ISPC   @TACO_FEATURE_ISPC@
 
 #endif /* TACO_VERSION_H */
diff --git a/out/taco-uml/._taco.svg b/out/taco-uml/._taco.svg
new file mode 100755
index 000000000..e88dbd51b
Binary files /dev/null and b/out/taco-uml/._taco.svg differ
diff --git a/out/taco-uml/taco.svg b/out/taco-uml/taco.svg
new file mode 100644
index 000000000..57f7a18d1
--- /dev/null
+++ b/out/taco-uml/taco.svg
@@ -0,0 +1,878 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?><svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" contentScriptType="application/ecmascript" contentStyleType="text/css" height="1823px" preserveAspectRatio="none" style="width:3568px;height:1823px;background:#FFFFFF;" version="1.1" viewBox="0 0 3568 1823" width="3568px" zoomAndPan="magnify"><defs><filter height="300%" id="fujoep6dbpit" width="300%" x="-1" y="-1"><feGaussianBlur result="blurOut" stdDeviation="2.0"/><feColorMatrix in="blurOut" result="blurOut2" type="matrix" values="0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 .4 0"/><feOffset dx="4.0" dy="4.0" in="blurOut2" result="blurOut3"/><feBlend in="SourceGraphic" in2="blurOut3" mode="normal"/></filter></defs><g><!--MD5=[d414847e5e8717ca0c3531bdd138c8ba]
+class IntrusivePtr--><rect codeLine="4" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="60.8047" id="IntrusivePtr" style="stroke:#A80036;stroke-width:1.5;" width="103" x="632" y="7"/><ellipse cx="647" cy="23" fill="#ADD1B2" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M649.9688,28.6406 Q649.3906,28.9375 648.75,29.0781 Q648.1094,29.2344 647.4063,29.2344 Q644.9063,29.2344 643.5781,27.5938 Q642.2656,25.9375 642.2656,22.8125 Q642.2656,19.6875 643.5781,18.0313 Q644.9063,16.375 647.4063,16.375 Q648.1094,16.375 648.75,16.5313 Q649.4063,16.6875 649.9688,16.9844 L649.9688,19.7031 Q649.3438,19.125 648.75,18.8594 Q648.1563,18.5781 647.5313,18.5781 Q646.1875,18.5781 645.5,19.6563 Q644.8125,20.7188 644.8125,22.8125 Q644.8125,24.9063 645.5,25.9844 Q646.1875,27.0469 647.5313,27.0469 Q648.1563,27.0469 648.75,26.7813 Q649.3438,26.5 649.9688,25.9219 L649.9688,28.6406 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" lengthAdjust="spacing" textLength="71" x="661" y="27.1543">IntrusivePtr</text><line style="stroke:#A80036;stroke-width:1.5;" x1="633" x2="734" y1="39" y2="39"/><ellipse cx="643" cy="50" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;fill:none;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="33" x="652" y="53.2104">T *ptr</text><line style="stroke:#A80036;stroke-width:1.5;" x1="633" x2="734" y1="59.8047" y2="59.8047"/><!--MD5=[9fb058d7a838b7ba6ed26398a5e03f68]
+class Uncopyable--><rect codeLine="7" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="48" id="Uncopyable" style="stroke:#A80036;stroke-width:1.5;" width="105" x="786" y="244"/><ellipse cx="801" cy="260" fill="#ADD1B2" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M803.9688,265.6406 Q803.3906,265.9375 802.75,266.0781 Q802.1094,266.2344 801.4063,266.2344 Q798.9063,266.2344 797.5781,264.5938 Q796.2656,262.9375 796.2656,259.8125 Q796.2656,256.6875 797.5781,255.0313 Q798.9063,253.375 801.4063,253.375 Q802.1094,253.375 802.75,253.5313 Q803.4063,253.6875 803.9688,253.9844 L803.9688,256.7031 Q803.3438,256.125 802.75,255.8594 Q802.1563,255.5781 801.5313,255.5781 Q800.1875,255.5781 799.5,256.6563 Q798.8125,257.7188 798.8125,259.8125 Q798.8125,261.9063 799.5,262.9844 Q800.1875,264.0469 801.5313,264.0469 Q802.1563,264.0469 802.75,263.7813 Q803.3438,263.5 803.9688,262.9219 L803.9688,265.6406 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" lengthAdjust="spacing" textLength="73" x="815" y="264.1543">Uncopyable</text><line style="stroke:#A80036;stroke-width:1.5;" x1="787" x2="890" y1="276" y2="276"/><line style="stroke:#A80036;stroke-width:1.5;" x1="787" x2="890" y1="284" y2="284"/><!--MD5=[f38687c19e1720eba4a1ab1343a37015]
+class IRNode--><rect codeLine="9" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="73.6094" id="IRNode" style="stroke:#A80036;stroke-width:1.5;" width="288" x="197.5" y="548"/><ellipse cx="315.25" cy="564" fill="#ADD1B2" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M318.2188,569.6406 Q317.6406,569.9375 317,570.0781 Q316.3594,570.2344 315.6563,570.2344 Q313.1563,570.2344 311.8281,568.5938 Q310.5156,566.9375 310.5156,563.8125 Q310.5156,560.6875 311.8281,559.0313 Q313.1563,557.375 315.6563,557.375 Q316.3594,557.375 317,557.5313 Q317.6563,557.6875 318.2188,557.9844 L318.2188,560.7031 Q317.5938,560.125 317,559.8594 Q316.4063,559.5781 315.7813,559.5781 Q314.4375,559.5781 313.75,560.6563 Q313.0625,561.7188 313.0625,563.8125 Q313.0625,565.9063 313.75,566.9844 Q314.4375,568.0469 315.7813,568.0469 Q316.4063,568.0469 317,567.7813 Q317.5938,567.5 318.2188,566.9219 L318.2188,569.6406 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" lengthAdjust="spacing" textLength="44" x="335.75" y="568.1543">IRNode</text><line style="stroke:#A80036;stroke-width:1.5;" x1="198.5" x2="484.5" y1="580" y2="580"/><line style="stroke:#A80036;stroke-width:1.5;" x1="198.5" x2="484.5" y1="588" y2="588"/><ellipse cx="208.5" cy="599" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="262" x="217.5" y="602.2104">virtual void accept(IRVisitorStrict *v) const = 0</text><ellipse cx="208.5" cy="611.8047" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="231" x="217.5" y="615.0151">virtual IRNodeType type_info() const = 0;</text><!--MD5=[bc9d8c255d7fbd519a9f6a6cf76a7a1b]
+class BaseStmtNode--><rect codeLine="14" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="48" id="BaseStmtNode" style="stroke:#A80036;stroke-width:1.5;" width="125" x="110" y="830"/><ellipse cx="125" cy="846" fill="#ADD1B2" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M127.9688,851.6406 Q127.3906,851.9375 126.75,852.0781 Q126.1094,852.2344 125.4063,852.2344 Q122.9063,852.2344 121.5781,850.5938 Q120.2656,848.9375 120.2656,845.8125 Q120.2656,842.6875 121.5781,841.0313 Q122.9063,839.375 125.4063,839.375 Q126.1094,839.375 126.75,839.5313 Q127.4063,839.6875 127.9688,839.9844 L127.9688,842.7031 Q127.3438,842.125 126.75,841.8594 Q126.1563,841.5781 125.5313,841.5781 Q124.1875,841.5781 123.5,842.6563 Q122.8125,843.7188 122.8125,845.8125 Q122.8125,847.9063 123.5,848.9844 Q124.1875,850.0469 125.5313,850.0469 Q126.1563,850.0469 126.75,849.7813 Q127.3438,849.5 127.9688,848.9219 L127.9688,851.6406 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" lengthAdjust="spacing" textLength="93" x="139" y="850.1543">BaseStmtNode</text><line style="stroke:#A80036;stroke-width:1.5;" x1="111" x2="234" y1="862" y2="862"/><line style="stroke:#A80036;stroke-width:1.5;" x1="111" x2="234" y1="870" y2="870"/><!--MD5=[27b83928eb4ae87e2fc2e82e735e02cd]
+class BaseExprNode--><rect codeLine="15" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="60.8047" id="BaseExprNode" style="stroke:#A80036;stroke-width:1.5;" width="123" x="315" y="823.5"/><ellipse cx="330" cy="839.5" fill="#ADD1B2" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M332.9688,845.1406 Q332.3906,845.4375 331.75,845.5781 Q331.1094,845.7344 330.4063,845.7344 Q327.9063,845.7344 326.5781,844.0938 Q325.2656,842.4375 325.2656,839.3125 Q325.2656,836.1875 326.5781,834.5313 Q327.9063,832.875 330.4063,832.875 Q331.1094,832.875 331.75,833.0313 Q332.4063,833.1875 332.9688,833.4844 L332.9688,836.2031 Q332.3438,835.625 331.75,835.3594 Q331.1563,835.0781 330.5313,835.0781 Q329.1875,835.0781 328.5,836.1563 Q327.8125,837.2188 327.8125,839.3125 Q327.8125,841.4063 328.5,842.4844 Q329.1875,843.5469 330.5313,843.5469 Q331.1563,843.5469 331.75,843.2813 Q332.3438,843 332.9688,842.4219 L332.9688,845.1406 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" lengthAdjust="spacing" textLength="91" x="344" y="843.6543">BaseExprNode</text><line style="stroke:#A80036;stroke-width:1.5;" x1="316" x2="437" y1="855.5" y2="855.5"/><ellipse cx="326" cy="866.5" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;fill:none;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="78" x="335" y="869.7104">Datatype type</text><line style="stroke:#A80036;stroke-width:1.5;" x1="316" x2="437" y1="876.3047" y2="876.3047"/><!--MD5=[d94a4fdce57fa90edc62507e0f6859c0]
+class StmtNode--><rect codeLine="19" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="60.8047" id="StmtNode" style="stroke:#A80036;stroke-width:1.5;" width="225" x="15" y="1198"/><ellipse cx="92.25" cy="1214" fill="#ADD1B2" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M95.2188,1219.6406 Q94.6406,1219.9375 94,1220.0781 Q93.3594,1220.2344 92.6563,1220.2344 Q90.1563,1220.2344 88.8281,1218.5938 Q87.5156,1216.9375 87.5156,1213.8125 Q87.5156,1210.6875 88.8281,1209.0313 Q90.1563,1207.375 92.6563,1207.375 Q93.3594,1207.375 94,1207.5313 Q94.6563,1207.6875 95.2188,1207.9844 L95.2188,1210.7031 Q94.5938,1210.125 94,1209.8594 Q93.4063,1209.5781 92.7813,1209.5781 Q91.4375,1209.5781 90.75,1210.6563 Q90.0625,1211.7188 90.0625,1213.8125 Q90.0625,1215.9063 90.75,1216.9844 Q91.4375,1218.0469 92.7813,1218.0469 Q93.4063,1218.0469 94,1217.7813 Q94.5938,1217.5 95.2188,1216.9219 L95.2188,1219.6406 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" lengthAdjust="spacing" textLength="62" x="112.75" y="1218.1543">StmtNode</text><line style="stroke:#A80036;stroke-width:1.5;" x1="16" x2="239" y1="1230" y2="1230"/><line style="stroke:#A80036;stroke-width:1.5;" x1="16" x2="239" y1="1238" y2="1238"/><ellipse cx="26" cy="1249" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="199" x="35" y="1252.2104">void accept(IRVisitorStrict *v) const</text><!--MD5=[475d6310b0690b98eac8d3436b0f8c3b]
+class ExprNode--><rect codeLine="22" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="60.8047" id="ExprNode" style="stroke:#A80036;stroke-width:1.5;" width="225" x="275" y="1198"/><ellipse cx="353.25" cy="1214" fill="#ADD1B2" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M356.2188,1219.6406 Q355.6406,1219.9375 355,1220.0781 Q354.3594,1220.2344 353.6563,1220.2344 Q351.1563,1220.2344 349.8281,1218.5938 Q348.5156,1216.9375 348.5156,1213.8125 Q348.5156,1210.6875 349.8281,1209.0313 Q351.1563,1207.375 353.6563,1207.375 Q354.3594,1207.375 355,1207.5313 Q355.6563,1207.6875 356.2188,1207.9844 L356.2188,1210.7031 Q355.5938,1210.125 355,1209.8594 Q354.4063,1209.5781 353.7813,1209.5781 Q352.4375,1209.5781 351.75,1210.6563 Q351.0625,1211.7188 351.0625,1213.8125 Q351.0625,1215.9063 351.75,1216.9844 Q352.4375,1218.0469 353.7813,1218.0469 Q354.4063,1218.0469 355,1217.7813 Q355.5938,1217.5 356.2188,1216.9219 L356.2188,1219.6406 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" lengthAdjust="spacing" textLength="60" x="373.75" y="1218.1543">ExprNode</text><line style="stroke:#A80036;stroke-width:1.5;" x1="276" x2="499" y1="1230" y2="1230"/><line style="stroke:#A80036;stroke-width:1.5;" x1="276" x2="499" y1="1238" y2="1238"/><ellipse cx="286" cy="1249" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="199" x="295" y="1252.2104">void accept(IRVisitorStrict *v) const</text><!--MD5=[a8ff5e7d622655153c4b3f7a4e4aeffe]
+class IRHandle--><rect codeLine="32" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="60.8047" id="IRHandle" style="stroke:#A80036;stroke-width:1.5;" width="225" x="72" y="237.5"/><ellipse cx="152.75" cy="253.5" fill="#ADD1B2" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M155.7188,259.1406 Q155.1406,259.4375 154.5,259.5781 Q153.8594,259.7344 153.1563,259.7344 Q150.6563,259.7344 149.3281,258.0938 Q148.0156,256.4375 148.0156,253.3125 Q148.0156,250.1875 149.3281,248.5313 Q150.6563,246.875 153.1563,246.875 Q153.8594,246.875 154.5,247.0313 Q155.1563,247.1875 155.7188,247.4844 L155.7188,250.2031 Q155.0938,249.625 154.5,249.3594 Q153.9063,249.0781 153.2813,249.0781 Q151.9375,249.0781 151.25,250.1563 Q150.5625,251.2188 150.5625,253.3125 Q150.5625,255.4063 151.25,256.4844 Q151.9375,257.5469 153.2813,257.5469 Q153.9063,257.5469 154.5,257.2813 Q155.0938,257 155.7188,256.4219 L155.7188,259.1406 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" lengthAdjust="spacing" textLength="55" x="173.25" y="257.6543">IRHandle</text><line style="stroke:#A80036;stroke-width:1.5;" x1="73" x2="296" y1="269.5" y2="269.5"/><line style="stroke:#A80036;stroke-width:1.5;" x1="73" x2="296" y1="277.5" y2="277.5"/><ellipse cx="83" cy="288.5" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="199" x="92" y="291.7104">void accept(IRVisitorStrict *v) const</text><!--MD5=[45d7a04dc863bc0ed8f0c57430a02d4a]
+class Expr--><rect codeLine="35" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="48" id="Expr" style="stroke:#A80036;stroke-width:1.5;" width="59" x="7" y="561"/><ellipse cx="22" cy="577" fill="#ADD1B2" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M24.9688,582.6406 Q24.3906,582.9375 23.75,583.0781 Q23.1094,583.2344 22.4063,583.2344 Q19.9063,583.2344 18.5781,581.5938 Q17.2656,579.9375 17.2656,576.8125 Q17.2656,573.6875 18.5781,572.0313 Q19.9063,570.375 22.4063,570.375 Q23.1094,570.375 23.75,570.5313 Q24.4063,570.6875 24.9688,570.9844 L24.9688,573.7031 Q24.3438,573.125 23.75,572.8594 Q23.1563,572.5781 22.5313,572.5781 Q21.1875,572.5781 20.5,573.6563 Q19.8125,574.7188 19.8125,576.8125 Q19.8125,578.9063 20.5,579.9844 Q21.1875,581.0469 22.5313,581.0469 Q23.1563,581.0469 23.75,580.7813 Q24.3438,580.5 24.9688,579.9219 L24.9688,582.6406 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" lengthAdjust="spacing" textLength="27" x="36" y="581.1543">Expr</text><line style="stroke:#A80036;stroke-width:1.5;" x1="8" x2="65" y1="593" y2="593"/><line style="stroke:#A80036;stroke-width:1.5;" x1="8" x2="65" y1="601" y2="601"/><!--MD5=[add513dd89cf3f02144ebc6704fab9f7]
+class Stmt--><rect codeLine="36" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="48" id="Stmt" style="stroke:#A80036;stroke-width:1.5;" width="61" x="101" y="561"/><ellipse cx="116" cy="577" fill="#ADD1B2" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M118.9688,582.6406 Q118.3906,582.9375 117.75,583.0781 Q117.1094,583.2344 116.4063,583.2344 Q113.9063,583.2344 112.5781,581.5938 Q111.2656,579.9375 111.2656,576.8125 Q111.2656,573.6875 112.5781,572.0313 Q113.9063,570.375 116.4063,570.375 Q117.1094,570.375 117.75,570.5313 Q118.4063,570.6875 118.9688,570.9844 L118.9688,573.7031 Q118.3438,573.125 117.75,572.8594 Q117.1563,572.5781 116.5313,572.5781 Q115.1875,572.5781 114.5,573.6563 Q113.8125,574.7188 113.8125,576.8125 Q113.8125,578.9063 114.5,579.9844 Q115.1875,581.0469 116.5313,581.0469 Q117.1563,581.0469 117.75,580.7813 Q118.3438,580.5 118.9688,579.9219 L118.9688,582.6406 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" lengthAdjust="spacing" textLength="29" x="130" y="581.1543">Stmt</text><line style="stroke:#A80036;stroke-width:1.5;" x1="102" x2="161" y1="593" y2="593"/><line style="stroke:#A80036;stroke-width:1.5;" x1="102" x2="161" y1="601" y2="601"/><!--MD5=[927685d34b77cdaffb6bcd7c2ecdcc1a]
+class IRVisitorStrict--><rect codeLine="47" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="60.8047" id="IRVisitorStrict" style="stroke:#A80036;stroke-width:1.5;" width="262" x="2676.5" y="7"/><ellipse cx="2761.75" cy="23" fill="#B4A7E5" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M2757.6719,18.7656 L2757.6719,16.6094 L2765.0625,16.6094 L2765.0625,18.7656 L2762.5938,18.7656 L2762.5938,26.8438 L2765.0625,26.8438 L2765.0625,29 L2757.6719,29 L2757.6719,26.8438 L2760.1406,26.8438 L2760.1406,18.7656 L2757.6719,18.7656 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" font-style="italic" lengthAdjust="spacing" textLength="83" x="2782.25" y="27.1543">IRVisitorStrict</text><line style="stroke:#A80036;stroke-width:1.5;" x1="2677.5" x2="2937.5" y1="39" y2="39"/><line style="stroke:#A80036;stroke-width:1.5;" x1="2677.5" x2="2937.5" y1="47" y2="47"/><ellipse cx="2687.5" cy="58" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="236" x="2696.5" y="61.2104">virtual void visit(const IRNode*) const = 0</text><!--MD5=[b78282c203133343885c01c420157c8a]
+class IRVisitor--><rect codeLine="55" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="60.8047" id="IRVisitor" style="stroke:#A80036;stroke-width:1.5;" width="203" x="2387" y="237.5"/><ellipse cx="2459.25" cy="253.5" fill="#ADD1B2" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M2462.2188,259.1406 Q2461.6406,259.4375 2461,259.5781 Q2460.3594,259.7344 2459.6563,259.7344 Q2457.1563,259.7344 2455.8281,258.0938 Q2454.5156,256.4375 2454.5156,253.3125 Q2454.5156,250.1875 2455.8281,248.5313 Q2457.1563,246.875 2459.6563,246.875 Q2460.3594,246.875 2461,247.0313 Q2461.6563,247.1875 2462.2188,247.4844 L2462.2188,250.2031 Q2461.5938,249.625 2461,249.3594 Q2460.4063,249.0781 2459.7813,249.0781 Q2458.4375,249.0781 2457.75,250.1563 Q2457.0625,251.2188 2457.0625,253.3125 Q2457.0625,255.4063 2457.75,256.4844 Q2458.4375,257.5469 2459.7813,257.5469 Q2460.4063,257.5469 2461,257.2813 Q2461.5938,257 2462.2188,256.4219 L2462.2188,259.1406 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" lengthAdjust="spacing" textLength="50" x="2479.75" y="257.6543">IRVisitor</text><line style="stroke:#A80036;stroke-width:1.5;" x1="2388" x2="2589" y1="269.5" y2="269.5"/><line style="stroke:#A80036;stroke-width:1.5;" x1="2388" x2="2589" y1="277.5" y2="277.5"/><ellipse cx="2398" cy="288.5" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="177" x="2407" y="291.7104">virtual void visit(const IRNode*)</text><!--MD5=[e7ea7c5d2ec9672a3f65e9628a854185]
+class IRRewriter--><rect codeLine="59" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="137.6328" id="IRRewriter" style="stroke:#A80036;stroke-width:1.5;" width="238" x="2688.5" y="199"/><ellipse cx="2772.25" cy="215" fill="#ADD1B2" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M2775.2188,220.6406 Q2774.6406,220.9375 2774,221.0781 Q2773.3594,221.2344 2772.6563,221.2344 Q2770.1563,221.2344 2768.8281,219.5938 Q2767.5156,217.9375 2767.5156,214.8125 Q2767.5156,211.6875 2768.8281,210.0313 Q2770.1563,208.375 2772.6563,208.375 Q2773.3594,208.375 2774,208.5313 Q2774.6563,208.6875 2775.2188,208.9844 L2775.2188,211.7031 Q2774.5938,211.125 2774,210.8594 Q2773.4063,210.5781 2772.7813,210.5781 Q2771.4375,210.5781 2770.75,211.6563 Q2770.0625,212.7188 2770.0625,214.8125 Q2770.0625,216.9063 2770.75,217.9844 Q2771.4375,219.0469 2772.7813,219.0469 Q2773.4063,219.0469 2774,218.7813 Q2774.5938,218.5 2775.2188,217.9219 L2775.2188,220.6406 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" lengthAdjust="spacing" textLength="62" x="2792.75" y="219.1543">IRRewriter</text><line style="stroke:#A80036;stroke-width:1.5;" x1="2689.5" x2="2925.5" y1="231" y2="231"/><polygon fill="none" points="2699.5,237,2703.5,241,2699.5,245,2695.5,241" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="54" x="2708.5" y="245.2104">Expr expr</text><polygon fill="none" points="2699.5,249.8047,2703.5,253.8047,2699.5,257.8047,2695.5,253.8047" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="57" x="2708.5" y="258.0151">Stmt stmt</text><line style="stroke:#A80036;stroke-width:1.5;" x1="2689.5" x2="2925.5" y1="264.6094" y2="264.6094"/><polygon fill="#FFFF44" points="2699.5,270.6094,2703.5,274.6094,2699.5,278.6094,2695.5,274.6094" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" 
font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="210" x="2708.5" y="278.8198">virtual void visit(const ExprNode* op)</text><polygon fill="#FFFF44" points="2699.5,283.4141,2703.5,287.4141,2699.5,291.4141,2695.5,287.4141" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="212" x="2708.5" y="291.6245">virtual void visit(const StmtNode* op)</text><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="0" x="2712.5" y="304.4292"/><ellipse cx="2699.5" cy="314.0234" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="102" x="2708.5" y="317.2339">Expr rewrite(Expr)</text><ellipse cx="2699.5" cy="326.8281" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="106" x="2708.5" y="330.0386">Stmt rewrite(Stmt)</text><!--MD5=[fc5b2d51c8ad612433d8a39d4bdd37c4]
+class IRPrinter--><rect codeLine="71" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="278.4844" id="IRPrinter" style="stroke:#A80036;stroke-width:1.5;" width="430" x="3008.5" y="129"/><ellipse cx="3192.75" cy="145" fill="#ADD1B2" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M3195.7188,150.6406 Q3195.1406,150.9375 3194.5,151.0781 Q3193.8594,151.2344 3193.1563,151.2344 Q3190.6563,151.2344 3189.3281,149.5938 Q3188.0156,147.9375 3188.0156,144.8125 Q3188.0156,141.6875 3189.3281,140.0313 Q3190.6563,138.375 3193.1563,138.375 Q3193.8594,138.375 3194.5,138.5313 Q3195.1563,138.6875 3195.7188,138.9844 L3195.7188,141.7031 Q3195.0938,141.125 3194.5,140.8594 Q3193.9063,140.5781 3193.2813,140.5781 Q3191.9375,140.5781 3191.25,141.6563 Q3190.5625,142.7188 3190.5625,144.8125 Q3190.5625,146.9063 3191.25,147.9844 Q3191.9375,149.0469 3193.2813,149.0469 Q3193.9063,149.0469 3194.5,148.7813 Q3195.0938,148.5 3195.7188,147.9219 L3195.7188,150.6406 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" lengthAdjust="spacing" textLength="53" x="3213.25" y="149.1543">IRPrinter</text><line style="stroke:#A80036;stroke-width:1.5;" x1="3009.5" x2="3437.5" y1="161" y2="161"/><polygon fill="none" points="3019.5,167,3023.5,171,3019.5,175,3015.5,171" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="129" x="3028.5" y="175.2104">std::ostream &amp;stream</text><polygon fill="none" points="3019.5,179.8047,3023.5,183.8047,3019.5,187.8047,3015.5,183.8047" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="136" x="3028.5" y="188.0151">std::ostream &amp;stream2</text><polygon fill="none" points="3019.5,192.6094,3023.5,196.6094,3019.5,200.6094,3015.5,196.6094" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" 
textLength="53" x="3028.5" y="200.8198">int indent</text><polygon fill="none" points="3019.5,205.4141,3023.5,209.4141,3019.5,213.4141,3015.5,209.4141" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="56" x="3028.5" y="213.6245">bool color</text><polygon fill="none" points="3019.5,218.2188,3023.5,222.2188,3019.5,226.2188,3015.5,222.2188" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="72" x="3028.5" y="226.4292">bool simplify</text><polygon fill="none" points="3019.5,231.0234,3023.5,235.0234,3019.5,239.0234,3015.5,235.0234" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="102" x="3028.5" y="239.2339">enum Precedence</text><polygon fill="none" points="3019.5,243.8281,3023.5,247.8281,3019.5,251.8281,3015.5,247.8281" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="239" x="3028.5" y="252.0386">Precedence parentPrecedence = BOTTOM</text><polygon fill="none" points="3019.5,256.6328,3023.5,260.6328,3019.5,264.6328,3015.5,260.6328" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="204" x="3028.5" y="264.8433">NameGenerator varNameGenerator</text><polygon fill="none" points="3019.5,269.4375,3023.5,273.4375,3019.5,277.4375,3015.5,273.4375" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="237" x="3028.5" y="277.6479">scopedMap&lt;Expr, std::String&gt; varNames</text><line style="stroke:#A80036;stroke-width:1.5;" x1="3009.5" x2="3437.5" y1="284.2422" y2="284.2422"/><polygon fill="#FFFF44" 
points="3019.5,290.2422,3023.5,294.2422,3019.5,298.2422,3015.5,294.2422" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="84" x="3028.5" y="298.4526">void doIndent()</text><polygon fill="#FFFF44" points="3019.5,303.0469,3023.5,307.0469,3019.5,311.0469,3015.5,307.0469" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="404" x="3028.5" y="311.2573">void printBinOp(Expr a, Expr b, std::string op, Precedence precedence)</text><polygon fill="#FFFF44" points="3019.5,315.8516,3023.5,319.8516,3019.5,323.8516,3015.5,319.8516" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="131" x="3028.5" y="324.062">void fewMoreMethods()</text><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="0" x="3032.5" y="336.8667"/><polygon fill="#FFFF44" points="3019.5,341.4609,3023.5,345.4609,3019.5,349.4609,3015.5,345.4609" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="192" x="3028.5" y="349.6714">virtual void visit(const ExprNode*)</text><polygon fill="#FFFF44" points="3019.5,354.2656,3023.5,358.2656,3019.5,362.2656,3015.5,358.2656" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="194" x="3028.5" y="362.4761">virtual void visit(const StmtNode*)</text><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="0" x="3032.5" y="375.2808"/><ellipse cx="3019.5" cy="384.875" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="112" x="3028.5" y="388.0854">setColor(bool 
color)</text><ellipse cx="3019.5" cy="397.6797" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="61" x="3028.5" y="400.8901">print(Stmt)</text><!--MD5=[5f8d54360f7c21960948de60fa30257d]
+class IRVerifier--><rect codeLine="92" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="48" id="IRVerifier" style="stroke:#A80036;stroke-width:1.5;" width="87" x="2288" y="561"/><ellipse cx="2303" cy="577" fill="#ADD1B2" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M2305.9688,582.6406 Q2305.3906,582.9375 2304.75,583.0781 Q2304.1094,583.2344 2303.4063,583.2344 Q2300.9063,583.2344 2299.5781,581.5938 Q2298.2656,579.9375 2298.2656,576.8125 Q2298.2656,573.6875 2299.5781,572.0313 Q2300.9063,570.375 2303.4063,570.375 Q2304.1094,570.375 2304.75,570.5313 Q2305.4063,570.6875 2305.9688,570.9844 L2305.9688,573.7031 Q2305.3438,573.125 2304.75,572.8594 Q2304.1563,572.5781 2303.5313,572.5781 Q2302.1875,572.5781 2301.5,573.6563 Q2300.8125,574.7188 2300.8125,576.8125 Q2300.8125,578.9063 2301.5,579.9844 Q2302.1875,581.0469 2303.5313,581.0469 Q2304.1563,581.0469 2304.75,580.7813 Q2305.3438,580.5 2305.9688,579.9219 L2305.9688,582.6406 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" lengthAdjust="spacing" textLength="55" x="2317" y="581.1543">IRVerifier</text><line style="stroke:#A80036;stroke-width:1.5;" x1="2289" x2="2374" y1="593" y2="593"/><line style="stroke:#A80036;stroke-width:1.5;" x1="2289" x2="2374" y1="601" y2="601"/><!--MD5=[1e59d9c8d5cb32d21caddc96a281f60c]
+class ExpressionSimplifier--><rect codeLine="101" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="48" id="ExpressionSimplifier" style="stroke:#A80036;stroke-width:1.5;" width="156" x="2410.5" y="561"/><ellipse cx="2425.5" cy="577" fill="#ADD1B2" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M2428.4688,582.6406 Q2427.8906,582.9375 2427.25,583.0781 Q2426.6094,583.2344 2425.9063,583.2344 Q2423.4063,583.2344 2422.0781,581.5938 Q2420.7656,579.9375 2420.7656,576.8125 Q2420.7656,573.6875 2422.0781,572.0313 Q2423.4063,570.375 2425.9063,570.375 Q2426.6094,570.375 2427.25,570.5313 Q2427.9063,570.6875 2428.4688,570.9844 L2428.4688,573.7031 Q2427.8438,573.125 2427.25,572.8594 Q2426.6563,572.5781 2426.0313,572.5781 Q2424.6875,572.5781 2424,573.6563 Q2423.3125,574.7188 2423.3125,576.8125 Q2423.3125,578.9063 2424,579.9844 Q2424.6875,581.0469 2426.0313,581.0469 Q2426.6563,581.0469 2427.25,580.7813 Q2427.8438,580.5 2428.4688,579.9219 L2428.4688,582.6406 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" lengthAdjust="spacing" textLength="124" x="2439.5" y="581.1543">ExpressionSimplifier</text><line style="stroke:#A80036;stroke-width:1.5;" x1="2411.5" x2="2565.5" y1="593" y2="593"/><line style="stroke:#A80036;stroke-width:1.5;" x1="2411.5" x2="2565.5" y1="601" y2="601"/><!--MD5=[09d0ace23740abc72ce7e8b4f8ae65c7]
+class RemoveRedundantStatements--><rect codeLine="105" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="48" id="RemoveRedundantStatements" style="stroke:#A80036;stroke-width:1.5;" width="223" x="2602" y="561"/><ellipse cx="2617" cy="577" fill="#ADD1B2" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M2619.9688,582.6406 Q2619.3906,582.9375 2618.75,583.0781 Q2618.1094,583.2344 2617.4063,583.2344 Q2614.9063,583.2344 2613.5781,581.5938 Q2612.2656,579.9375 2612.2656,576.8125 Q2612.2656,573.6875 2613.5781,572.0313 Q2614.9063,570.375 2617.4063,570.375 Q2618.1094,570.375 2618.75,570.5313 Q2619.4063,570.6875 2619.9688,570.9844 L2619.9688,573.7031 Q2619.3438,573.125 2618.75,572.8594 Q2618.1563,572.5781 2617.5313,572.5781 Q2616.1875,572.5781 2615.5,573.6563 Q2614.8125,574.7188 2614.8125,576.8125 Q2614.8125,578.9063 2615.5,579.9844 Q2616.1875,581.0469 2617.5313,581.0469 Q2618.1563,581.0469 2618.75,580.7813 Q2619.3438,580.5 2619.9688,579.9219 L2619.9688,582.6406 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" lengthAdjust="spacing" textLength="191" x="2631" y="581.1543">RemoveRedundantStatements</text><line style="stroke:#A80036;stroke-width:1.5;" x1="2603" x2="2824" y1="593" y2="593"/><line style="stroke:#A80036;stroke-width:1.5;" x1="2603" x2="2824" y1="601" y2="601"/><!--MD5=[8dd11208bc782b9bc4fe9a727775ac71]
+class RemoveRedundantLoops--><rect codeLine="106" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="48" id="RemoveRedundantLoops" style="stroke:#A80036;stroke-width:1.5;" width="187" x="2860" y="561"/><ellipse cx="2875" cy="577" fill="#ADD1B2" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M2877.9688,582.6406 Q2877.3906,582.9375 2876.75,583.0781 Q2876.1094,583.2344 2875.4063,583.2344 Q2872.9063,583.2344 2871.5781,581.5938 Q2870.2656,579.9375 2870.2656,576.8125 Q2870.2656,573.6875 2871.5781,572.0313 Q2872.9063,570.375 2875.4063,570.375 Q2876.1094,570.375 2876.75,570.5313 Q2877.4063,570.6875 2877.9688,570.9844 L2877.9688,573.7031 Q2877.3438,573.125 2876.75,572.8594 Q2876.1563,572.5781 2875.5313,572.5781 Q2874.1875,572.5781 2873.5,573.6563 Q2872.8125,574.7188 2872.8125,576.8125 Q2872.8125,578.9063 2873.5,579.9844 Q2874.1875,581.0469 2875.5313,581.0469 Q2876.1563,581.0469 2876.75,580.7813 Q2877.3438,580.5 2877.9688,579.9219 L2877.9688,582.6406 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" lengthAdjust="spacing" textLength="155" x="2889" y="581.1543">RemoveRedundantLoops</text><line style="stroke:#A80036;stroke-width:1.5;" x1="2861" x2="3046" y1="593" y2="593"/><line style="stroke:#A80036;stroke-width:1.5;" x1="2861" x2="3046" y1="601" y2="601"/><!--MD5=[85eaa2c6ee966b219cfed7e8ed27a206]
+class RemoveDuplicateBody--><rect codeLine="107" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="48" id="RemoveDuplicateBody" style="stroke:#A80036;stroke-width:1.5;" width="170" x="3082.5" y="561"/><ellipse cx="3097.5" cy="577" fill="#ADD1B2" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M3100.4688,582.6406 Q3099.8906,582.9375 3099.25,583.0781 Q3098.6094,583.2344 3097.9063,583.2344 Q3095.4063,583.2344 3094.0781,581.5938 Q3092.7656,579.9375 3092.7656,576.8125 Q3092.7656,573.6875 3094.0781,572.0313 Q3095.4063,570.375 3097.9063,570.375 Q3098.6094,570.375 3099.25,570.5313 Q3099.9063,570.6875 3100.4688,570.9844 L3100.4688,573.7031 Q3099.8438,573.125 3099.25,572.8594 Q3098.6563,572.5781 3098.0313,572.5781 Q3096.6875,572.5781 3096,573.6563 Q3095.3125,574.7188 3095.3125,576.8125 Q3095.3125,578.9063 3096,579.9844 Q3096.6875,581.0469 3098.0313,581.0469 Q3098.6563,581.0469 3099.25,580.7813 Q3099.8438,580.5 3100.4688,579.9219 L3100.4688,582.6406 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" lengthAdjust="spacing" textLength="138" x="3111.5" y="581.1543">RemoveDuplicateBody</text><line style="stroke:#A80036;stroke-width:1.5;" x1="3083.5" x2="3251.5" y1="593" y2="593"/><line style="stroke:#A80036;stroke-width:1.5;" x1="3083.5" x2="3251.5" y1="601" y2="601"/><!--MD5=[781eb37a56bb69dce1ac0e85789010ac]
+class CodeGen--><rect codeLine="115" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="48" id="CodeGen" style="stroke:#A80036;stroke-width:1.5;" width="89" x="3288" y="561"/><ellipse cx="3303" cy="577" fill="#ADD1B2" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M3305.9688,582.6406 Q3305.3906,582.9375 3304.75,583.0781 Q3304.1094,583.2344 3303.4063,583.2344 Q3300.9063,583.2344 3299.5781,581.5938 Q3298.2656,579.9375 3298.2656,576.8125 Q3298.2656,573.6875 3299.5781,572.0313 Q3300.9063,570.375 3303.4063,570.375 Q3304.1094,570.375 3304.75,570.5313 Q3305.4063,570.6875 3305.9688,570.9844 L3305.9688,573.7031 Q3305.3438,573.125 3304.75,572.8594 Q3304.1563,572.5781 3303.5313,572.5781 Q3302.1875,572.5781 3301.5,573.6563 Q3300.8125,574.7188 3300.8125,576.8125 Q3300.8125,578.9063 3301.5,579.9844 Q3302.1875,581.0469 3303.5313,581.0469 Q3304.1563,581.0469 3304.75,580.7813 Q3305.3438,580.5 3305.9688,579.9219 L3305.9688,582.6406 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" lengthAdjust="spacing" textLength="57" x="3317" y="581.1543">CodeGen</text><line style="stroke:#A80036;stroke-width:1.5;" x1="3289" x2="3376" y1="593" y2="593"/><line style="stroke:#A80036;stroke-width:1.5;" x1="3289" x2="3376" y1="601" y2="601"/><!--MD5=[1c66665a05557eaba0ef54dbe8329f75]
+class CodeGen_C--><rect codeLine="116" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="48" id="CodeGen_C" style="stroke:#A80036;stroke-width:1.5;" width="103" x="3130" y="830"/><ellipse cx="3145" cy="846" fill="#ADD1B2" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M3147.9688,851.6406 Q3147.3906,851.9375 3146.75,852.0781 Q3146.1094,852.2344 3145.4063,852.2344 Q3142.9063,852.2344 3141.5781,850.5938 Q3140.2656,848.9375 3140.2656,845.8125 Q3140.2656,842.6875 3141.5781,841.0313 Q3142.9063,839.375 3145.4063,839.375 Q3146.1094,839.375 3146.75,839.5313 Q3147.4063,839.6875 3147.9688,839.9844 L3147.9688,842.7031 Q3147.3438,842.125 3146.75,841.8594 Q3146.1563,841.5781 3145.5313,841.5781 Q3144.1875,841.5781 3143.5,842.6563 Q3142.8125,843.7188 3142.8125,845.8125 Q3142.8125,847.9063 3143.5,848.9844 Q3144.1875,850.0469 3145.5313,850.0469 Q3146.1563,850.0469 3146.75,849.7813 Q3147.3438,849.5 3147.9688,848.9219 L3147.9688,851.6406 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" lengthAdjust="spacing" textLength="71" x="3159" y="850.1543">CodeGen_C</text><line style="stroke:#A80036;stroke-width:1.5;" x1="3131" x2="3232" y1="862" y2="862"/><line style="stroke:#A80036;stroke-width:1.5;" x1="3131" x2="3232" y1="870" y2="870"/><!--MD5=[b05ffbf1810bcc29bd244a8644dcab5e]
+class CodeGen_CUDA--><rect codeLine="117" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="48" id="CodeGen_CUDA" style="stroke:#A80036;stroke-width:1.5;" width="129" x="3268" y="830"/><ellipse cx="3283" cy="846" fill="#ADD1B2" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M3285.9688,851.6406 Q3285.3906,851.9375 3284.75,852.0781 Q3284.1094,852.2344 3283.4063,852.2344 Q3280.9063,852.2344 3279.5781,850.5938 Q3278.2656,848.9375 3278.2656,845.8125 Q3278.2656,842.6875 3279.5781,841.0313 Q3280.9063,839.375 3283.4063,839.375 Q3284.1094,839.375 3284.75,839.5313 Q3285.4063,839.6875 3285.9688,839.9844 L3285.9688,842.7031 Q3285.3438,842.125 3284.75,841.8594 Q3284.1563,841.5781 3283.5313,841.5781 Q3282.1875,841.5781 3281.5,842.6563 Q3280.8125,843.7188 3280.8125,845.8125 Q3280.8125,847.9063 3281.5,848.9844 Q3282.1875,850.0469 3283.5313,850.0469 Q3284.1563,850.0469 3284.75,849.7813 Q3285.3438,849.5 3285.9688,848.9219 L3285.9688,851.6406 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" lengthAdjust="spacing" textLength="97" x="3297" y="850.1543">CodeGen_CUDA</text><line style="stroke:#A80036;stroke-width:1.5;" x1="3269" x2="3396" y1="862" y2="862"/><line style="stroke:#A80036;stroke-width:1.5;" x1="3269" x2="3396" y1="870" y2="870"/><!--MD5=[e6fabe1c34e0f779d9281ebc64edf122]
+class CodeGen_ISPC--><rect codeLine="118" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="48" id="CodeGen_ISPC" style="stroke:#A80036;stroke-width:1.5;" width="122" x="3432.5" y="830"/><ellipse cx="3447.5" cy="846" fill="#ADD1B2" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M3450.4688,851.6406 Q3449.8906,851.9375 3449.25,852.0781 Q3448.6094,852.2344 3447.9063,852.2344 Q3445.4063,852.2344 3444.0781,850.5938 Q3442.7656,848.9375 3442.7656,845.8125 Q3442.7656,842.6875 3444.0781,841.0313 Q3445.4063,839.375 3447.9063,839.375 Q3448.6094,839.375 3449.25,839.5313 Q3449.9063,839.6875 3450.4688,839.9844 L3450.4688,842.7031 Q3449.8438,842.125 3449.25,841.8594 Q3448.6563,841.5781 3448.0313,841.5781 Q3446.6875,841.5781 3446,842.6563 Q3445.3125,843.7188 3445.3125,845.8125 Q3445.3125,847.9063 3446,848.9844 Q3446.6875,850.0469 3448.0313,850.0469 Q3448.6563,850.0469 3449.25,849.7813 Q3449.8438,849.5 3450.4688,848.9219 L3450.4688,851.6406 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" lengthAdjust="spacing" textLength="90" x="3461.5" y="850.1543">CodeGen_ISPC</text><line style="stroke:#A80036;stroke-width:1.5;" x1="3433.5" x2="3553.5" y1="862" y2="862"/><line style="stroke:#A80036;stroke-width:1.5;" x1="3433.5" x2="3553.5" y1="870" y2="870"/><!--MD5=[a8e9f8a103380e23aa8687dbc5a94fb7]
+class Manageable--><rect codeLine="126" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="48" id="Manageable" style="stroke:#A80036;stroke-width:1.5;" width="109" x="1221" y="244"/><ellipse cx="1236" cy="260" fill="#ADD1B2" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M1238.9688,265.6406 Q1238.3906,265.9375 1237.75,266.0781 Q1237.1094,266.2344 1236.4063,266.2344 Q1233.9063,266.2344 1232.5781,264.5938 Q1231.2656,262.9375 1231.2656,259.8125 Q1231.2656,256.6875 1232.5781,255.0313 Q1233.9063,253.375 1236.4063,253.375 Q1237.1094,253.375 1237.75,253.5313 Q1238.4063,253.6875 1238.9688,253.9844 L1238.9688,256.7031 Q1238.3438,256.125 1237.75,255.8594 Q1237.1563,255.5781 1236.5313,255.5781 Q1235.1875,255.5781 1234.5,256.6563 Q1233.8125,257.7188 1233.8125,259.8125 Q1233.8125,261.9063 1234.5,262.9844 Q1235.1875,264.0469 1236.5313,264.0469 Q1237.1563,264.0469 1237.75,263.7813 Q1238.3438,263.5 1238.9688,262.9219 L1238.9688,265.6406 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" lengthAdjust="spacing" textLength="77" x="1250" y="264.1543">Manageable</text><line style="stroke:#A80036;stroke-width:1.5;" x1="1222" x2="1329" y1="276" y2="276"/><line style="stroke:#A80036;stroke-width:1.5;" x1="1222" x2="1329" y1="284" y2="284"/><!--MD5=[b230114a6dc80ef25a3e5e6e95ae886a]
+class IndexStmtNode--><rect codeLine="127" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="60.8047" id="IndexStmtNode" style="stroke:#A80036;stroke-width:1.5;" width="325" x="521" y="554.5"/><ellipse cx="631.75" cy="570.5" fill="#ADD1B2" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M634.7188,576.1406 Q634.1406,576.4375 633.5,576.5781 Q632.8594,576.7344 632.1563,576.7344 Q629.6563,576.7344 628.3281,575.0938 Q627.0156,573.4375 627.0156,570.3125 Q627.0156,567.1875 628.3281,565.5313 Q629.6563,563.875 632.1563,563.875 Q632.8594,563.875 633.5,564.0313 Q634.1563,564.1875 634.7188,564.4844 L634.7188,567.2031 Q634.0938,566.625 633.5,566.3594 Q632.9063,566.0781 632.2813,566.0781 Q630.9375,566.0781 630.25,567.1563 Q629.5625,568.2188 629.5625,570.3125 Q629.5625,572.4063 630.25,573.4844 Q630.9375,574.5469 632.2813,574.5469 Q632.9063,574.5469 633.5,574.2813 Q634.0938,574 634.7188,573.4219 L634.7188,576.1406 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" lengthAdjust="spacing" textLength="95" x="652.25" y="574.6543">IndexStmtNode</text><line style="stroke:#A80036;stroke-width:1.5;" x1="522" x2="845" y1="586.5" y2="586.5"/><line style="stroke:#A80036;stroke-width:1.5;" x1="522" x2="845" y1="594.5" y2="594.5"/><rect fill="#F24D5C" height="6" style="stroke:#C82930;stroke-width:1.0;" width="6" x="529" y="602.5"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="299" x="541" y="608.7104">virtual void accept(IndexStmtVisitorStrict*) const = 0</text><!--MD5=[d94a097bbd14b86b446d6c306c6327b3]
+class IndexExprNode--><rect codeLine="130" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="60.8047" id="IndexExprNode" style="stroke:#A80036;stroke-width:1.5;" width="325" x="1281" y="554.5"/><ellipse cx="1392.75" cy="570.5" fill="#ADD1B2" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M1395.7188,576.1406 Q1395.1406,576.4375 1394.5,576.5781 Q1393.8594,576.7344 1393.1563,576.7344 Q1390.6563,576.7344 1389.3281,575.0938 Q1388.0156,573.4375 1388.0156,570.3125 Q1388.0156,567.1875 1389.3281,565.5313 Q1390.6563,563.875 1393.1563,563.875 Q1393.8594,563.875 1394.5,564.0313 Q1395.1563,564.1875 1395.7188,564.4844 L1395.7188,567.2031 Q1395.0938,566.625 1394.5,566.3594 Q1393.9063,566.0781 1393.2813,566.0781 Q1391.9375,566.0781 1391.25,567.1563 Q1390.5625,568.2188 1390.5625,570.3125 Q1390.5625,572.4063 1391.25,573.4844 Q1391.9375,574.5469 1393.2813,574.5469 Q1393.9063,574.5469 1394.5,574.2813 Q1395.0938,574 1395.7188,573.4219 L1395.7188,576.1406 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" lengthAdjust="spacing" textLength="93" x="1413.25" y="574.6543">IndexExprNode</text><line style="stroke:#A80036;stroke-width:1.5;" x1="1282" x2="1605" y1="586.5" y2="586.5"/><line style="stroke:#A80036;stroke-width:1.5;" x1="1282" x2="1605" y1="594.5" y2="594.5"/><rect fill="#F24D5C" height="6" style="stroke:#C82930;stroke-width:1.0;" width="6" x="1289" y="602.5"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="299" x="1301" y="608.7104">virtual void accept(IndexStmtVisitorStrict*) const = 0</text><!--MD5=[2ae3d0d839308205eb4a3976239628b6]
+class IndexStmt--><rect codeLine="140" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="48" id="IndexStmt" style="stroke:#A80036;stroke-width:1.5;" width="94" x="636.5" y="244"/><ellipse cx="651.5" cy="260" fill="#ADD1B2" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M654.4688,265.6406 Q653.8906,265.9375 653.25,266.0781 Q652.6094,266.2344 651.9063,266.2344 Q649.4063,266.2344 648.0781,264.5938 Q646.7656,262.9375 646.7656,259.8125 Q646.7656,256.6875 648.0781,255.0313 Q649.4063,253.375 651.9063,253.375 Q652.6094,253.375 653.25,253.5313 Q653.9063,253.6875 654.4688,253.9844 L654.4688,256.7031 Q653.8438,256.125 653.25,255.8594 Q652.6563,255.5781 652.0313,255.5781 Q650.6875,255.5781 650,256.6563 Q649.3125,257.7188 649.3125,259.8125 Q649.3125,261.9063 650,262.9844 Q650.6875,264.0469 652.0313,264.0469 Q652.6563,264.0469 653.25,263.7813 Q653.8438,263.5 654.4688,262.9219 L654.4688,265.6406 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" lengthAdjust="spacing" textLength="62" x="665.5" y="264.1543">IndexStmt</text><line style="stroke:#A80036;stroke-width:1.5;" x1="637.5" x2="729.5" y1="276" y2="276"/><line style="stroke:#A80036;stroke-width:1.5;" x1="637.5" x2="729.5" y1="284" y2="284"/><!--MD5=[97c64a8910e96953a95fad8b92c83bb0]
+class IndexExpr--><rect codeLine="141" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="48" id="IndexExpr" style="stroke:#A80036;stroke-width:1.5;" width="92" x="1374.5" y="244"/><ellipse cx="1389.5" cy="260" fill="#ADD1B2" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M1392.4688,265.6406 Q1391.8906,265.9375 1391.25,266.0781 Q1390.6094,266.2344 1389.9063,266.2344 Q1387.4063,266.2344 1386.0781,264.5938 Q1384.7656,262.9375 1384.7656,259.8125 Q1384.7656,256.6875 1386.0781,255.0313 Q1387.4063,253.375 1389.9063,253.375 Q1390.6094,253.375 1391.25,253.5313 Q1391.9063,253.6875 1392.4688,253.9844 L1392.4688,256.7031 Q1391.8438,256.125 1391.25,255.8594 Q1390.6563,255.5781 1390.0313,255.5781 Q1388.6875,255.5781 1388,256.6563 Q1387.3125,257.7188 1387.3125,259.8125 Q1387.3125,261.9063 1388,262.9844 Q1388.6875,264.0469 1390.0313,264.0469 Q1390.6563,264.0469 1391.25,263.7813 Q1391.8438,263.5 1392.4688,262.9219 L1392.4688,265.6406 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" lengthAdjust="spacing" textLength="60" x="1403.5" y="264.1543">IndexExpr</text><line style="stroke:#A80036;stroke-width:1.5;" x1="1375.5" x2="1465.5" y1="276" y2="276"/><line style="stroke:#A80036;stroke-width:1.5;" x1="1375.5" x2="1465.5" y1="284" y2="284"/><!--MD5=[28b0f4e593c8487512a9debc1bac1917]
+class IndexExprVisitorStrict--><rect codeLine="149" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="201.6563" id="IndexExprVisitorStrict" style="stroke:#A80036;stroke-width:1.5;" width="283" x="1641" y="484"/><ellipse cx="1711.25" cy="500" fill="#A9DCDF" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M1711.3594,495.3438 L1710.2031,500.4219 L1712.5313,500.4219 L1711.3594,495.3438 Z M1709.875,493.1094 L1712.8594,493.1094 L1716.2188,505.5 L1713.7656,505.5 L1713,502.4375 L1709.7188,502.4375 L1708.9688,505.5 L1706.5313,505.5 L1709.875,493.1094 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" font-style="italic" lengthAdjust="spacing" textLength="134" x="1731.75" y="504.1543">IndexExprVisitorStrict</text><line style="stroke:#A80036;stroke-width:1.5;" x1="1642" x2="1923" y1="516" y2="516"/><line style="stroke:#A80036;stroke-width:1.5;" x1="1642" x2="1923" y1="524" y2="524"/><ellipse cx="1652" cy="535" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="160" x="1661" y="538.2104">void visit(const IndexStmt&amp;)</text><ellipse cx="1652" cy="547.8047" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="231" x="1661" y="551.0151">virtual void visit(const AccessNode*) = 0</text><ellipse cx="1652" cy="560.6094" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="226" x="1661" y="563.8198">virtual void visit(const LiteralNode*) = 0</text><ellipse cx="1652" cy="573.4141" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="213" x="1661" y="576.6245">virtual void visit(const 
NegNode*) = 0</text><ellipse cx="1652" cy="586.2188" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="212" x="1661" y="589.4292">virtual void visit(const AddNode*) = 0</text><ellipse cx="1652" cy="599.0234" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="213" x="1661" y="602.2339">virtual void visit(const SubNode*) = 0</text><ellipse cx="1652" cy="611.8281" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="210" x="1661" y="615.0386">virtual void visit(const MulNode*) = 0</text><ellipse cx="1652" cy="624.6328" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="208" x="1661" y="627.8433">virtual void visit(const DivNode*) = 0</text><ellipse cx="1652" cy="637.4375" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="215" x="1661" y="640.6479">virtual void visit(const SqrtNode*) = 0</text><ellipse cx="1652" cy="650.2422" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="217" x="1661" y="653.4526">virtual void visit(const CastNode*) = 0</text><ellipse cx="1652" cy="663.0469" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="257" x="1661" y="666.2573">virtual void visit(const CallIntrinsicNode*) = 0</text><ellipse cx="1652" cy="675.8516" fill="#84BE84" rx="3" ry="3" 
style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="246" x="1661" y="679.062">virtual void visit(const ReductionNode*) = 0</text><!--MD5=[a89aadb6ea0d27c41410991969988628]
+class IndexStmtVisitorStrict--><rect codeLine="163" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="163.2422" id="IndexStmtVisitorStrict" style="stroke:#A80036;stroke-width:1.5;" width="284" x="1968.5" y="503.5"/><ellipse cx="2038.75" cy="519.5" fill="#A9DCDF" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M2038.8594,514.8438 L2037.7031,519.9219 L2040.0313,519.9219 L2038.8594,514.8438 Z M2037.375,512.6094 L2040.3594,512.6094 L2043.7188,525 L2041.2656,525 L2040.5,521.9375 L2037.2188,521.9375 L2036.4688,525 L2034.0313,525 L2037.375,512.6094 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" font-style="italic" lengthAdjust="spacing" textLength="135" x="2059.25" y="523.6543">IndexStmtVisitorStrict</text><line style="stroke:#A80036;stroke-width:1.5;" x1="1969.5" x2="2251.5" y1="535.5" y2="535.5"/><line style="stroke:#A80036;stroke-width:1.5;" x1="1969.5" x2="2251.5" y1="543.5" y2="543.5"/><ellipse cx="1979.5" cy="554.5" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="160" x="1988.5" y="557.7104">void visit(const IndexStmt&amp;)</text><ellipse cx="1979.5" cy="567.3047" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="258" x="1988.5" y="570.5151">virtual void visit(const AssignmentNode*) = 0</text><ellipse cx="1979.5" cy="580.1094" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="218" x="1988.5" y="583.3198">virtual void visit(const YieldNode*) = 0</text><ellipse cx="1979.5" cy="592.9141" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="222" x="1988.5" 
y="596.1245">virtual void visit(const ForallNode*) = 0</text><ellipse cx="1979.5" cy="605.7188" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="226" x="1988.5" y="608.9292">virtual void visit(const WhereNode*) = 0</text><ellipse cx="1979.5" cy="618.5234" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="247" x="1988.5" y="621.7339">virtual void visit(const SequenceNode*) = 0</text><ellipse cx="1979.5" cy="631.3281" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="247" x="1988.5" y="634.5386">virtual void visit(const AssembleNode*) = 0</text><ellipse cx="1979.5" cy="644.1328" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="217" x="1988.5" y="647.3433">virtual void visit(const MultiNode*) = 0</text><ellipse cx="1979.5" cy="656.9375" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="244" x="1988.5" y="660.1479">virtual void visit(const SuchThatNode*) = 0</text><!--MD5=[b74718248e125c8ad329889fd2a32c16]
+class IndexNotationVisitorStrict--><rect codeLine="175" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="48" id="IndexNotationVisitorStrict" style="stroke:#A80036;stroke-width:1.5;" width="192" x="1404.5" y="830"/><ellipse cx="1419.5" cy="846" fill="#A9DCDF" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M1419.6094,841.3438 L1418.4531,846.4219 L1420.7813,846.4219 L1419.6094,841.3438 Z M1418.125,839.1094 L1421.1094,839.1094 L1424.4688,851.5 L1422.0156,851.5 L1421.25,848.4375 L1417.9688,848.4375 L1417.2188,851.5 L1414.7813,851.5 L1418.125,839.1094 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" font-style="italic" lengthAdjust="spacing" textLength="160" x="1433.5" y="850.1543">IndexNotationVisitorStrict</text><line style="stroke:#A80036;stroke-width:1.5;" x1="1405.5" x2="1595.5" y1="862" y2="862"/><line style="stroke:#A80036;stroke-width:1.5;" x1="1405.5" x2="1595.5" y1="870" y2="870"/><!--MD5=[cb464207dbcea0ece296242645495747]
+class IndexNotationPrinter--><rect codeLine="176" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="368.1172" id="IndexNotationPrinter" style="stroke:#A80036;stroke-width:1.5;" width="253" x="1301" y="1044.5"/><ellipse cx="1358.75" cy="1060.5" fill="#ADD1B2" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M1361.7188,1066.1406 Q1361.1406,1066.4375 1360.5,1066.5781 Q1359.8594,1066.7344 1359.1563,1066.7344 Q1356.6563,1066.7344 1355.3281,1065.0938 Q1354.0156,1063.4375 1354.0156,1060.3125 Q1354.0156,1057.1875 1355.3281,1055.5313 Q1356.6563,1053.875 1359.1563,1053.875 Q1359.8594,1053.875 1360.5,1054.0313 Q1361.1563,1054.1875 1361.7188,1054.4844 L1361.7188,1057.2031 Q1361.0938,1056.625 1360.5,1056.3594 Q1359.9063,1056.0781 1359.2813,1056.0781 Q1357.9375,1056.0781 1357.25,1057.1563 Q1356.5625,1058.2188 1356.5625,1060.3125 Q1356.5625,1062.4063 1357.25,1063.4844 Q1357.9375,1064.5469 1359.2813,1064.5469 Q1359.9063,1064.5469 1360.5,1064.2813 Q1361.0938,1064 1361.7188,1063.4219 L1361.7188,1066.1406 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" lengthAdjust="spacing" textLength="129" x="1379.25" y="1064.6543">IndexNotationPrinter</text><line style="stroke:#A80036;stroke-width:1.5;" x1="1302" x2="1553" y1="1076.5" y2="1076.5"/><line style="stroke:#A80036;stroke-width:1.5;" x1="1302" x2="1553" y1="1084.5" y2="1084.5"/><ellipse cx="1312" cy="1095.5" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="190" x="1321" y="1098.7104">void print(const IndexExpr&amp; expr)</text><ellipse cx="1312" cy="1108.3047" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="192" x="1321" y="1111.5151">void print(const IndexStmt&amp; expr)</text><text fill="#000000" font-family="sans-serif" font-size="11" 
lengthAdjust="spacing" textLength="0" x="1325" y="1124.3198"/><ellipse cx="1312" cy="1133.9141" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="200" x="1321" y="1137.1245">void visit(const AccessNode* node)</text><ellipse cx="1312" cy="1146.7188" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="195" x="1321" y="1149.9292">void visit(const LiteralNode* node)</text><ellipse cx="1312" cy="1159.5234" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="182" x="1321" y="1162.7339">void visit(const NegNode* node)</text><ellipse cx="1312" cy="1172.3281" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="181" x="1321" y="1175.5386">void visit(const AddNode* node)</text><ellipse cx="1312" cy="1185.1328" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="182" x="1321" y="1188.3433">void visit(const SubNode* node)</text><ellipse cx="1312" cy="1197.9375" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="179" x="1321" y="1201.1479">void visit(const MulNode* node)</text><ellipse cx="1312" cy="1210.7422" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="177" x="1321" y="1213.9526">void visit(const DivNode* node)</text><ellipse cx="1312" cy="1223.5469" fill="#84BE84" rx="3" 
ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="184" x="1321" y="1226.7573">void visit(const SqrtNode* node)</text><ellipse cx="1312" cy="1236.3516" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="186" x="1321" y="1239.562">void visit(const CastNode* node)</text><ellipse cx="1312" cy="1249.1563" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="226" x="1321" y="1252.3667">void visit(const CallIntrinsicNode* node)</text><ellipse cx="1312" cy="1261.9609" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="218" x="1321" y="1265.1714">void visit(const UnaryExprNode* node)</text><ellipse cx="1312" cy="1274.7656" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="221" x="1321" y="1277.9761">void visit(const BinaryExprNode* node)</text><ellipse cx="1312" cy="1287.5703" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="215" x="1321" y="1290.7808">void visit(const ReductionNode* node)</text><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="0" x="1325" y="1303.5854"/><ellipse cx="1312" cy="1313.1797" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="227" x="1321" y="1316.3901">void visit(const AssignmentNode* node)</text><ellipse cx="1312" 
cy="1325.9844" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="187" x="1321" y="1329.1948">void visit(const YieldNode* node)</text><ellipse cx="1312" cy="1338.7891" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="191" x="1321" y="1341.9995">void visit(const ForallNode* node)</text><ellipse cx="1312" cy="1351.5938" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="195" x="1321" y="1354.8042">void visit(const WhereNode* node)</text><ellipse cx="1312" cy="1364.3984" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="216" x="1321" y="1367.6089">void visit(const SequenceNode* node)</text><ellipse cx="1312" cy="1377.2031" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="216" x="1321" y="1380.4136">void visit(const AssembleNode* node)</text><ellipse cx="1312" cy="1390.0078" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="186" x="1321" y="1393.2183">void visit(const MultiNode* node)</text><ellipse cx="1312" cy="1402.8125" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="213" x="1321" y="1406.0229">void visit(const SuchThatNode* node)</text><!--MD5=[1889949f301ae6d76cb20e56f2d1d951]
+class IndexNotationVisitor--><rect codeLine="205" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="329.7031" id="IndexNotationVisitor" style="stroke:#A80036;stroke-width:1.5;" width="292" x="1589.5" y="1063.5"/><ellipse cx="1668.25" cy="1079.5" fill="#ADD1B2" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M1671.2188,1085.1406 Q1670.6406,1085.4375 1670,1085.5781 Q1669.3594,1085.7344 1668.6563,1085.7344 Q1666.1563,1085.7344 1664.8281,1084.0938 Q1663.5156,1082.4375 1663.5156,1079.3125 Q1663.5156,1076.1875 1664.8281,1074.5313 Q1666.1563,1072.875 1668.6563,1072.875 Q1669.3594,1072.875 1670,1073.0313 Q1670.6563,1073.1875 1671.2188,1073.4844 L1671.2188,1076.2031 Q1670.5938,1075.625 1670,1075.3594 Q1669.4063,1075.0781 1668.7813,1075.0781 Q1667.4375,1075.0781 1666.75,1076.1563 Q1666.0625,1077.2188 1666.0625,1079.3125 Q1666.0625,1081.4063 1666.75,1082.4844 Q1667.4375,1083.5469 1668.7813,1083.5469 Q1669.4063,1083.5469 1670,1083.2813 Q1670.5938,1083 1671.2188,1082.4219 L1671.2188,1085.1406 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" lengthAdjust="spacing" textLength="126" x="1688.75" y="1083.6543">IndexNotationVisitor</text><line style="stroke:#A80036;stroke-width:1.5;" x1="1590.5" x2="1880.5" y1="1095.5" y2="1095.5"/><line style="stroke:#A80036;stroke-width:1.5;" x1="1590.5" x2="1880.5" y1="1103.5" y2="1103.5"/><ellipse cx="1600.5" cy="1114.5" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="239" x="1609.5" y="1117.7104">virtual void visit(const AccessNode* node)</text><ellipse cx="1600.5" cy="1127.3047" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="234" x="1609.5" y="1130.5151">virtual void visit(const LiteralNode* node)</text><ellipse cx="1600.5" cy="1140.1094" fill="#84BE84" 
rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="221" x="1609.5" y="1143.3198">virtual void visit(const NegNode* node)</text><ellipse cx="1600.5" cy="1152.9141" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="220" x="1609.5" y="1156.1245">virtual void visit(const AddNode* node)</text><ellipse cx="1600.5" cy="1165.7188" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="221" x="1609.5" y="1168.9292">virtual void visit(const SubNode* node)</text><ellipse cx="1600.5" cy="1178.5234" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="218" x="1609.5" y="1181.7339">virtual void visit(const MulNode* node)</text><ellipse cx="1600.5" cy="1191.3281" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="216" x="1609.5" y="1194.5386">virtual void visit(const DivNode* node)</text><ellipse cx="1600.5" cy="1204.1328" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="223" x="1609.5" y="1207.3433">virtual void visit(const SqrtNode* node)</text><ellipse cx="1600.5" cy="1216.9375" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="225" x="1609.5" y="1220.1479">virtual void visit(const CastNode* node)</text><ellipse cx="1600.5" cy="1229.7422" fill="#84BE84" rx="3" ry="3" 
style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="265" x="1609.5" y="1232.9526">virtual void visit(const CallIntrinsicNode* node)</text><ellipse cx="1600.5" cy="1242.5469" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="257" x="1609.5" y="1245.7573">virtual void visit(const UnaryExprNode* node)</text><ellipse cx="1600.5" cy="1255.3516" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="260" x="1609.5" y="1258.562">virtual void visit(const BinaryExprNode* node)</text><ellipse cx="1600.5" cy="1268.1563" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="254" x="1609.5" y="1271.3667">virtual void visit(const ReductionNode* node)</text><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="0" x="1613.5" y="1284.1714"/><ellipse cx="1600.5" cy="1293.7656" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="266" x="1609.5" y="1296.9761">virtual void visit(const AssignmentNode* node)</text><ellipse cx="1600.5" cy="1306.5703" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="226" x="1609.5" y="1309.7808">virtual void visit(const YieldNode* node)</text><ellipse cx="1600.5" cy="1319.375" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="230" x="1609.5" 
y="1322.5854">virtual void visit(const ForallNode* node)</text><ellipse cx="1600.5" cy="1332.1797" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="234" x="1609.5" y="1335.3901">virtual void visit(const WhereNode* node)</text><ellipse cx="1600.5" cy="1344.9844" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="255" x="1609.5" y="1348.1948">virtual void visit(const SequenceNode* node)</text><ellipse cx="1600.5" cy="1357.7891" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="255" x="1609.5" y="1360.9995">virtual void visit(const AssembleNode* node)</text><ellipse cx="1600.5" cy="1370.5938" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="225" x="1609.5" y="1373.8042">virtual void visit(const MultiNode* node)</text><ellipse cx="1600.5" cy="1383.3984" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="252" x="1609.5" y="1386.6089">virtual void visit(const SuchThatNode* node)</text><!--MD5=[c249847c086044a14a4ecd1d09905030]
+class Matcher--><rect codeLine="231" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="48" id="Matcher" style="stroke:#A80036;stroke-width:1.5;" width="83" x="1694" y="1621"/><ellipse cx="1709" cy="1637" fill="#ADD1B2" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M1711.9688,1642.6406 Q1711.3906,1642.9375 1710.75,1643.0781 Q1710.1094,1643.2344 1709.4063,1643.2344 Q1706.9063,1643.2344 1705.5781,1641.5938 Q1704.2656,1639.9375 1704.2656,1636.8125 Q1704.2656,1633.6875 1705.5781,1632.0313 Q1706.9063,1630.375 1709.4063,1630.375 Q1710.1094,1630.375 1710.75,1630.5313 Q1711.4063,1630.6875 1711.9688,1630.9844 L1711.9688,1633.7031 Q1711.3438,1633.125 1710.75,1632.8594 Q1710.1563,1632.5781 1709.5313,1632.5781 Q1708.1875,1632.5781 1707.5,1633.6563 Q1706.8125,1634.7188 1706.8125,1636.8125 Q1706.8125,1638.9063 1707.5,1639.9844 Q1708.1875,1641.0469 1709.5313,1641.0469 Q1710.1563,1641.0469 1710.75,1640.7813 Q1711.3438,1640.5 1711.9688,1639.9219 L1711.9688,1642.6406 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" lengthAdjust="spacing" textLength="51" x="1723" y="1641.1543">Matcher</text><line style="stroke:#A80036;stroke-width:1.5;" x1="1695" x2="1776" y1="1653" y2="1653"/><line style="stroke:#A80036;stroke-width:1.5;" x1="1695" x2="1776" y1="1661" y2="1661"/><!--MD5=[ea8f53988b378f12e96f95ad2b8e8e7e]
+class IndexExprRewriterStrict--><rect codeLine="235" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="214.4609" id="IndexExprRewriterStrict" style="stroke:#A80036;stroke-width:1.5;" width="301" x="1632" y="747"/><ellipse cx="1704.25" cy="763" fill="#A9DCDF" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M1704.3594,758.3438 L1703.2031,763.4219 L1705.5313,763.4219 L1704.3594,758.3438 Z M1702.875,756.1094 L1705.8594,756.1094 L1709.2188,768.5 L1706.7656,768.5 L1706,765.4375 L1702.7188,765.4375 L1701.9688,768.5 L1699.5313,768.5 L1702.875,756.1094 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" font-style="italic" lengthAdjust="spacing" textLength="148" x="1724.75" y="767.1543">IndexExprRewriterStrict</text><line style="stroke:#A80036;stroke-width:1.5;" x1="1633" x2="1932" y1="779" y2="779"/><polygon fill="none" points="1643,785,1647,789,1643,793,1639,789" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="84" x="1652" y="793.2104">IndexExpr expr</text><line style="stroke:#A80036;stroke-width:1.5;" x1="1633" x2="1932" y1="799.8047" y2="799.8047"/><ellipse cx="1643" cy="810.8047" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="162" x="1652" y="814.0151">IndexExpr rewrite(IndexExpr)</text><polygon fill="#FFFF44" points="1643,818.6094,1647,822.6094,1643,826.6094,1639,822.6094" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="249" x="1652" y="826.8198">virtual void visit(const AccessNode* op) = 0</text><polygon fill="#FFFF44" points="1643,831.4141,1647,835.4141,1643,839.4141,1639,835.4141" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="244" 
x="1652" y="839.6245">virtual void visit(const LiteralNode* op) = 0</text><polygon fill="#FFFF44" points="1643,844.2188,1647,848.2188,1643,852.2188,1639,848.2188" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="231" x="1652" y="852.4292">virtual void visit(const NegNode* op) = 0</text><polygon fill="#FFFF44" points="1643,857.0234,1647,861.0234,1643,865.0234,1639,861.0234" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="233" x="1652" y="865.2339">virtual void visit(const SqrtNode* op) = 0</text><polygon fill="#FFFF44" points="1643,869.8281,1647,873.8281,1643,877.8281,1639,873.8281" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="230" x="1652" y="878.0386">virtual void visit(const AddNode* op) = 0</text><polygon fill="#FFFF44" points="1643,882.6328,1647,886.6328,1643,890.6328,1639,886.6328" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="231" x="1652" y="890.8433">virtual void visit(const SubNode* op) = 0</text><polygon fill="#FFFF44" points="1643,895.4375,1647,899.4375,1643,903.4375,1639,899.4375" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="228" x="1652" y="903.6479">virtual void visit(const MulNode* op) = 0</text><polygon fill="#FFFF44" points="1643,908.2422,1647,912.2422,1643,916.2422,1639,912.2422" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="226" x="1652" y="916.4526">virtual void visit(const DivNode* op) = 0</text><polygon fill="#FFFF44" points="1643,921.0469,1647,925.0469,1643,929.0469,1639,925.0469" 
style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="235" x="1652" y="929.2573">virtual void visit(const CastNode* op) = 0</text><polygon fill="#FFFF44" points="1643,933.8516,1647,937.8516,1643,941.8516,1639,937.8516" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="275" x="1652" y="942.062">virtual void visit(const CallIntrinsicNode* op) = 0</text><polygon fill="#FFFF44" points="1643,946.6563,1647,950.6563,1643,954.6563,1639,950.6563" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="264" x="1652" y="954.8667">virtual void visit(const ReductionNode* op) = 0</text><!--MD5=[fce5a5c177cad31ce6c931f148bb8f55]
+class IndexStmtRewriterStrict--><rect codeLine="252" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="176.0469" id="IndexStmtRewriterStrict" style="stroke:#A80036;stroke-width:1.5;" width="302" x="1968.5" y="766"/><ellipse cx="2040.75" cy="782" fill="#A9DCDF" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M2040.8594,777.3438 L2039.7031,782.4219 L2042.0313,782.4219 L2040.8594,777.3438 Z M2039.375,775.1094 L2042.3594,775.1094 L2045.7188,787.5 L2043.2656,787.5 L2042.5,784.4375 L2039.2188,784.4375 L2038.4688,787.5 L2036.0313,787.5 L2039.375,775.1094 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" font-style="italic" lengthAdjust="spacing" textLength="149" x="2061.25" y="786.1543">IndexStmtRewriterStrict</text><line style="stroke:#A80036;stroke-width:1.5;" x1="1969.5" x2="2269.5" y1="798" y2="798"/><polygon fill="none" points="1979.5,804,1983.5,808,1979.5,812,1975.5,808" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="87" x="1988.5" y="812.2104">IndexStmt stmt</text><line style="stroke:#A80036;stroke-width:1.5;" x1="1969.5" x2="2269.5" y1="818.8047" y2="818.8047"/><ellipse cx="1979.5" cy="829.8047" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="166" x="1988.5" y="833.0151">IndexStmt rewrite(IndexStmt)</text><polygon fill="#FFFF44" points="1979.5,837.6094,1983.5,841.6094,1979.5,845.6094,1975.5,841.6094" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="276" x="1988.5" y="845.8198">virtual void visit(const AssignmentNode* op) = 0</text><polygon fill="#FFFF44" points="1979.5,850.4141,1983.5,854.4141,1979.5,858.4141,1975.5,854.4141" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" 
font-size="11" lengthAdjust="spacing" textLength="236" x="1988.5" y="858.6245">virtual void visit(const YieldNode* op) = 0</text><polygon fill="#FFFF44" points="1979.5,863.2188,1983.5,867.2188,1979.5,871.2188,1975.5,867.2188" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="240" x="1988.5" y="871.4292">virtual void visit(const ForallNode* op) = 0</text><polygon fill="#FFFF44" points="1979.5,876.0234,1983.5,880.0234,1979.5,884.0234,1975.5,880.0234" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="244" x="1988.5" y="884.2339">virtual void visit(const WhereNode* op) = 0</text><polygon fill="#FFFF44" points="1979.5,888.8281,1983.5,892.8281,1979.5,896.8281,1975.5,892.8281" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="265" x="1988.5" y="897.0386">virtual void visit(const SequenceNode* op) = 0</text><polygon fill="#FFFF44" points="1979.5,901.6328,1983.5,905.6328,1979.5,909.6328,1975.5,905.6328" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="265" x="1988.5" y="909.8433">virtual void visit(const AssembleNode* op) = 0</text><polygon fill="#FFFF44" points="1979.5,914.4375,1983.5,918.4375,1979.5,922.4375,1975.5,918.4375" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="235" x="1988.5" y="922.6479">virtual void visit(const MultiNode* op) = 0</text><polygon fill="#FFFF44" points="1979.5,927.2422,1983.5,931.2422,1979.5,935.2422,1975.5,931.2422" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="262" x="1988.5" y="935.4526">virtual void visit(const 
SuchThatNode* op) = 0</text><!--MD5=[c34474f968cd689ed26c36a6e449f9a5]
+class IndexNotationRewriterStrict--><rect codeLine="266" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="48" id="IndexNotationRewriterStrict" style="stroke:#A80036;stroke-width:1.5;" width="206" x="1966.5" y="1204.5"/><ellipse cx="1981.5" cy="1220.5" fill="#A9DCDF" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M1981.6094,1215.8438 L1980.4531,1220.9219 L1982.7813,1220.9219 L1981.6094,1215.8438 Z M1980.125,1213.6094 L1983.1094,1213.6094 L1986.4688,1226 L1984.0156,1226 L1983.25,1222.9375 L1979.9688,1222.9375 L1979.2188,1226 L1976.7813,1226 L1980.125,1213.6094 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" font-style="italic" lengthAdjust="spacing" textLength="174" x="1995.5" y="1224.6543">IndexNotationRewriterStrict</text><line style="stroke:#A80036;stroke-width:1.5;" x1="1967.5" x2="2171.5" y1="1236.5" y2="1236.5"/><line style="stroke:#A80036;stroke-width:1.5;" x1="1967.5" x2="2171.5" y1="1244.5" y2="1244.5"/><!--MD5=[f43b50a501af9b122d481161df5564ac]
+class IndexNotationRewriter--><rect codeLine="267" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="329.7031" id="IndexNotationRewriter" style="stroke:#A80036;stroke-width:1.5;" width="292" x="1923.5" y="1480"/><ellipse cx="1996.25" cy="1496" fill="#ADD1B2" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M1999.2188,1501.6406 Q1998.6406,1501.9375 1998,1502.0781 Q1997.3594,1502.2344 1996.6563,1502.2344 Q1994.1563,1502.2344 1992.8281,1500.5938 Q1991.5156,1498.9375 1991.5156,1495.8125 Q1991.5156,1492.6875 1992.8281,1491.0313 Q1994.1563,1489.375 1996.6563,1489.375 Q1997.3594,1489.375 1998,1489.5313 Q1998.6563,1489.6875 1999.2188,1489.9844 L1999.2188,1492.7031 Q1998.5938,1492.125 1998,1491.8594 Q1997.4063,1491.5781 1996.7813,1491.5781 Q1995.4375,1491.5781 1994.75,1492.6563 Q1994.0625,1493.7188 1994.0625,1495.8125 Q1994.0625,1497.9063 1994.75,1498.9844 Q1995.4375,1500.0469 1996.7813,1500.0469 Q1997.4063,1500.0469 1998,1499.7813 Q1998.5938,1499.5 1999.2188,1498.9219 L1999.2188,1501.6406 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" lengthAdjust="spacing" textLength="138" x="2016.75" y="1500.1543">IndexNotationRewriter</text><line style="stroke:#A80036;stroke-width:1.5;" x1="1924.5" x2="2214.5" y1="1512" y2="1512"/><line style="stroke:#A80036;stroke-width:1.5;" x1="1924.5" x2="2214.5" y1="1520" y2="1520"/><ellipse cx="1934.5" cy="1531" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="239" x="1943.5" y="1534.2104">virtual void visit(const AccessNode* node)</text><ellipse cx="1934.5" cy="1543.8047" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="234" x="1943.5" y="1547.0151">virtual void visit(const LiteralNode* node)</text><ellipse cx="1934.5" cy="1556.6094" fill="#84BE84" rx="3" ry="3" 
style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="221" x="1943.5" y="1559.8198">virtual void visit(const NegNode* node)</text><ellipse cx="1934.5" cy="1569.4141" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="220" x="1943.5" y="1572.6245">virtual void visit(const AddNode* node)</text><ellipse cx="1934.5" cy="1582.2188" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="221" x="1943.5" y="1585.4292">virtual void visit(const SubNode* node)</text><ellipse cx="1934.5" cy="1595.0234" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="218" x="1943.5" y="1598.2339">virtual void visit(const MulNode* node)</text><ellipse cx="1934.5" cy="1607.8281" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="216" x="1943.5" y="1611.0386">virtual void visit(const DivNode* node)</text><ellipse cx="1934.5" cy="1620.6328" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="223" x="1943.5" y="1623.8433">virtual void visit(const SqrtNode* node)</text><ellipse cx="1934.5" cy="1633.4375" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="225" x="1943.5" y="1636.6479">virtual void visit(const CastNode* node)</text><ellipse cx="1934.5" cy="1646.2422" fill="#84BE84" rx="3" ry="3" 
style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="265" x="1943.5" y="1649.4526">virtual void visit(const CallIntrinsicNode* node)</text><ellipse cx="1934.5" cy="1659.0469" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="257" x="1943.5" y="1662.2573">virtual void visit(const UnaryExprNode* node)</text><ellipse cx="1934.5" cy="1671.8516" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="260" x="1943.5" y="1675.062">virtual void visit(const BinaryExprNode* node)</text><ellipse cx="1934.5" cy="1684.6563" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="254" x="1943.5" y="1687.8667">virtual void visit(const ReductionNode* node)</text><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="0" x="1947.5" y="1700.6714"/><ellipse cx="1934.5" cy="1710.2656" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="266" x="1943.5" y="1713.4761">virtual void visit(const AssignmentNode* node)</text><ellipse cx="1934.5" cy="1723.0703" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="226" x="1943.5" y="1726.2808">virtual void visit(const YieldNode* node)</text><ellipse cx="1934.5" cy="1735.875" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="230" x="1943.5" 
y="1739.0854">virtual void visit(const ForallNode* node)</text><ellipse cx="1934.5" cy="1748.6797" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="234" x="1943.5" y="1751.8901">virtual void visit(const WhereNode* node)</text><ellipse cx="1934.5" cy="1761.4844" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="255" x="1943.5" y="1764.6948">virtual void visit(const SequenceNode* node)</text><ellipse cx="1934.5" cy="1774.2891" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="255" x="1943.5" y="1777.4995">virtual void visit(const AssembleNode* node)</text><ellipse cx="1934.5" cy="1787.0938" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="225" x="1943.5" y="1790.3042">virtual void visit(const MultiNode* node)</text><ellipse cx="1934.5" cy="1799.8984" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="252" x="1943.5" y="1803.1089">virtual void visit(const SuchThatNode* node)</text><!--MD5=[2bd6b9bd378d282739bad95694e0395c]
+class Lowerer--><rect codeLine="317" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="60.8047" id="Lowerer" style="stroke:#A80036;stroke-width:1.5;" width="234" x="946.5" y="237.5"/><ellipse cx="1034.75" cy="253.5" fill="#ADD1B2" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M1037.7188,259.1406 Q1037.1406,259.4375 1036.5,259.5781 Q1035.8594,259.7344 1035.1563,259.7344 Q1032.6563,259.7344 1031.3281,258.0938 Q1030.0156,256.4375 1030.0156,253.3125 Q1030.0156,250.1875 1031.3281,248.5313 Q1032.6563,246.875 1035.1563,246.875 Q1035.8594,246.875 1036.5,247.0313 Q1037.1563,247.1875 1037.7188,247.4844 L1037.7188,250.2031 Q1037.0938,249.625 1036.5,249.3594 Q1035.9063,249.0781 1035.2813,249.0781 Q1033.9375,249.0781 1033.25,250.1563 Q1032.5625,251.2188 1032.5625,253.3125 Q1032.5625,255.4063 1033.25,256.4844 Q1033.9375,257.5469 1035.2813,257.5469 Q1035.9063,257.5469 1036.5,257.2813 Q1037.0938,257 1037.7188,256.4219 L1037.7188,259.1406 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" lengthAdjust="spacing" textLength="49" x="1055.25" y="257.6543">Lowerer</text><line style="stroke:#A80036;stroke-width:1.5;" x1="947.5" x2="1179.5" y1="269.5" y2="269.5"/><ellipse cx="957.5" cy="280.5" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;fill:none;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="208" x="966.5" y="283.7104">std::shared_ptr&lt;LowererImpl&gt; impl;</text><line style="stroke:#A80036;stroke-width:1.5;" x1="947.5" x2="1179.5" y1="290.3047" y2="290.3047"/><!--MD5=[b7b8bc7e8eb8ee18eadc3b8fd556bfb2]
+class LowererImpl--><rect codeLine="320" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="188.8516" id="LowererImpl" style="stroke:#A80036;stroke-width:1.5;" width="365" x="881" y="490.5"/><ellipse cx="1020.75" cy="506.5" fill="#A9DCDF" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M1020.8594,501.8438 L1019.7031,506.9219 L1022.0313,506.9219 L1020.8594,501.8438 Z M1019.375,499.6094 L1022.3594,499.6094 L1025.7188,512 L1023.2656,512 L1022.5,508.9375 L1019.2188,508.9375 L1018.4688,512 L1016.0313,512 L1019.375,499.6094 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" font-style="italic" lengthAdjust="spacing" textLength="77" x="1041.25" y="510.6543">LowererImpl</text><line style="stroke:#A80036;stroke-width:1.5;" x1="882" x2="1245" y1="522.5" y2="522.5"/><polygon fill="none" points="892,528.5,896,532.5,892,536.5,888,532.5" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="74" x="901" y="536.7104">class Visitor;</text><polygon fill="none" points="892,541.3047,896,545.3047,892,549.3047,888,545.3047" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="111" x="901" y="549.5151">friend class Visitor;</text><polygon fill="none" points="892,554.1094,896,558.1094,892,562.1094,888,558.1094" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="185" x="901" y="562.3198">std::shared_ptr&lt;Visitor&gt; visitor;</text><line style="stroke:#A80036;stroke-width:1.5;" x1="882" x2="1245" y1="568.9141" y2="568.9141"/><polygon fill="#FFFF44" points="892,574.9141,896,578.9141,892,582.9141,888,578.9141" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="216" x="901" y="583.1245">virtual ir::Stmt 
lower(IndexStmt stmt);</text><polygon fill="#FFFF44" points="892,587.7188,896,591.7188,892,595.7188,888,591.7188" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="211" x="901" y="595.9292">virtual ir::Expr lower(IndexExpr expr);</text><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="0" x="905" y="608.7339"/><polygon fill="#FFFF44" points="892,613.3281,896,617.3281,892,621.3281,888,617.3281" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="260" x="901" y="621.5386">virtual ir::Expr lowerExpr(IndexExpr expr) = 0;</text><polygon fill="#FFFF44" points="892,626.1328,896,630.1328,892,634.1328,888,630.1328" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="267" x="901" y="634.3433">virtual ir::Stmt lowerStmt(IndexStmt stmt) = 0;</text><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="0" x="905" y="647.1479"/><ellipse cx="892" cy="656.7422" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="315" x="901" y="659.9526">virtual ir::Stmt lower(IndexStmt stmt, std::string name,</text><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="339" x="901" y="672.7573">bool assemble, bool compute, bool pack, bool unpack) = 0;</text><!--MD5=[cf3b4bcfbe7bc4015089b336f3e5ed76]
+class LowererImplImperative--><rect codeLine="337" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="188.8516" id="LowererImplImperative" style="stroke:#A80036;stroke-width:1.5;" width="337" x="691" y="759.5"/><ellipse cx="785.75" cy="775.5" fill="#ADD1B2" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M788.7188,781.1406 Q788.1406,781.4375 787.5,781.5781 Q786.8594,781.7344 786.1563,781.7344 Q783.6563,781.7344 782.3281,780.0938 Q781.0156,778.4375 781.0156,775.3125 Q781.0156,772.1875 782.3281,770.5313 Q783.6563,768.875 786.1563,768.875 Q786.8594,768.875 787.5,769.0313 Q788.1563,769.1875 788.7188,769.4844 L788.7188,772.2031 Q788.0938,771.625 787.5,771.3594 Q786.9063,771.0781 786.2813,771.0781 Q784.9375,771.0781 784.25,772.1563 Q783.5625,773.2188 783.5625,775.3125 Q783.5625,777.4063 784.25,778.4844 Q784.9375,779.5469 786.2813,779.5469 Q786.9063,779.5469 787.5,779.2813 Q788.0938,779 788.7188,778.4219 L788.7188,781.1406 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" lengthAdjust="spacing" textLength="139" x="806.25" y="779.6543">LowererImplImperative</text><line style="stroke:#A80036;stroke-width:1.5;" x1="692" x2="1027" y1="791.5" y2="791.5"/><rect fill="none" height="6" style="stroke:#C82930;stroke-width:1.0;" width="6" x="699" y="799.5"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="70" x="711" y="805.7104">class Visitor</text><rect fill="none" height="6" style="stroke:#C82930;stroke-width:1.0;" width="6" x="699" y="812.3047"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="102" x="711" y="818.5151">fiend class Visitor</text><rect fill="none" height="6" style="stroke:#C82930;stroke-width:1.0;" width="6" x="699" y="825.1094"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="181" x="711" y="831.3198">std::shared_ptr&lt;Visitor&gt; visitor</text><rect fill="none" height="6" 
style="stroke:#C82930;stroke-width:1.0;" width="6" x="699" y="837.9141"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="84" x="711" y="844.1245">bool assemble</text><rect fill="none" height="6" style="stroke:#C82930;stroke-width:1.0;" width="6" x="699" y="850.7188"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="77" x="711" y="856.9292">bool compute</text><rect fill="none" height="6" style="stroke:#C82930;stroke-width:1.0;" width="6" x="699" y="863.5234"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="166" x="711" y="869.7339">vars a_bunch_of_other_fields</text><line style="stroke:#A80036;stroke-width:1.5;" x1="692" x2="1027" y1="876.3281" y2="876.3281"/><polygon fill="#FFFF44" points="702,882.3281,706,886.3281,702,890.3281,698,886.3281" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="238" x="711" y="890.5386">virtual ir::Stmt lowerExpr(IndexExpr expr);</text><polygon fill="#FFFF44" points="702,895.1328,706,899.1328,702,903.1328,698,899.1328" style="stroke:#B38D22;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="243" x="711" y="903.3433">virtual ir::Stmt lowerStmt(IndexStmt stmt);</text><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="0" x="715" y="916.1479"/><ellipse cx="702" cy="925.7422" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="276" x="711" y="928.9526">ir::Stmt lower(IndexStmt stmt, std::string name,</text><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="311" x="711" y="941.7573">bool assemble, bool compute, bool pack, bool unpack)</text><path 
d="M581,1201 L581,1256.3984 A0,0 0 0 0 581,1256.3984 L946,1256.3984 A0,0 0 0 0 946,1256.3984 L946,1211 L936,1201 L774.6103,1201 L835.2751,948.5022 L766.6103,1201 L581,1201 A0,0 0 0 0 581,1201 " fill="#FBFB77" filter="url(#fujoep6dbpit)" style="stroke:#A80036;stroke-width:1.0;"/><path d="M936,1201 L936,1211 L946,1211 L936,1201 " fill="#FBFB77" style="stroke:#A80036;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="13" lengthAdjust="spacing" textLength="344" x="587" y="1218.0669">Stmt LowererImplImperative::lower(IndexStmt stmt) {</text><text fill="#000000" font-family="sans-serif" font-size="13" lengthAdjust="spacing" textLength="174" x="595" y="1233.1997">return visitor-&gt;lower(stmt);</text><text fill="#000000" font-family="sans-serif" font-size="13" lengthAdjust="spacing" textLength="8" x="587" y="1248.3325">}</text><!--MD5=[53bf68ed638bcf4718423098b3d480ea]
+class Visitor--><rect codeLine="362" fill="#FEFECE" filter="url(#fujoep6dbpit)" height="380.9219" id="Visitor" style="stroke:#A80036;stroke-width:1.5;" width="253" x="981" y="1038"/><ellipse cx="1083.75" cy="1054" fill="#ADD1B2" rx="11" ry="11" style="stroke:#A80036;stroke-width:1.0;"/><path d="M1086.7188,1059.6406 Q1086.1406,1059.9375 1085.5,1060.0781 Q1084.8594,1060.2344 1084.1563,1060.2344 Q1081.6563,1060.2344 1080.3281,1058.5938 Q1079.0156,1056.9375 1079.0156,1053.8125 Q1079.0156,1050.6875 1080.3281,1049.0313 Q1081.6563,1047.375 1084.1563,1047.375 Q1084.8594,1047.375 1085.5,1047.5313 Q1086.1563,1047.6875 1086.7188,1047.9844 L1086.7188,1050.7031 Q1086.0938,1050.125 1085.5,1049.8594 Q1084.9063,1049.5781 1084.2813,1049.5781 Q1082.9375,1049.5781 1082.25,1050.6563 Q1081.5625,1051.7188 1081.5625,1053.8125 Q1081.5625,1055.9063 1082.25,1056.9844 Q1082.9375,1058.0469 1084.2813,1058.0469 Q1084.9063,1058.0469 1085.5,1057.7813 Q1086.0938,1057.5 1086.7188,1056.9219 L1086.7188,1059.6406 Z " fill="#000000"/><text fill="#000000" font-family="sans-serif" font-size="12" lengthAdjust="spacing" textLength="39" x="1104.25" y="1058.1543">Visitor</text><line style="stroke:#A80036;stroke-width:1.5;" x1="982" x2="1233" y1="1070" y2="1070"/><rect fill="none" height="6" style="stroke:#C82930;stroke-width:1.0;" width="6" x="989" y="1078"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="104" x="1001" y="1084.2104">LowererImpl* impl</text><rect fill="none" height="6" style="stroke:#C82930;stroke-width:1.0;" width="6" x="989" y="1090.8047"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="54" x="1001" y="1097.0151">Expr expr</text><rect fill="none" height="6" style="stroke:#C82930;stroke-width:1.0;" width="6" x="989" y="1103.6094"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="57" x="1001" y="1109.8198">Stmt stmt</text><line 
style="stroke:#A80036;stroke-width:1.5;" x1="982" x2="1233" y1="1116.4141" y2="1116.4141"/><rect fill="#F24D5C" height="6" style="stroke:#C82930;stroke-width:1.0;" width="6" x="989" y="1124.4141"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="227" x="1001" y="1130.6245">void visit(const AssignmentNode* node)</text><rect fill="#F24D5C" height="6" style="stroke:#C82930;stroke-width:1.0;" width="6" x="989" y="1137.2188"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="187" x="1001" y="1143.4292">void visit(const YieldNode* node)</text><rect fill="#F24D5C" height="6" style="stroke:#C82930;stroke-width:1.0;" width="6" x="989" y="1150.0234"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="191" x="1001" y="1156.2339">void visit(const ForallNode* node)</text><rect fill="#F24D5C" height="6" style="stroke:#C82930;stroke-width:1.0;" width="6" x="989" y="1162.8281"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="195" x="1001" y="1169.0386">void visit(const WhereNode* node)</text><rect fill="#F24D5C" height="6" style="stroke:#C82930;stroke-width:1.0;" width="6" x="989" y="1175.6328"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="186" x="1001" y="1181.8433">void visit(const MultiNode* node)</text><rect fill="#F24D5C" height="6" style="stroke:#C82930;stroke-width:1.0;" width="6" x="989" y="1188.4375"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="213" x="1001" y="1194.6479">void visit(const SuchThatNode* node)</text><rect fill="#F24D5C" height="6" style="stroke:#C82930;stroke-width:1.0;" width="6" x="989" y="1201.2422"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="216" x="1001" y="1207.4526">void visit(const SequenceNode* node)</text><rect 
fill="#F24D5C" height="6" style="stroke:#C82930;stroke-width:1.0;" width="6" x="989" y="1214.0469"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="216" x="1001" y="1220.2573">void visit(const AssembleNode* node)</text><rect fill="#F24D5C" height="6" style="stroke:#C82930;stroke-width:1.0;" width="6" x="989" y="1226.8516"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="200" x="1001" y="1233.062">void visit(const AccessNode* node)</text><rect fill="#F24D5C" height="6" style="stroke:#C82930;stroke-width:1.0;" width="6" x="989" y="1239.6563"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="195" x="1001" y="1245.8667">void visit(const LiteralNode* node)</text><rect fill="#F24D5C" height="6" style="stroke:#C82930;stroke-width:1.0;" width="6" x="989" y="1252.4609"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="182" x="1001" y="1258.6714">void visit(const NegNode* node)</text><rect fill="#F24D5C" height="6" style="stroke:#C82930;stroke-width:1.0;" width="6" x="989" y="1265.2656"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="181" x="1001" y="1271.4761">void visit(const AddNode* node)</text><rect fill="#F24D5C" height="6" style="stroke:#C82930;stroke-width:1.0;" width="6" x="989" y="1278.0703"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="182" x="1001" y="1284.2808">void visit(const SubNode* node)</text><rect fill="#F24D5C" height="6" style="stroke:#C82930;stroke-width:1.0;" width="6" x="989" y="1290.875"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="179" x="1001" y="1297.0854">void visit(const MulNode* node)</text><rect fill="#F24D5C" height="6" style="stroke:#C82930;stroke-width:1.0;" width="6" x="989" y="1303.6797"/><text 
fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="177" x="1001" y="1309.8901">void visit(const DivNode* node)</text><rect fill="#F24D5C" height="6" style="stroke:#C82930;stroke-width:1.0;" width="6" x="989" y="1316.4844"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="184" x="1001" y="1322.6948">void visit(const SqrtNode* node)</text><rect fill="#F24D5C" height="6" style="stroke:#C82930;stroke-width:1.0;" width="6" x="989" y="1329.2891"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="186" x="1001" y="1335.4995">void visit(const CastNode* node)</text><rect fill="#F24D5C" height="6" style="stroke:#C82930;stroke-width:1.0;" width="6" x="989" y="1342.0938"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="226" x="1001" y="1348.3042">void visit(const CallIntrinsicNode* node)</text><rect fill="#F24D5C" height="6" style="stroke:#C82930;stroke-width:1.0;" width="6" x="989" y="1354.8984"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="215" x="1001" y="1361.1089">void visit(const ReductionNode* node)</text><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="0" x="1005" y="1373.9136"/><ellipse cx="992" cy="1383.5078" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="208" x="1001" y="1386.7183">Visitor(LowererImplImperative* impl)</text><ellipse cx="992" cy="1396.3125" fill="#84BE84" rx="3" ry="3" style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="157" x="1001" y="1399.5229">Stmt lower(IndexStmt stmt)</text><ellipse cx="992" cy="1409.1172" fill="#84BE84" rx="3" ry="3" 
style="stroke:#038048;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="11" lengthAdjust="spacing" textLength="152" x="1001" y="1412.3276">Expr lower(IndexExpr expr)</text><path d="M975.5,1587 L975.5,1702.9297 A0,0 0 0 0 975.5,1702.9297 L1239.5,1702.9297 A0,0 0 0 0 1239.5,1702.9297 L1239.5,1597 L1229.5,1587 L1111.5,1587 L1107.5,1419.0758 L1103.5,1587 L975.5,1587 A0,0 0 0 0 975.5,1587 " fill="#FBFB77" filter="url(#fujoep6dbpit)" style="stroke:#A80036;stroke-width:1.0;"/><path d="M1229.5,1587 L1229.5,1597 L1239.5,1597 L1229.5,1587 " fill="#FBFB77" style="stroke:#A80036;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="13" lengthAdjust="spacing" textLength="191" x="981.5" y="1604.0669">Stmt lower(IndexStmt stmt) {</text><text fill="#000000" font-family="sans-serif" font-size="13" lengthAdjust="spacing" textLength="134" x="989.5" y="1619.1997">this-&gt;stmt = Stmt();</text><text fill="#000000" font-family="sans-serif" font-size="13" lengthAdjust="spacing" textLength="219" x="989.5" y="1634.3325">impl-&gt;accessibleIterators.scope();</text><text fill="#000000" font-family="sans-serif" font-size="13" lengthAdjust="spacing" textLength="217" x="989.5" y="1649.4653">IndexStmtVisitorStrict::visit(stmt);</text><text fill="#000000" font-family="sans-serif" font-size="13" lengthAdjust="spacing" textLength="235" x="989.5" y="1664.5981">impl-&gt;accessibleIterators.unscope();</text><text fill="#000000" font-family="sans-serif" font-size="13" lengthAdjust="spacing" textLength="116" x="989.5" y="1679.731">return this-&gt;stmt;</text><text fill="#000000" font-family="sans-serif" font-size="13" lengthAdjust="spacing" textLength="8" x="981.5" y="1694.8638">}</text><!--MD5=[ae51cb2269b8d9d23f4eb16ba4c021c2]
+reverse link Uncopyable to IRNode--><path codeLine="26" d="M820.4411,310.2452 C805.2002,341.1699 780.565,381.9466 747.5,407 C656.8632,475.6756 606.7888,436.4312 503.5,484 C464.9666,501.7462 424.4666,526.9962 393.5999,547.8097 " fill="none" id="Uncopyable-backto-IRNode" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="814.1208,307.2354,828.901,292.0517,826.8155,313.1384,814.1208,307.2354" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[910d35d02fd37b100f27db676215561a]
+reverse link IRNode to BaseStmtNode--><path codeLine="27" d="M307.5211,639.0848 C270.9373,697.3157 214.3898,787.3234 187.5844,829.9899 " fill="none" id="IRNode-backto-BaseStmtNode" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="301.6118,635.3322,318.1787,622.1209,313.4664,642.7799,301.6118,635.3322" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[d15399140a2ba01317e3af6505b2f237]
+reverse link IRNode to BaseExprNode--><path codeLine="28" d="M348.9536,642.2864 C356.1642,697.705 366.8158,779.5704 372.5311,823.496 " fill="none" id="IRNode-backto-BaseExprNode" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="341.9689,642.8569,346.3299,622.1209,355.8519,641.0505,341.9689,642.8569" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[6fc58d354eb039aa71a812145e71cc51]
+reverse link BaseStmtNode to StmtNode--><path codeLine="29" d="M167.1367,898.6343 C157.8757,975.7069 139.117,1131.8206 131.1797,1197.8764 " fill="none" id="BaseStmtNode-backto-StmtNode" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="160.246,897.3055,169.5821,878.2834,174.146,898.9757,160.246,897.3055" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[b34aab7e28a3dad0196efbcf2402ad4b]
+reverse link BaseExprNode to ExprNode--><path codeLine="30" d="M377.9886,904.6786 C380.3078,983.6383 384.7133,1133.6261 386.6028,1197.9549 " fill="none" id="BaseExprNode-backto-ExprNode" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="370.9905,904.8438,377.4002,884.6469,384.9845,904.4327,370.9905,904.8438" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[6982bf9bb66925c6ba3afaae707aa75e]
+reverse link IntrusivePtr to IRHandle--><path codeLine="38" d="M613.7823,69.7043 C517.4991,114.1798 345.0472,193.8394 250.6748,237.4323 " fill="none" id="IntrusivePtr-backto-IRHandle" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="610.8857,63.3316,631.9777,61.2994,616.7566,76.0412,610.8857,63.3316" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[59037bcbfab734c29d5f29f29345990a]
+reverse link IRHandle to Expr--><path codeLine="39" d="M161.5974,316.9128 C141.1096,360.6844 110.2697,426.612 83.5,484 C71.2362,510.2908 57.2078,540.4477 47.6932,560.9138 " fill="none" id="IRHandle-backto-Expr" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="155.2791,313.8992,170.0982,298.7534,167.9586,319.8347,155.2791,313.8992" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[2e3a655b6bb23e034db67107221f412a]
+reverse link IRHandle to Stmt--><path codeLine="40" d="M176.0078,318.793 C164.419,388.1073 144.1107,509.5737 135.5197,560.9576 " fill="none" id="IRHandle-backto-Stmt" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="169.1677,317.2552,179.37,298.6833,182.976,319.5639,169.1677,317.2552" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[2a0bead0c725d8d45864c262e97ce783]
+reverse link IRHandle to IRNode--><path codeLine="42" d="M205.5204,310.4425 C236.4803,372.9538 293.8347,488.7585 323.1741,547.998 " fill="none" id="IRHandle-backto-IRNode" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="#A80036" points="199.6965,298.6833,198.7749,305.8353,205.0223,309.4367,205.9439,302.2848,199.6965,298.6833" style="stroke:#A80036;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="13" lengthAdjust="spacing" textLength="54" x="273.5" y="450.0669">contains</text><text fill="#000000" font-family="sans-serif" font-size="13" lengthAdjust="spacing" textLength="8" x="193.9008" y="318.2998">1</text><text fill="#000000" font-family="sans-serif" font-size="13" lengthAdjust="spacing" textLength="8" x="311.0578" y="536.9081">1</text><!--MD5=[58cf07e15c029e621d8edfba03fa64a2]
+reverse link IRVisitorStrict to IRVisitor--><path codeLine="94" d="M2743.085,79.3089 C2719.9753,94.626 2693.8639,112.3013 2670.5,129 C2620.0401,165.0649 2563.4947,208.7917 2527.0182,237.4501 " fill="none" id="IRVisitorStrict-backto-IRVisitor" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="2739.5252,73.2715,2760.0768,68.1107,2747.2291,84.9613,2739.5252,73.2715" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[75bfe22e38cc091da0fbb6f74406d06e]
+reverse link IRVisitorStrict to IRPrinter--><path codeLine="95" d="M2880.6802,78.0481 C2916.73,98.0228 2962.2732,123.2577 3008.2876,148.7537 " fill="none" id="IRVisitorStrict-backto-IRPrinter" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="2876.843,83.9248,2862.7416,68.1086,2883.6283,71.679,2876.843,83.9248" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[85c48ab67ed60544567d89d58f94870d]
+reverse link IRVisitorStrict to IRRewriter--><path codeLine="96" d="M2807.5,88.3035 C2807.5,120.7899 2807.5,163.3447 2807.5,198.8875 " fill="none" id="IRVisitorStrict-backto-IRRewriter" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="2800.5001,88.1087,2807.5,68.1086,2814.5001,88.1086,2800.5001,88.1087" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[4028845cddf1255230d8af57eef922a1]
+reverse link IRVisitor to IRVerifier--><path codeLine="97" d="M2464.2278,317.0083 C2430.0567,386.0033 2369.0761,509.1298 2343.4074,560.9576 " fill="none" id="IRVisitor-backto-IRVerifier" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="2458.1543,313.4989,2473.3035,298.6833,2470.6999,319.7124,2458.1543,313.4989" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[4a94ddfc410010d5e6723affae8cc10d]
+reverse link IRRewriter to ExpressionSimplifier--><path codeLine="102" d="M2720.1218,351.0244 C2678.6358,390.8516 2628.5995,439.4826 2584.5,484 C2558.962,509.78 2530.3923,540.0247 2511.1115,560.6511 " fill="none" id="IRRewriter-backto-ExpressionSimplifier" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="2715.4205,345.8343,2734.7035,337.0499,2725.1073,355.9421,2715.4205,345.8343" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[949c6b7bfd4235ac14da91ff0f1abad4]
+reverse link IRRewriter to RemoveRedundantStatements--><path codeLine="109" d="M2781.238,356.5644 C2760.6434,426.0163 2733.4041,517.8765 2720.6551,560.8705 " fill="none" id="IRRewriter-backto-RemoveRedundantStatements" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="2774.5903,354.3599,2786.9875,337.1753,2788.0127,358.3401,2774.5903,354.3599" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[429f4895fdac785b3cbc97ff72ee188d]
+reverse link IRRewriter to RemoveRedundantLoops--><path codeLine="110" d="M2847.8795,355.6733 C2879.9145,425.2286 2922.5003,517.6924 2942.3867,560.8705 " fill="none" id="IRRewriter-backto-RemoveRedundantLoops" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="2841.3685,358.2695,2839.3599,337.1753,2854.0847,352.4128,2841.3685,358.2695" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[68110a92c5ea6b91d77066a3850c99de]
+reverse link IRRewriter to RemoveDuplicateBody--><path codeLine="111" d="M2908.6453,349.9792 C2956.6158,389.6232 3014.286,438.4014 3064.5,484 C3092.4965,509.4232 3123.3012,539.9945 3143.8609,560.7956 " fill="none" id="IRRewriter-backto-RemoveDuplicateBody" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="2904.088,355.2943,2893.1039,337.1739,2912.9906,344.4895,2904.088,355.2943" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[ae50f65b4cbe15dbcd3dbdd752b04bad]
+reverse link IRPrinter to CodeGen--><path codeLine="120" d="M3277.8266,425.9959 C3295.9202,478.6165 3314.0392,531.3112 3324.2066,560.8807 " fill="none" id="IRPrinter-backto-CodeGen" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="3271.1927,428.2301,3271.309,407.0408,3284.4319,423.6778,3271.1927,428.2301" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[319cfb8dc735ef16cddbcdc70ff637f3]
+reverse link CodeGen to CodeGen_C--><path codeLine="121" d="M3308.9871,626.8873 C3276.9808,683.9051 3220.7457,784.0854 3195.0269,829.9024 " fill="none" id="CodeGen-backto-CodeGen_C" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="3302.9395,623.3601,3318.8334,609.3464,3315.1476,630.213,3302.9395,623.3601" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[aa96aa54faae34d6ca9ace804ab720b5]
+reverse link CodeGen to CodeGen_ISPC--><path codeLine="122" d="M3357.3479,626.516 C3391.4351,683.4693 3451.5956,783.9858 3479.0773,829.9024 " fill="none" id="CodeGen-backto-CodeGen_ISPC" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="3351.3364,630.1024,3347.0716,609.3464,3363.3492,622.9126,3351.3364,630.1024" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[442ed43531516b32839cb3faf9b2f28c]
+reverse link CodeGen to CodeGen_CUDA--><path codeLine="123" d="M3332.5,629.5199 C3332.5,686.9415 3332.5,784.7827 3332.5,829.9024 " fill="none" id="CodeGen-backto-CodeGen_CUDA" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="3325.5001,629.3464,3332.5,609.3464,3339.5001,629.3464,3325.5001,629.3464" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[e163d187ef5eaf663efea2335f5ab426]
+reverse link Manageable to IndexStmtNode--><path codeLine="135" d="M1257.9638,310.6351 C1244.4457,339.9959 1223.6542,378.6672 1197.5,407 C1172.0822,434.5351 1162.6727,441.0723 1127.5,454 C1016.6611,494.7386 975.8786,447.724 863.5,484 C813.372,500.1814 761.462,531.0819 726.345,554.4621 " fill="none" id="Manageable-backto-IndexStmtNode" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="1251.6817,307.531,1266.1369,292.0374,1264.4987,313.1636,1251.6817,307.531" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[d7ac6dc268d898e1c32d87af860f66f6]
+reverse link Uncopyable to IndexStmtNode--><path codeLine="136" d="M817.879,310.1732 C786.2624,374.8344 726.5439,496.9683 698.6147,554.088 " fill="none" id="Uncopyable-backto-IndexStmtNode" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="811.6488,306.9791,826.7226,292.0867,824.2258,313.1288,811.6488,306.9791" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[648f85c83359671aa27d8bdab5afe684]
+reverse link Manageable to IndexExprNode--><path codeLine="137" d="M1297.8505,310.1732 C1332.1189,374.8344 1396.846,496.9683 1427.1176,554.088 " fill="none" id="Manageable-backto-IndexExprNode" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="1291.4456,313.0363,1288.2652,292.0867,1303.8158,306.4805,1291.4456,313.0363" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[81b6c5025c45f10b02aba636448b0629]
+reverse link Uncopyable to IndexExprNode--><path codeLine="138" d="M854.8121,310.9496 C869.2284,342.7749 893.4553,384.3933 928.5,407 C1003.6621,455.4857 1040.645,415.6362 1127.5,437 C1189.6011,452.2751 1205.1592,457.804 1263.5,484 C1309.4283,504.6226 1359.1614,533.0311 1394.6887,554.4675 " fill="none" id="Uncopyable-backto-IndexExprNode" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="848.2286,313.3522,846.9174,292.2032,861.1311,307.9185,848.2286,313.3522" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[4a1ee9a433488db7e06ed4f91810d452]
+reverse link IntrusivePtr to IndexStmt--><path codeLine="143" d="M683.5,88.2338 C683.5,136.5801 683.5,207.2721 683.5,243.9383 " fill="none" id="IntrusivePtr-backto-IndexStmt" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="676.5001,88.1087,683.5,68.1086,690.5001,88.1086,676.5001,88.1087" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[be16824624a74e7da7bb67b6f377f820]
+reverse link IndexStmt to IndexStmtNode--><path codeLine="144" d="M683.5,305.2739 C683.5,368.4736 683.5,495.4911 683.5,554.088 " fill="none" id="IndexStmt-backto-IndexStmtNode" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="#A80036" points="683.5,292.0867,679.5,298.0867,683.5,304.0867,687.5,298.0867,683.5,292.0867" style="stroke:#A80036;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="13" lengthAdjust="spacing" textLength="8" x="674.475" y="312.2647">1</text><text fill="#000000" font-family="sans-serif" font-size="13" lengthAdjust="spacing" textLength="8" x="674.675" y="543.2663">1</text><!--MD5=[8120730d6b32269f0970ddfe15f91d14]
+reverse link IntrusivePtr to IndexExpr--><path codeLine="145" d="M755.3761,39.2555 C906.9425,44.2375 1253.1256,62.7483 1347.5,129 C1386.7641,156.5638 1406.5439,211.9266 1415.072,243.8366 " fill="none" id="IntrusivePtr-backto-IndexExpr" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="755.1028,46.2505,735.3276,38.6387,755.5334,32.2571,755.1028,46.2505" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[1972b08ae664b2d2310d03537cd7a5e1]
+reverse link IndexExpr to IndexExprNode--><path codeLine="146" d="M1423.2044,305.2739 C1427.7899,368.4736 1437.0057,495.4911 1441.2572,554.088 " fill="none" id="IndexExpr-backto-IndexExprNode" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="#A80036" points="1422.2476,292.0867,1418.6923,298.3605,1423.1161,304.0552,1426.6714,297.7815,1422.2476,292.0867" style="stroke:#A80036;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="13" lengthAdjust="spacing" textLength="8" x="1413.6704" y="312.2647">1</text><text fill="#000000" font-family="sans-serif" font-size="13" lengthAdjust="spacing" textLength="8" x="1431.9696" y="543.2663">1</text><!--MD5=[b26c9c8d3a5b29d8271f45d68507eadd]
+reverse link IndexExprVisitorStrict to IndexNotationVisitorStrict--><path codeLine="295" d="M1632.9995,699.6227 C1626.9972,705.0539 1621.1388,710.5255 1615.5,716 C1577.5728,752.8222 1539.4548,801.3814 1518.0255,829.9928 " fill="none" id="IndexExprVisitorStrict-backto-IndexNotationVisitorStrict" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="1628.5467,694.2161,1648.1726,686.2269,1637.8124,704.7112,1628.5467,694.2161" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[7234130c38761532599f2f7b36911e2f]
+reverse link IndexStmtVisitorStrict to IndexNotationVisitorStrict--><path codeLine="296" d="M1966.5197,675.467 C1958.1897,679.3054 1949.8174,682.8546 1941.5,686 C1803.2176,738.2949 1746.3714,680.1665 1614.5,747 C1575.0345,767.0014 1539.609,804.9113 1519.0786,829.8154 " fill="none" id="IndexStmtVisitorStrict-backto-IndexNotationVisitorStrict" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="1963.6036,669.1012,1984.6506,666.6464,1969.7282,681.6905,1963.6036,669.1012" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[d405f4886b031ffa84d6c62850f61924]
+reverse link IndexNotationVisitorStrict to IndexNotationVisitor--><path codeLine="297" d="M1526.3846,895.2501 C1552.042,936.1383 1593.0382,1001.4705 1631.9613,1063.499 " fill="none" id="IndexNotationVisitorStrict-backto-IndexNotationVisitor" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="1520.4391,898.945,1515.7379,878.2834,1532.2978,891.5037,1520.4391,898.945" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[9298265fb9c2b46a51654cec95663d88]
+reverse link IndexNotationVisitorStrict to IndexNotationPrinter--><path codeLine="298" d="M1491.8998,898.1203 C1484.7524,934.7873 1474.0689,989.595 1463.4478,1044.0828 " fill="none" id="IndexNotationVisitorStrict-backto-IndexNotationPrinter" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="1485.0692,896.5747,1495.7665,878.2834,1498.8106,899.2533,1485.0692,896.5747" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[2164c356666f1a365c7584220eeab5ce]
+reverse link IndexNotationVisitor to Matcher--><path codeLine="299" d="M1735.5,1413.8401 C1735.5,1494.6098 1735.5,1580.0329 1735.5,1620.7139 " fill="none" id="IndexNotationVisitor-backto-Matcher" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="1728.5001,1413.7101,1735.5,1393.71,1742.5001,1413.71,1728.5001,1413.7101" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[81658007f5a451634c394e4129ce2328]
+reverse link IndexExprVisitorStrict to IndexExprRewriterStrict--><path codeLine="301" d="M1782.5,706.5527 C1782.5,720.0234 1782.5,733.669 1782.5,746.9421 " fill="none" id="IndexExprVisitorStrict-backto-IndexExprRewriterStrict" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="1775.5001,706.3141,1782.5,686.3141,1789.5001,706.314,1775.5001,706.3141" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[ad82d38a65963623a4dbc072e2395c0a]
+reverse link IndexStmtVisitorStrict to IndexStmtRewriterStrict--><path codeLine="302" d="M2113.9155,687.085 C2114.78,712.9253 2115.7014,740.463 2116.5412,765.5657 " fill="none" id="IndexStmtVisitorStrict-backto-IndexStmtRewriterStrict" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="2106.9097,687.0274,2113.237,666.8045,2120.9019,686.5592,2106.9097,687.0274" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[17cef803f955afc58233a06ff8ed6ced]
+reverse link IndexExprRewriterStrict to IndexNotationRewriterStrict--><path codeLine="303" d="M1876.8558,977.1228 C1939.4095,1058.7478 2016.1337,1158.8635 2050.856,1204.1719 " fill="none" id="IndexExprRewriterStrict-backto-IndexNotationRewriterStrict" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="1871.16,981.1985,1864.5506,961.066,1882.2722,972.6826,1871.16,981.1985" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[f1a4c69017cc3acf68d02aa5998e72c7]
+reverse link IndexStmtRewriterStrict to IndexNotationRewriterStrict--><path codeLine="304" d="M2105.0534,962.205 C2093.838,1046.2084 2079.1518,1156.2078 2072.7352,1204.2683 " fill="none" id="IndexStmtRewriterStrict-backto-IndexNotationRewriterStrict" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="2098.1288,961.1748,2107.714,942.277,2112.0056,963.0275,2098.1288,961.1748" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[ea60607216d1741e9a004dc3b2ad9bc4]
+reverse link IndexNotationRewriterStrict to IndexNotationRewriter--><path codeLine="306" d="M2069.5,1272.8869 C2069.5,1322.2639 2069.5,1404.9692 2069.5,1479.8852 " fill="none" id="IndexNotationRewriterStrict-backto-IndexNotationRewriter" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="2062.5001,1272.6931,2069.5,1252.6931,2076.5001,1272.693,2062.5001,1272.6931" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[ce0e28a833df6d388c2232cca949e33a]
+reverse link Uncopyable to LowererImpl--><path codeLine="357" d="M864.1964,309.4349 C881.7851,337.3534 905.9227,374.8161 928.5,407 C947.767,434.4652 969.4267,463.6921 989.6625,490.3547 " fill="none" id="Uncopyable-backto-LowererImpl" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="858.0963,312.8828,853.4161,292.2165,869.9625,305.4535,858.0963,312.8828" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[595f18298affe1361dad6c88d07b3ae8]
+reverse link Lowerer to LowererImpl--><path codeLine="358" d="M1063.5,311.7072 C1063.5,357.6007 1063.5,431.1895 1063.5,490.4492 " fill="none" id="Lowerer-backto-LowererImpl" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="#A80036" points="1063.5,298.6833,1059.5,304.6833,1063.5,310.6833,1067.5,304.6833,1063.5,298.6833" style="stroke:#A80036;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="13" lengthAdjust="spacing" textLength="54" x="1064.5" y="450.0669">contains</text><text fill="#000000" font-family="sans-serif" font-size="13" lengthAdjust="spacing" textLength="8" x="1054.7125" y="318.2998">1</text><text fill="#000000" font-family="sans-serif" font-size="13" lengthAdjust="spacing" textLength="8" x="1055.1813" y="479.3951">1</text><!--MD5=[76c844881f8770258bad5028aba6ca47]
+reverse link IndexNotationVisitorStrict to Visitor--><path codeLine="396" d="M1453.9953,890.688 C1409.0705,926.7585 1339.7531,984.0918 1283.5,1038 C1267.164,1053.655 1250.4599,1070.5041 1234.1618,1087.4753 " fill="none" id="IndexNotationVisitorStrict-backto-Visitor" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="1449.6605,885.1915,1469.657,878.1815,1458.3965,896.1315,1449.6605,885.1915" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[f3857c0b64c12f6416059a5dcd8ca3ae]
+reverse link LowererImpl to Visitor--><path codeLine="397" d="M1070.9927,694.5811 C1077.4133,788.4824 1086.8084,925.8861 1094.4598,1037.7864 " fill="none" id="LowererImpl-backto-Visitor" style="stroke:#A80036;stroke-width:1.0;"/><ellipse cx="1070.5145" cy="687.5863" fill="#FFFFFF" rx="8" ry="8" style="stroke:#A80036;stroke-width:1.0;"/><line style="stroke:#A80036;stroke-width:1.0;" x1="1071.0602" x2="1069.9687" y1="695.5676" y2="679.6049"/><line style="stroke:#A80036;stroke-width:1.0;" x1="1062.5331" x2="1078.4958" y1="688.132" y2="687.0405"/><text fill="#000000" font-family="sans-serif" font-size="13" lengthAdjust="spacing" textLength="54" x="1090.5" y="858.5669">contains</text><text fill="#000000" font-family="sans-serif" font-size="13" lengthAdjust="spacing" textLength="8" x="1062.1577" y="699.5">1</text><text fill="#000000" font-family="sans-serif" font-size="13" lengthAdjust="spacing" textLength="8" x="1085.7942" y="1027.1147">1</text><!--MD5=[44db9126e684c102525c4f7b853b119b]
+reverse link Visitor to LowererImpl--><path codeLine="398" d="M1157.8948,1024.7712 C1171.4272,938.063 1176.6901,836.4934 1153.5,747 C1147.5682,724.1083 1137.6236,701.0075 1126.4596,679.7046 " fill="none" id="Visitor-backto-LowererImpl" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="#A80036" points="1155.7945,1037.728,1160.703,1032.4454,1157.7146,1025.8826,1152.8061,1031.1653,1155.7945,1037.728" style="stroke:#A80036;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="13" lengthAdjust="spacing" textLength="54" x="1170.5" y="858.5669">contains</text><text fill="#000000" font-family="sans-serif" font-size="13" lengthAdjust="spacing" textLength="8" x="1149.5248" y="1027.054">1</text><text fill="#000000" font-family="sans-serif" font-size="13" lengthAdjust="spacing" textLength="8" x="1122.7894" y="699.2875">1</text><!--MD5=[7cec337d4232ea69c4a4e115b7f1c391]
+reverse link LowererImpl to LowererImplImperative--><path codeLine="400" d="M979.2866,696.0461 C963.4034,716.9902 946.9263,738.7172 931.4609,759.1104 " fill="none" id="LowererImpl-backto-LowererImplImperative" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="none" points="973.9145,691.5453,991.5772,679.8393,985.0696,700.005,973.9145,691.5453" style="stroke:#A80036;stroke-width:1.0;"/><!--MD5=[d416585c3fdacb879af8752baa2327bb]
+reverse link LowererImplImperative to Visitor--><path codeLine="401" d="M879.1422,963.0891 C885.3942,979.2794 893.6358,994.8993 904.5,1008 C923.2784,1030.6441 940.9429,1019.1172 963.5,1038 C969.3812,1042.9232 975.1644,1048.1297 980.833,1053.5529 " fill="none" id="LowererImplImperative-backto-Visitor" style="stroke:#A80036;stroke-width:1.0;"/><ellipse cx="876.7795" cy="956.3169" fill="#FFFFFF" rx="8" ry="8" style="stroke:#A80036;stroke-width:1.0;"/><line style="stroke:#A80036;stroke-width:1.0;" x1="879.4147" x2="874.1442" y1="963.8704" y2="948.7634"/><line style="stroke:#A80036;stroke-width:1.0;" x1="869.226" x2="884.333" y1="958.9522" y2="953.6816"/><text fill="#000000" font-family="sans-serif" font-size="13" lengthAdjust="spacing" textLength="54" x="905.5" y="1004.0669">contains</text><text fill="#000000" font-family="sans-serif" font-size="13" lengthAdjust="spacing" textLength="8" x="866.9259" y="968.3506">1</text><text fill="#000000" font-family="sans-serif" font-size="13" lengthAdjust="spacing" textLength="8" x="964.8705" y="1039.5084">1</text><!--MD5=[cd8dd7ca9f18b6220f591f64794d3d39]
+reverse link Visitor to LowererImplImperative--><path codeLine="402" d="M988.0443,1026.5409 C980.2164,1014.4253 972.331,1002.497 964.5,991 C955.0529,977.1302 944.7145,962.7728 934.3268,948.8048 " fill="none" id="Visitor-backto-LowererImplImperative" style="stroke:#A80036;stroke-width:1.0;"/><polygon fill="#A80036" points="995.2137,1037.7132,995.3397,1030.5032,988.7328,1027.6138,988.6068,1034.8238,995.2137,1037.7132" style="stroke:#A80036;stroke-width:1.0;"/><text fill="#000000" font-family="sans-serif" font-size="13" lengthAdjust="spacing" textLength="54" x="974.5" y="1004.0669">contains</text><text fill="#000000" font-family="sans-serif" font-size="13" lengthAdjust="spacing" textLength="8" x="982.6713" y="1027.0386">1</text><text fill="#000000" font-family="sans-serif" font-size="13" lengthAdjust="spacing" textLength="8" x="931.5281" y="968.395">1</text><!--MD5=[59ff6f047f3ce21caa7eb37a22acd23c]
+@startuml taco
+scale 1
+
+
+class IntrusivePtr {
+    +T *ptr
+}
+class Uncopyable {}
+
+class IRNode {
+    +virtual void accept(IRVisitorStrict *v) const = 0
+    +virtual IRNodeType type_info() const = 0;
+}
+
+class BaseStmtNode {}
+class BaseExprNode {
+    +Datatype type
+}
+
+class StmtNode {
+    +void accept(IRVisitorStrict *v) const
+}
+class ExprNode {
+    +void accept(IRVisitorStrict *v) const
+}
+
+Uncopyable <|- - IRNode
+IRNode <|- - BaseStmtNode
+IRNode <|- - BaseExprNode
+BaseStmtNode <|- - StmtNode
+BaseExprNode <|- - ExprNode
+
+class IRHandle {
+    +void accept(IRVisitorStrict *v) const
+}
+class Expr {}
+class Stmt {}
+
+IntrusivePtr <|- - IRHandle
+IRHandle <|- - Expr
+IRHandle <|- - Stmt
+
+IRHandle "1" *- - "1" IRNode : contains
+
+
+
+' this class is abstract but plantuml version does not support interface keyword
+interface IRVisitorStrict {
+    +virtual void visit(const IRNode*) const = 0
+}
+
+/' 
+IRVisitor is neither an interface nor abstract because it
+has no pure virtual methods
+'/
+class IRVisitor {
+    +virtual void visit(const IRNode*)
+}
+
+class IRRewriter {
+    ' protected fields and methods
+    #Expr expr 
+    #Stmt stmt
+
+    #virtual void visit(const ExprNode* op)
+    #virtual void visit(const StmtNode* op)
+
+    ' public fields and methods
+    +Expr rewrite(Expr)
+    +Stmt rewrite(Stmt)
+}
+class IRPrinter {
+    #std::ostream &stream
+    #std::ostream &stream2
+    #int indent
+    #bool color
+    #bool simplify
+    #enum Precedence
+    #Precedence parentPrecedence = BOTTOM
+    #NameGenerator varNameGenerator
+    #scopedMap<Expr, std::string> varNames
+
+    #void doIndent()
+    #void printBinOp(Expr a, Expr b, std::string op, Precedence precedence)
+    #void fewMoreMethods()
+    
+    #virtual void visit(const ExprNode*)
+    #virtual void visit(const StmtNode*)
+
+    +setColor(bool color)
+    +print(Stmt)
+}
+class IRVerifier {}
+
+IRVisitorStrict <|- - IRVisitor
+IRVisitorStrict <|- - IRPrinter
+IRVisitorStrict <|- - IRRewriter
+IRVisitor <|- - IRVerifier
+
+' Inheritance from IRRewriter
+' simplifier for ir::Expr
+class ExpressionSimplifier {}
+IRRewriter <|- - ExpressionSimplifier
+
+' simplifiers for ir::Stmt
+class RemoveRedundantStatements {}
+class RemoveRedundantLoops {}
+class RemoveDuplicateBody {}
+
+IRRewriter <|- - RemoveRedundantStatements
+IRRewriter <|- - RemoveRedundantLoops
+IRRewriter <|- - RemoveDuplicateBody
+
+
+' Inheritance from IRPrinter
+class CodeGen {}
+class CodeGen_C {}
+class CodeGen_CUDA {}
+class CodeGen_ISPC {}
+
+IRPrinter <|- - CodeGen
+CodeGen <|- - CodeGen_C
+CodeGen <|- - CodeGen_ISPC
+CodeGen <|- - CodeGen_CUDA
+
+
+class Manageable {}
+class IndexStmtNode {
+    -virtual void accept(IndexStmtVisitorStrict*) const = 0
+}
+class IndexExprNode {
+    -virtual void accept(IndexStmtVisitorStrict*) const = 0
+}
+
+
+Manageable <|- - IndexStmtNode
+Uncopyable <|- - IndexStmtNode
+Manageable <|- - IndexExprNode
+Uncopyable <|- - IndexExprNode
+
+class IndexStmt {}
+class IndexExpr {}
+
+IntrusivePtr <|- - IndexStmt
+IndexStmt "1" *- - "1" IndexStmtNode
+IntrusivePtr <|- - IndexExpr
+IndexExpr "1" *- - "1" IndexExprNode
+
+
+abstract class IndexExprVisitorStrict {
+    +void visit(const IndexStmt&)
+    +virtual void visit(const AccessNode*) = 0
+    +virtual void visit(const LiteralNode*) = 0
+    +virtual void visit(const NegNode*) = 0
+    +virtual void visit(const AddNode*) = 0
+    +virtual void visit(const SubNode*) = 0
+    +virtual void visit(const MulNode*) = 0
+    +virtual void visit(const DivNode*) = 0
+    +virtual void visit(const SqrtNode*) = 0
+    +virtual void visit(const CastNode*) = 0
+    +virtual void visit(const CallIntrinsicNode*) = 0
+    +virtual void visit(const ReductionNode*) = 0
+}
+abstract class IndexStmtVisitorStrict {
+    +void visit(const IndexStmt&)
+    +virtual void visit(const AssignmentNode*) = 0
+    +virtual void visit(const YieldNode*) = 0
+    +virtual void visit(const ForallNode*) = 0
+    +virtual void visit(const WhereNode*) = 0
+    +virtual void visit(const SequenceNode*) = 0
+    +virtual void visit(const AssembleNode*) = 0
+    +virtual void visit(const MultiNode*) = 0
+    +virtual void visit(const SuchThatNode*) = 0
+}
+
+abstract class IndexNotationVisitorStrict {}
+class IndexNotationPrinter {
+    +void print(const IndexExpr& expr)
+    +void print(const IndexStmt& expr)
+
+    ' Index Expressions visit()
+    +void visit(const AccessNode* node)
+    +void visit(const LiteralNode* node)
+    + void visit(const NegNode* node)
+    + void visit(const AddNode* node)
+    + void visit(const SubNode* node)
+    + void visit(const MulNode* node)
+    + void visit(const DivNode* node)
+    + void visit(const SqrtNode* node)
+    + void visit(const CastNode* node)
+    + void visit(const CallIntrinsicNode* node)
+    + void visit(const UnaryExprNode* node)
+    + void visit(const BinaryExprNode* node)
+    + void visit(const ReductionNode* node)
+
+    ' Index Statement visit()
+    + void visit(const AssignmentNode* node)
+    + void visit(const YieldNode* node)
+    + void visit(const ForallNode* node)
+    + void visit(const WhereNode* node)
+    + void visit(const SequenceNode* node)
+    + void visit(const AssembleNode* node)
+    + void visit(const MultiNode* node)
+    + void visit(const SuchThatNode* node)
+}
+class IndexNotationVisitor {
+    ' Index Expressions visit()
+    +virtual void visit(const AccessNode* node)
+    +virtual void visit(const LiteralNode* node)
+    +virtual void visit(const NegNode* node)
+    +virtual void visit(const AddNode* node)
+    +virtual void visit(const SubNode* node)
+    +virtual void visit(const MulNode* node)
+    +virtual void visit(const DivNode* node)
+    +virtual void visit(const SqrtNode* node)
+    +virtual void visit(const CastNode* node)
+    +virtual void visit(const CallIntrinsicNode* node)
+    +virtual void visit(const UnaryExprNode* node)
+    +virtual void visit(const BinaryExprNode* node)
+    +virtual void visit(const ReductionNode* node)
+
+    ' Index Statement visit()
+    +virtual void visit(const AssignmentNode* node)
+    +virtual void visit(const YieldNode* node)
+    +virtual void visit(const ForallNode* node)
+    +virtual void visit(const WhereNode* node)
+    +virtual void visit(const SequenceNode* node)
+    +virtual void visit(const AssembleNode* node)
+    +virtual void visit(const MultiNode* node)
+    +virtual void visit(const SuchThatNode* node)
+}
+class Matcher {
+
+}
+
+abstract class IndexExprRewriterStrict {
+    +IndexExpr rewrite(IndexExpr)
+
+    #IndexExpr expr
+
+    #virtual void visit(const AccessNode* op) = 0
+    #virtual void visit(const LiteralNode* op) = 0
+    #virtual void visit(const NegNode* op) = 0
+    #virtual void visit(const SqrtNode* op) = 0
+    #virtual void visit(const AddNode* op) = 0
+    #virtual void visit(const SubNode* op) = 0
+    #virtual void visit(const MulNode* op) = 0
+    #virtual void visit(const DivNode* op) = 0
+    #virtual void visit(const CastNode* op) = 0
+    #virtual void visit(const CallIntrinsicNode* op) = 0
+    #virtual void visit(const ReductionNode* op) = 0
+}
+abstract class IndexStmtRewriterStrict {
+    +IndexStmt rewrite(IndexStmt)
+
+    #IndexStmt stmt
+
+    #virtual void visit(const AssignmentNode* op) = 0
+    #virtual void visit(const YieldNode* op) = 0
+    #virtual void visit(const ForallNode* op) = 0
+    #virtual void visit(const WhereNode* op) = 0
+    #virtual void visit(const SequenceNode* op) = 0
+    #virtual void visit(const AssembleNode* op) = 0
+    #virtual void visit(const MultiNode* op) = 0
+    #virtual void visit(const SuchThatNode* op) = 0
+}
+abstract class IndexNotationRewriterStrict {}
+class IndexNotationRewriter {
+    ' Index Expressions visit()
+    +virtual void visit(const AccessNode* node)
+    +virtual void visit(const LiteralNode* node)
+    +virtual void visit(const NegNode* node)
+    +virtual void visit(const AddNode* node)
+    +virtual void visit(const SubNode* node)
+    +virtual void visit(const MulNode* node)
+    +virtual void visit(const DivNode* node)
+    +virtual void visit(const SqrtNode* node)
+    +virtual void visit(const CastNode* node)
+    +virtual void visit(const CallIntrinsicNode* node)
+    +virtual void visit(const UnaryExprNode* node)
+    +virtual void visit(const BinaryExprNode* node)
+    +virtual void visit(const ReductionNode* node)
+
+    ' Index Statement visit()
+    +virtual void visit(const AssignmentNode* node)
+    +virtual void visit(const YieldNode* node)
+    +virtual void visit(const ForallNode* node)
+    +virtual void visit(const WhereNode* node)
+    +virtual void visit(const SequenceNode* node)
+    +virtual void visit(const AssembleNode* node)
+    +virtual void visit(const MultiNode* node)
+    +virtual void visit(const SuchThatNode* node)
+}
+
+
+IndexExprVisitorStrict <|- - IndexNotationVisitorStrict
+IndexStmtVisitorStrict <|- - IndexNotationVisitorStrict
+IndexNotationVisitorStrict <|- - IndexNotationVisitor
+IndexNotationVisitorStrict <|- - IndexNotationPrinter
+IndexNotationVisitor <|- - Matcher
+
+IndexExprVisitorStrict <|- - IndexExprRewriterStrict
+IndexStmtVisitorStrict <|- - IndexStmtRewriterStrict
+IndexExprRewriterStrict <|- - IndexNotationRewriterStrict
+IndexStmtRewriterStrict <|- - IndexNotationRewriterStrict
+
+IndexNotationRewriterStrict <|- - IndexNotationRewriter
+
+' - private
+' # protected
+' ~ package private
+' + public
+
+' {static}
+' {abstract} virtual methods
+
+' lowering part - - conversion from IndexExpr and IndexStmt to ir::Expr and ir::Stmt
+class Lowerer {
+    +std::shared_ptr<LowererImpl> impl;
+}
+abstract class LowererImpl {
+    ' protected fields and methods
+    #class Visitor;
+    #friend class Visitor;
+    #std::shared_ptr<Visitor> visitor;
+
+    #virtual ir::Stmt lower(IndexStmt stmt);
+    #virtual ir::Expr lower(IndexExpr expr);
+
+    #virtual ir::Expr lowerExpr(IndexExpr expr) = 0;
+    #virtual ir::Stmt lowerStmt(IndexStmt stmt) = 0;
+
+    ' public fields and methods
+    +virtual ir::Stmt lower(IndexStmt stmt, std::string name, 
+                 bool assemble, bool compute, bool pack, bool unpack) = 0;
+}
+
+class LowererImplImperative {
+    ' private fields and methods
+    -class Visitor
+    -friend class Visitor
+    -std::shared_ptr<Visitor> visitor
+    -bool assemble
+    -bool compute
+    -vars a_bunch_of_other_fields
+
+    ' protected fields and methods
+    #virtual ir::Expr lowerExpr(IndexExpr expr);
+    #virtual ir::Stmt lowerStmt(IndexStmt stmt);
+
+    ' public fields and methods
+    +ir::Stmt lower(IndexStmt stmt, std::string name, 
+                 bool assemble, bool compute, bool pack, bool unpack)
+
+}
+note bottom of LowererImplImperative : Stmt LowererImplImperative::lower(IndexStmt stmt) {\n  return visitor->lower(stmt);\n}
+
+Uncopyable <|- - LowererImpl
+Lowerer "1" *- - "1" LowererImpl : contains
+
+
+' visitor that does the lowering
+class Visitor {
+    ' private fields and methods
+    -LowererImpl* impl
+    -Expr expr
+    -Stmt stmt
+
+    -void visit(const AssignmentNode* node)
+    -void visit(const YieldNode* node)
+    -void visit(const ForallNode* node) 
+    -void visit(const WhereNode* node) 
+    -void visit(const MultiNode* node) 
+    -void visit(const SuchThatNode* node) 
+    -void visit(const SequenceNode* node) 
+    -void visit(const AssembleNode* node) 
+    -void visit(const AccessNode* node) 
+    -void visit(const LiteralNode* node) 
+    -void visit(const NegNode* node) 
+    -void visit(const AddNode* node) 
+    -void visit(const SubNode* node) 
+    -void visit(const MulNode* node) 
+    -void visit(const DivNode* node) 
+    -void visit(const SqrtNode* node) 
+    -void visit(const CastNode* node) 
+    -void visit(const CallIntrinsicNode* node) 
+    -void visit(const ReductionNode* node) 
+
+    ' public fields and methods
+    +Visitor(LowererImplImperative* impl)
+    +Stmt lower(IndexStmt stmt)
+    +Expr lower(IndexExpr expr)
+}
+
+note bottom of Visitor:   Stmt lower(IndexStmt stmt) {\n  this->stmt = Stmt();\n  impl->accessibleIterators.scope();\n  IndexStmtVisitorStrict::visit(stmt);\n  impl->accessibleIterators.unscope();\n  return this->stmt;\n}
+
+IndexNotationVisitorStrict <|- - Visitor
+LowererImpl "1" +- - "1" Visitor : contains
+Visitor "1" *- - "1" LowererImpl : contains
+
+LowererImpl <|- - LowererImplImperative
+LowererImplImperative "1" +- - "1" Visitor : contains
+Visitor "1" *- - "1" LowererImplImperative : contains
+
+@enduml
+
+@startuml taco
+scale 1
+
+
+class IntrusivePtr {
+    +T *ptr
+}
+class Uncopyable {}
+
+class IRNode {
+    +virtual void accept(IRVisitorStrict *v) const = 0
+    +virtual IRNodeType type_info() const = 0;
+}
+
+class BaseStmtNode {}
+class BaseExprNode {
+    +Datatype type
+}
+
+class StmtNode {
+    +void accept(IRVisitorStrict *v) const
+}
+class ExprNode {
+    +void accept(IRVisitorStrict *v) const
+}
+
+Uncopyable <|- - IRNode
+IRNode <|- - BaseStmtNode
+IRNode <|- - BaseExprNode
+BaseStmtNode <|- - StmtNode
+BaseExprNode <|- - ExprNode
+
+class IRHandle {
+    +void accept(IRVisitorStrict *v) const
+}
+class Expr {}
+class Stmt {}
+
+IntrusivePtr <|- - IRHandle
+IRHandle <|- - Expr
+IRHandle <|- - Stmt
+
+IRHandle "1" *- - "1" IRNode : contains
+
+
+
+interface IRVisitorStrict {
+    +virtual void visit(const IRNode*) const = 0
+}
+
+class IRVisitor {
+    +virtual void visit(const IRNode*)
+}
+
+class IRRewriter {
+    #Expr expr 
+    #Stmt stmt
+
+    #virtual void visit(const ExprNode* op)
+    #virtual void visit(const StmtNode* op)
+
+    +Expr rewrite(Expr)
+    +Stmt rewrite(Stmt)
+}
+class IRPrinter {
+    #std::ostream &stream
+    #std::ostream &stream2
+    #int indent
+    #bool color
+    #bool simplify
+    #enum Precedence
+    #Precedence parentPrecedence = BOTTOM
+    #NameGenerator varNameGenerator
+    #scopedMap<Expr, std::string> varNames
+
+    #void doIndent()
+    #void printBinOp(Expr a, Expr b, std::string op, Precedence precedence)
+    #void fewMoreMethods()
+    
+    #virtual void visit(const ExprNode*)
+    #virtual void visit(const StmtNode*)
+
+    +setColor(bool color)
+    +print(Stmt)
+}
+class IRVerifier {}
+
+IRVisitorStrict <|- - IRVisitor
+IRVisitorStrict <|- - IRPrinter
+IRVisitorStrict <|- - IRRewriter
+IRVisitor <|- - IRVerifier
+
+class ExpressionSimplifier {}
+IRRewriter <|- - ExpressionSimplifier
+
+class RemoveRedundantStatements {}
+class RemoveRedundantLoops {}
+class RemoveDuplicateBody {}
+
+IRRewriter <|- - RemoveRedundantStatements
+IRRewriter <|- - RemoveRedundantLoops
+IRRewriter <|- - RemoveDuplicateBody
+
+
+class CodeGen {}
+class CodeGen_C {}
+class CodeGen_CUDA {}
+class CodeGen_ISPC {}
+
+IRPrinter <|- - CodeGen
+CodeGen <|- - CodeGen_C
+CodeGen <|- - CodeGen_ISPC
+CodeGen <|- - CodeGen_CUDA
+
+
+class Manageable {}
+class IndexStmtNode {
+    -virtual void accept(IndexStmtVisitorStrict*) const = 0
+}
+class IndexExprNode {
+    -virtual void accept(IndexStmtVisitorStrict*) const = 0
+}
+
+
+Manageable <|- - IndexStmtNode
+Uncopyable <|- - IndexStmtNode
+Manageable <|- - IndexExprNode
+Uncopyable <|- - IndexExprNode
+
+class IndexStmt {}
+class IndexExpr {}
+
+IntrusivePtr <|- - IndexStmt
+IndexStmt "1" *- - "1" IndexStmtNode
+IntrusivePtr <|- - IndexExpr
+IndexExpr "1" *- - "1" IndexExprNode
+
+
+abstract class IndexExprVisitorStrict {
+    +void visit(const IndexStmt&)
+    +virtual void visit(const AccessNode*) = 0
+    +virtual void visit(const LiteralNode*) = 0
+    +virtual void visit(const NegNode*) = 0
+    +virtual void visit(const AddNode*) = 0
+    +virtual void visit(const SubNode*) = 0
+    +virtual void visit(const MulNode*) = 0
+    +virtual void visit(const DivNode*) = 0
+    +virtual void visit(const SqrtNode*) = 0
+    +virtual void visit(const CastNode*) = 0
+    +virtual void visit(const CallIntrinsicNode*) = 0
+    +virtual void visit(const ReductionNode*) = 0
+}
+abstract class IndexStmtVisitorStrict {
+    +void visit(const IndexStmt&)
+    +virtual void visit(const AssignmentNode*) = 0
+    +virtual void visit(const YieldNode*) = 0
+    +virtual void visit(const ForallNode*) = 0
+    +virtual void visit(const WhereNode*) = 0
+    +virtual void visit(const SequenceNode*) = 0
+    +virtual void visit(const AssembleNode*) = 0
+    +virtual void visit(const MultiNode*) = 0
+    +virtual void visit(const SuchThatNode*) = 0
+}
+
+abstract class IndexNotationVisitorStrict {}
+class IndexNotationPrinter {
+    +void print(const IndexExpr& expr)
+    +void print(const IndexStmt& expr)
+
+    +void visit(const AccessNode* node)
+    +void visit(const LiteralNode* node)
+    + void visit(const NegNode* node)
+    + void visit(const AddNode* node)
+    + void visit(const SubNode* node)
+    + void visit(const MulNode* node)
+    + void visit(const DivNode* node)
+    + void visit(const SqrtNode* node)
+    + void visit(const CastNode* node)
+    + void visit(const CallIntrinsicNode* node)
+    + void visit(const UnaryExprNode* node)
+    + void visit(const BinaryExprNode* node)
+    + void visit(const ReductionNode* node)
+
+    + void visit(const AssignmentNode* node)
+    + void visit(const YieldNode* node)
+    + void visit(const ForallNode* node)
+    + void visit(const WhereNode* node)
+    + void visit(const SequenceNode* node)
+    + void visit(const AssembleNode* node)
+    + void visit(const MultiNode* node)
+    + void visit(const SuchThatNode* node)
+}
+class IndexNotationVisitor {
+    +virtual void visit(const AccessNode* node)
+    +virtual void visit(const LiteralNode* node)
+    +virtual void visit(const NegNode* node)
+    +virtual void visit(const AddNode* node)
+    +virtual void visit(const SubNode* node)
+    +virtual void visit(const MulNode* node)
+    +virtual void visit(const DivNode* node)
+    +virtual void visit(const SqrtNode* node)
+    +virtual void visit(const CastNode* node)
+    +virtual void visit(const CallIntrinsicNode* node)
+    +virtual void visit(const UnaryExprNode* node)
+    +virtual void visit(const BinaryExprNode* node)
+    +virtual void visit(const ReductionNode* node)
+
+    +virtual void visit(const AssignmentNode* node)
+    +virtual void visit(const YieldNode* node)
+    +virtual void visit(const ForallNode* node)
+    +virtual void visit(const WhereNode* node)
+    +virtual void visit(const SequenceNode* node)
+    +virtual void visit(const AssembleNode* node)
+    +virtual void visit(const MultiNode* node)
+    +virtual void visit(const SuchThatNode* node)
+}
+class Matcher {
+
+}
+
+abstract class IndexExprRewriterStrict {
+    +IndexExpr rewrite(IndexExpr)
+
+    #IndexExpr expr
+
+    #virtual void visit(const AccessNode* op) = 0
+    #virtual void visit(const LiteralNode* op) = 0
+    #virtual void visit(const NegNode* op) = 0
+    #virtual void visit(const SqrtNode* op) = 0
+    #virtual void visit(const AddNode* op) = 0
+    #virtual void visit(const SubNode* op) = 0
+    #virtual void visit(const MulNode* op) = 0
+    #virtual void visit(const DivNode* op) = 0
+    #virtual void visit(const CastNode* op) = 0
+    #virtual void visit(const CallIntrinsicNode* op) = 0
+    #virtual void visit(const ReductionNode* op) = 0
+}
+abstract class IndexStmtRewriterStrict {
+    +IndexStmt rewrite(IndexStmt)
+
+    #IndexStmt stmt
+
+    #virtual void visit(const AssignmentNode* op) = 0
+    #virtual void visit(const YieldNode* op) = 0
+    #virtual void visit(const ForallNode* op) = 0
+    #virtual void visit(const WhereNode* op) = 0
+    #virtual void visit(const SequenceNode* op) = 0
+    #virtual void visit(const AssembleNode* op) = 0
+    #virtual void visit(const MultiNode* op) = 0
+    #virtual void visit(const SuchThatNode* op) = 0
+}
+abstract class IndexNotationRewriterStrict {}
+class IndexNotationRewriter {
+    +virtual void visit(const AccessNode* node)
+    +virtual void visit(const LiteralNode* node)
+    +virtual void visit(const NegNode* node)
+    +virtual void visit(const AddNode* node)
+    +virtual void visit(const SubNode* node)
+    +virtual void visit(const MulNode* node)
+    +virtual void visit(const DivNode* node)
+    +virtual void visit(const SqrtNode* node)
+    +virtual void visit(const CastNode* node)
+    +virtual void visit(const CallIntrinsicNode* node)
+    +virtual void visit(const UnaryExprNode* node)
+    +virtual void visit(const BinaryExprNode* node)
+    +virtual void visit(const ReductionNode* node)
+
+    +virtual void visit(const AssignmentNode* node)
+    +virtual void visit(const YieldNode* node)
+    +virtual void visit(const ForallNode* node)
+    +virtual void visit(const WhereNode* node)
+    +virtual void visit(const SequenceNode* node)
+    +virtual void visit(const AssembleNode* node)
+    +virtual void visit(const MultiNode* node)
+    +virtual void visit(const SuchThatNode* node)
+}
+
+
+IndexExprVisitorStrict <|- - IndexNotationVisitorStrict
+IndexStmtVisitorStrict <|- - IndexNotationVisitorStrict
+IndexNotationVisitorStrict <|- - IndexNotationVisitor
+IndexNotationVisitorStrict <|- - IndexNotationPrinter
+IndexNotationVisitor <|- - Matcher
+
+IndexExprVisitorStrict <|- - IndexExprRewriterStrict
+IndexStmtVisitorStrict <|- - IndexStmtRewriterStrict
+IndexExprRewriterStrict <|- - IndexNotationRewriterStrict
+IndexStmtRewriterStrict <|- - IndexNotationRewriterStrict
+
+IndexNotationRewriterStrict <|- - IndexNotationRewriter
+
+
+
+class Lowerer {
+    +std::shared_ptr<LowererImpl> impl;
+}
+abstract class LowererImpl {
+    #class Visitor;
+    #friend class Visitor;
+    #std::shared_ptr<Visitor> visitor;
+
+    #virtual ir::Stmt lower(IndexStmt stmt);
+    #virtual ir::Expr lower(IndexExpr expr);
+
+    #virtual ir::Expr lowerExpr(IndexExpr expr) = 0;
+    #virtual ir::Stmt lowerStmt(IndexStmt stmt) = 0;
+
+    +virtual ir::Stmt lower(IndexStmt stmt, std::string name, 
+                 bool assemble, bool compute, bool pack, bool unpack) = 0;
+}
+
+class LowererImplImperative {
+    -class Visitor
+    -friend class Visitor
+    -std::shared_ptr<Visitor> visitor
+    -bool assemble
+    -bool compute
+    -vars a_bunch_of_other_fields
+
+    #virtual ir::Expr lowerExpr(IndexExpr expr);
+    #virtual ir::Stmt lowerStmt(IndexStmt stmt);
+
+    +ir::Stmt lower(IndexStmt stmt, std::string name, 
+                 bool assemble, bool compute, bool pack, bool unpack)
+
+}
+note bottom of LowererImplImperative : Stmt LowererImplImperative::lower(IndexStmt stmt) {\n  return visitor->lower(stmt);\n}
+
+Uncopyable <|- - LowererImpl
+Lowerer "1" *- - "1" LowererImpl : contains
+
+
+class Visitor {
+    -LowererImpl* impl
+    -Expr expr
+    -Stmt stmt
+
+    -void visit(const AssignmentNode* node)
+    -void visit(const YieldNode* node)
+    -void visit(const ForallNode* node) 
+    -void visit(const WhereNode* node) 
+    -void visit(const MultiNode* node) 
+    -void visit(const SuchThatNode* node) 
+    -void visit(const SequenceNode* node) 
+    -void visit(const AssembleNode* node) 
+    -void visit(const AccessNode* node) 
+    -void visit(const LiteralNode* node) 
+    -void visit(const NegNode* node) 
+    -void visit(const AddNode* node) 
+    -void visit(const SubNode* node) 
+    -void visit(const MulNode* node) 
+    -void visit(const DivNode* node) 
+    -void visit(const SqrtNode* node) 
+    -void visit(const CastNode* node) 
+    -void visit(const CallIntrinsicNode* node) 
+    -void visit(const ReductionNode* node) 
+
+    +Visitor(LowererImplImperative* impl)
+    +Stmt lower(IndexStmt stmt)
+    +Expr lower(IndexExpr expr)
+}
+
+note bottom of Visitor:   Stmt lower(IndexStmt stmt) {\n  this->stmt = Stmt();\n  impl->accessibleIterators.scope();\n  IndexStmtVisitorStrict::visit(stmt);\n  impl->accessibleIterators.unscope();\n  return this->stmt;\n}
+
+IndexNotationVisitorStrict <|- - Visitor
+LowererImpl "1" +- - "1" Visitor : contains
+Visitor "1" *- - "1" LowererImpl : contains
+
+LowererImpl <|- - LowererImplImperative
+LowererImplImperative "1" +- - "1" Visitor : contains
+Visitor "1" *- - "1" LowererImplImperative : contains
+
+@enduml
+
+PlantUML version 1.2021.7(Sun May 23 08:40:07 EDT 2021)
+(GPL source distribution)
+Java Runtime: OpenJDK Runtime Environment
+JVM: OpenJDK 64-Bit Server VM
+Default Encoding: ANSI_X3.4-1968
+Language: en
+Country: US
+--></g></svg>
\ No newline at end of file
diff --git a/src/codegen/codegen.cpp b/src/codegen/codegen.cpp
index f0c09d98a..6ec54a2f8 100644
--- a/src/codegen/codegen.cpp
+++ b/src/codegen/codegen.cpp
@@ -2,6 +2,7 @@
 #include "taco/cuda.h"
 #include "codegen_cuda.h"
 #include "codegen_c.h"
+#include "codegen_ispc.h"
 #include <algorithm>
 #include <unordered_set>
 
@@ -26,6 +27,21 @@ shared_ptr<CodeGen> CodeGen::init_default(std::ostream &dest, OutputKind outputK
   if (should_use_CUDA_codegen()) {
     return make_shared<CodeGen_CUDA>(dest, outputKind);
   }
+  else if (should_use_ISPC_codegen()) {
+    return make_shared<CodeGen_ISPC>(dest, outputKind);
+  }
+  else {
+    return make_shared<CodeGen_C>(dest, outputKind);
+  }
+}
+
+shared_ptr<CodeGen> CodeGen::init_default(std::ostream &dest, std::ostream &dest2, OutputKind outputKind) {
+  if (should_use_CUDA_codegen()) {
+    return make_shared<CodeGen_CUDA>(dest, outputKind);
+  }
+  else if (should_use_ISPC_codegen()) {
+    return make_shared<CodeGen_ISPC>(dest, dest2, outputKind);
+  }
   else {
     return make_shared<CodeGen_C>(dest, outputKind);
   }
@@ -229,6 +245,49 @@ string CodeGen::printTensorProperty(string varname, const GetProperty* op, bool
   return ret.str();
 }
 
+// Builds the parameter declaration text for one unpacked tensor property:
+// qualifier and type, then the name, then "[]" for array properties.
+// `is_output_prop` is not read by this function.
+string CodeGen::getUnpackedTensorArgument(string varname, const GetProperty* op,
+                            bool is_output_prop) {
+  const auto tensorVar = op->tensor.as<Var>();
+  const TensorProperty kind = op->property;
+  stringstream decl;
+
+  // The values array and the values size each have a single fixed spelling.
+  if (kind == TensorProperty::Values) {
+    decl << "uniform " << printType(tensorVar->type, false) << " " << varname << "[]";
+    return decl.str();
+  }
+  if (kind == TensorProperty::ValuesSize) {
+    decl << "int32 " << varname;
+    return decl.str();
+  }
+
+  // A dimension is declared as a scalar; anything else reaching this point
+  // must be an indices property and is declared as an unsized array.
+  const bool isDimension = (kind == TensorProperty::Dimension);
+  if (!isDimension) {
+    taco_iassert(op->property == TensorProperty::Indices);
+  }
+
+  // Integer type follows the property's datatype; only the dimension
+  // fallback is emitted without the `uniform` qualifier.
+  if (op->type == Int32) {
+    decl << "uniform int32 ";
+  } else if (op->type == Int64) {
+    decl << "uniform int64 ";
+  } else {
+    decl << (isDimension ? "int " : "uniform int ");
+  }
+
+  decl << varname;
+  if (!isDimension) {
+    decl << "[]";
+  }
+  return decl.str();
+}
+
 string CodeGen::unpackTensorProperty(string varname, const GetProperty* op,
                             bool is_output_prop) {
   stringstream ret;
@@ -310,13 +369,9 @@ string CodeGen::pointTensorProperty(std::string varname) {
   return ret.str();
 }
 
-// helper to print declarations
-string CodeGen::printDecls(map<Expr, string, ExprCompare> varMap,
-                           vector<Expr> inputs, vector<Expr> outputs) {
-  stringstream ret;
-  unordered_set<string> propsAlreadyGenerated;
-
-  vector<const GetProperty*> sortedProps;
+void CodeGen::getSortedProps(map<Expr, string, ExprCompare> &varMap,
+              vector<const GetProperty*> &sortedProps, vector<Expr> &inputs,
+              vector<Expr> &outputs) {
 
   for (auto const& p: varMap) {
     if (p.first.as<GetProperty>())
@@ -355,6 +410,17 @@ string CodeGen::printDecls(map<Expr, string, ExprCompare> varMap,
          return a->index < b->index;
        });
 
+}
+
+// helper to print declarations
+string CodeGen::printDecls(map<Expr, string, ExprCompare> varMap,
+                           vector<Expr> inputs, vector<Expr> outputs) {
+  stringstream ret;
+  unordered_set<string> propsAlreadyGenerated;
+
+  vector<const GetProperty*> sortedProps;
+  getSortedProps(varMap, sortedProps, inputs, outputs);
+
   for (auto prop: sortedProps) {
     bool isOutputProp = (find(outputs.begin(), outputs.end(),
                               prop->tensor) != outputs.end());
@@ -375,7 +441,6 @@ string CodeGen::printDecls(map<Expr, string, ExprCompare> varMap,
   return ret.str();
 }
 
-
 string CodeGen::printPack(map<tuple<Expr, TensorProperty, int, int>,
         string> outputProperties, vector<Expr> outputs) {
   stringstream ret;
diff --git a/src/codegen/codegen.h b/src/codegen/codegen.h
index cc25c80d6..db891f995 100644
--- a/src/codegen/codegen.h
+++ b/src/codegen/codegen.h
@@ -16,9 +16,13 @@ class CodeGen : public IRPrinter {
   enum CodeGenType { C, CUDA };
 
   CodeGen(std::ostream& stream, CodeGenType type) : IRPrinter(stream), codeGenType(type) {};
-  CodeGen(std::ostream& stream, bool color, bool simplify, CodeGenType type) : IRPrinter(stream, color, simplify), codeGenType(type) {};
+  CodeGen(std::ostream& stream, bool color, bool simplify, CodeGenType type) 
+    : IRPrinter(stream, color, simplify), codeGenType(type) {};
+  CodeGen(std::ostream& stream, std::ostream& stream2, bool color, bool simplify, CodeGenType type) 
+    : IRPrinter(stream, stream2, color, simplify), codeGenType(type) {};
   /// Initialize the default code generator
   static std::shared_ptr<CodeGen> init_default(std::ostream &dest, OutputKind outputKind);
+  static std::shared_ptr<CodeGen> init_default(std::ostream &dest, std::ostream &dest2, OutputKind outputKind);
 
   /// Compile a lowered function
   virtual void compile(Stmt stmt, bool isFirst=false) =0;
@@ -26,6 +30,9 @@ class CodeGen : public IRPrinter {
 protected:
   static bool checkForAlloc(const Function *func);
   static int countYields(const Function *func);
+  void getSortedProps(std::map<Expr, std::string, ExprCompare> &varMap,
+              std::vector<const GetProperty*> &sortedProps, std::vector<Expr> &inputs,
+              std::vector<Expr> &outputs);
 
   static std::string printCType(Datatype type, bool is_ptr);
   static std::string printCUDAType(Datatype type, bool is_ptr);
@@ -52,6 +59,10 @@ class CodeGen : public IRPrinter {
   std::string printFuncName(const Function *func, 
           std::map<Expr, std::string, ExprCompare> inputMap={}, 
           std::map<Expr, std::string, ExprCompare> outputMap={});
+  
+  std::string printTensorProperty(std::string varname, const GetProperty* op, bool is_ptr);
+  std::string getUnpackedTensorArgument(std::string varname, const GetProperty* op,
+                              bool is_output_prop); 
 
   void resetUniqueNameCounters();
   std::string genUniqueName(std::string name);
@@ -61,9 +72,8 @@ class CodeGen : public IRPrinter {
 private:
   virtual std::string restrictKeyword() const { return ""; }
 
-  std::string printTensorProperty(std::string varname, const GetProperty* op, bool is_ptr);
   std::string unpackTensorProperty(std::string varname, const GetProperty* op,
-                              bool is_output_prop);
+                              bool is_output_prop); 
   std::string packTensorProperty(std::string varname, Expr tnsr, TensorProperty property,
                             int mode, int index);
   std::string pointTensorProperty(std::string varname);
diff --git a/src/codegen/codegen_c.cpp b/src/codegen/codegen_c.cpp
index 2ade9d7f6..89a98a20e 100644
--- a/src/codegen/codegen_c.cpp
+++ b/src/codegen/codegen_c.cpp
@@ -240,7 +240,10 @@ class CodeGen_C::FindVars : public IRVisitor {
 };
 
 CodeGen_C::CodeGen_C(std::ostream &dest, OutputKind outputKind, bool simplify)
-    : CodeGen(dest, false, simplify, C), out(dest), outputKind(outputKind) {}
+    : CodeGen(dest, false, simplify, C), out(dest), out2(dest), outputKind(outputKind) {}
+// Two-stream variant: dest backs out, dest2 backs out2, and both are handed to CodeGen.
+CodeGen_C::CodeGen_C(std::ostream &dest, std::ostream &dest2, OutputKind outputKind, bool simplify)
+    : CodeGen(dest, dest2, false, simplify, C), out(dest), out2(dest2), outputKind(outputKind) {}
 
 CodeGen_C::~CodeGen_C() {}
 
diff --git a/src/codegen/codegen_c.h b/src/codegen/codegen_c.h
index 55c9d01a8..471f3658a 100644
--- a/src/codegen/codegen_c.h
+++ b/src/codegen/codegen_c.h
@@ -16,6 +16,7 @@ class CodeGen_C : public CodeGen {
   /// Initialize a code generator that generates code to an
   /// output stream.
   CodeGen_C(std::ostream &dest, OutputKind outputKind, bool simplify=true);
+  CodeGen_C(std::ostream &dest, std::ostream &dest2, OutputKind outputKind, bool simplify=true);
   ~CodeGen_C();
 
   /// Compile a lowered function
@@ -28,23 +29,24 @@ class CodeGen_C : public CodeGen {
 protected:
   using IRPrinter::visit;
 
-  void visit(const Function*);
-  void visit(const VarDecl*);
-  void visit(const Yield*);
-  void visit(const Var*);
-  void visit(const For*);
-  void visit(const While*);
-  void visit(const GetProperty*);
-  void visit(const Min*);
-  void visit(const Max*);
-  void visit(const Allocate*);
-  void visit(const Sqrt*);
-  void visit(const Store*);
-  void visit(const Assign*);
+  virtual void visit(const Function*);
+  virtual void visit(const VarDecl*);
+  virtual void visit(const Yield*);
+  virtual void visit(const Var*);
+  virtual void visit(const For*);
+  virtual void visit(const While*);
+  virtual void visit(const GetProperty*);
+  virtual void visit(const Min*);
+  virtual void visit(const Max*);
+  virtual void visit(const Allocate*);
+  virtual void visit(const Sqrt*);
+  virtual void visit(const Store*);
+  virtual void visit(const Assign*);
 
   std::map<Expr, std::string, ExprCompare> varMap;
   std::vector<Expr> localVars;
   std::ostream &out;
+  std::ostream &out2;
   
   OutputKind outputKind;
 
diff --git a/src/codegen/codegen_cuda.cpp b/src/codegen/codegen_cuda.cpp
index 77cf0cd88..14505f740 100644
--- a/src/codegen/codegen_cuda.cpp
+++ b/src/codegen/codegen_cuda.cpp
@@ -646,6 +646,7 @@ void CodeGen_CUDA::printDeviceFunctions(const Function* func) {
   // Collect device functions
   resetUniqueNameCounters();
   deviceFunctionLoopDepth = 0;
+  // Walk the function body with a DeviceFunctionCollector to gather its blockFors
   DeviceFunctionCollector deviceFunctionCollector(func->inputs, func->outputs, this);
   func->body.accept(&deviceFunctionCollector);
   deviceFunctions = deviceFunctionCollector.blockFors;
diff --git a/src/codegen/codegen_ispc.cpp b/src/codegen/codegen_ispc.cpp
new file mode 100644
index 000000000..d4f428ccf
--- /dev/null
+++ b/src/codegen/codegen_ispc.cpp
@@ -0,0 +1,1097 @@
+#include <iostream>
+#include <fstream>
+#include <dlfcn.h>
+#include <algorithm>
+#include <unordered_set>
+#include <taco.h>
+
+#include "taco/cuda.h"
+#include "taco/ir/ir_printer.h"
+#include "taco/ir/ir_visitor.h"
+#include "taco/ir/ir_rewriter.h"
+#include "taco/ir/simplify.h"
+
+#include "codegen_c.h"
+#include "codegen_ispc.h"
+#include "taco/error.h"
+#include "taco/util/strings.h"
+#include "taco/util/collections.h"
+
+using namespace std;
+
+namespace taco {
+namespace ir {
+
+// Some helper functions
+namespace {
+
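+// C preamble that compile() writes to the host stream when isFirst is set. It
+// includes stdio.h (printf), stdlib.h (malloc/realloc) and math.h (sqrt), defines
+// the TACO_MIN/TACO_MAX/TACO_DEREF macros, and declares taco_tensor_t together
+// with its init/deinit and binary-search helpers.
+// This *must* be kept in sync with taco_tensor_t.h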
+const string cHeaders =
+  "#ifndef TACO_C_HEADERS\n"
+  "#define TACO_C_HEADERS\n"
+  "#include <stdio.h>\n"
+  "#include <stdlib.h>\n"
+  "#include <stdint.h>\n"
+  "#include <stdbool.h>\n"
+  "#include <math.h>\n"
+  "#include <complex.h>\n"
+  "#include <string.h>\n"
+  "#if _OPENMP\n"
+  "#include <omp.h>\n"
+  "#endif\n"
+  "#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b))\n"
+  "#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b))\n"
+  "#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a)\n"
+  "#ifndef TACO_TENSOR_T_DEFINED\n"
+  "#define TACO_TENSOR_T_DEFINED\n"
+  "typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t;\n"
+  "typedef struct {\n"
+  "  int32_t      order;         // tensor order (number of modes)\n"
+  "  int32_t*     dimensions;    // tensor dimensions\n"
+  "  int32_t      csize;         // component size\n"
+  "  int32_t*     mode_ordering; // mode storage ordering\n"
+  "  taco_mode_t* mode_types;    // mode storage types\n"
+  "  uint8_t***   indices;       // tensor index data (per mode)\n"
+  "  uint8_t*     vals;          // tensor values\n"
+  "  int32_t      vals_size;     // values array size\n"
+  "} taco_tensor_t;\n"
+  "#endif\n"
+  "#if !_OPENMP\n"
+  "int omp_get_thread_num() { return 0; }\n"
+  "int omp_get_max_threads() { return 1; }\n"
+  "#endif\n"
+  "int cmp(const void *a, const void *b) {\n"
+  "  return *((const int*)a) - *((const int*)b);\n"
+  "}\n"
+  "int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) {\n"
+  "  if (array[arrayStart] >= target) {\n"
+  "    return arrayStart;\n"
+  "  }\n"
+  "  int lowerBound = arrayStart; // always < target\n"
+  "  int upperBound = arrayEnd; // always >= target\n"
+  "  while (upperBound - lowerBound > 1) {\n"
+  "    int mid = (upperBound + lowerBound) / 2;\n"
+  "    int midValue = array[mid];\n"
+  "    if (midValue < target) {\n"
+  "      lowerBound = mid;\n"
+  "    }\n"
+  "    else if (midValue > target) {\n"
+  "      upperBound = mid;\n"
+  "    }\n"
+  "    else {\n"
+  "      return mid;\n"
+  "    }\n"
+  "  }\n"
+  "  return upperBound;\n"
+  "}\n"
+  "int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) {\n"
+  "  if (array[arrayEnd] <= target) {\n"
+  "    return arrayEnd;\n"
+  "  }\n"
+  "  int lowerBound = arrayStart; // always <= target\n"
+  "  int upperBound = arrayEnd; // always > target\n"
+  "  while (upperBound - lowerBound > 1) {\n"
+  "    int mid = (upperBound + lowerBound) / 2;\n"
+  "    int midValue = array[mid];\n"
+  "    if (midValue < target) {\n"
+  "      lowerBound = mid;\n"
+  "    }\n"
+  "    else if (midValue > target) {\n"
+  "      upperBound = mid;\n"
+  "    }\n"
+  "    else {\n"
+  "      return mid;\n"
+  "    }\n"
+  "  }\n"
+  "  return lowerBound;\n"
+  "}\n"
+  "taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize,\n"
+  "                                  int32_t* dimensions, int32_t* mode_ordering,\n"
+  "                                  taco_mode_t* mode_types) {\n"
+  "  taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t));\n"
+  "  t->order         = order;\n"
+  "  t->dimensions    = (int32_t *) malloc(order * sizeof(int32_t));\n"
+  "  t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t));\n"
+  "  t->mode_types    = (taco_mode_t *) malloc(order * sizeof(taco_mode_t));\n"
+  "  t->indices       = (uint8_t ***) malloc(order * sizeof(uint8_t***));\n"
+  "  t->csize         = csize;\n"
+  "  for (int32_t i = 0; i < order; i++) {\n"
+  "    t->dimensions[i]    = dimensions[i];\n"
+  "    t->mode_ordering[i] = mode_ordering[i];\n"
+  "    t->mode_types[i]    = mode_types[i];\n"
+  "    switch (t->mode_types[i]) {\n"
+  "      case taco_mode_dense:\n"
+  "        t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **));\n"
+  "        break;\n"
+  "      case taco_mode_sparse:\n"
+  "        t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **));\n"
+  "        break;\n"
+  "    }\n"
+  "  }\n"
+  "  return t;\n"
+  "}\n"
+  "void deinit_taco_tensor_t(taco_tensor_t* t) {\n"
+  "  for (int i = 0; i < t->order; i++) {\n"
+  "    free(t->indices[i]);\n"
+  "  }\n"
+  "  free(t->indices);\n"
+  "  free(t->dimensions);\n"
+  "  free(t->mode_ordering);\n"
+  "  free(t->mode_types);\n"
+  "  free(t);\n"
+  "}\n"
+  "#endif\n";
+
+const string ispcHeaders = 
+  "#define __TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b))\n"
+  "#define __TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b))\n"
+  "#define __TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a)\n"
+  "int __cmp(const void *a, const void *b) {\n"
+  "  return *((const int*)a) - *((const int*)b);\n"
+  "}\n"
+  "int __taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) {\n"
+  "  if (array[arrayStart] >= target) {\n"
+  "    return arrayStart;\n"
+  "  }\n"
+  "  int lowerBound = arrayStart; // always < target\n"
+  "  int upperBound = arrayEnd; // always >= target\n"
+  "  while (upperBound - lowerBound > 1) {\n"
+  "    int mid = (upperBound + lowerBound) / 2;\n"
+  "    int midValue = array[mid];\n"
+  "    if (midValue < target) {\n"
+  "      lowerBound = mid;\n"
+  "    }\n"
+  "    else if (midValue > target) {\n"
+  "      upperBound = mid;\n"
+  "    }\n"
+  "    else {\n"
+  "      return mid;\n"
+  "    }\n"
+  "  }\n"
+  "  return upperBound;\n"
+  "}\n"
+  "int __taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) {\n"
+  "  if (array[arrayEnd] <= target) {\n"
+  "    return arrayEnd;\n"
+  "  }\n"
+  "  int lowerBound = arrayStart; // always <= target\n"
+  "  int upperBound = arrayEnd; // always > target\n"
+  "  while (upperBound - lowerBound > 1) {\n"
+  "    int mid = (upperBound + lowerBound) / 2;\n"
+  "    int midValue = array[mid];\n"
+  "    if (midValue < target) {\n"
+  "      lowerBound = mid;\n"
+  "    }\n"
+  "    else if (midValue > target) {\n"
+  "      upperBound = mid;\n"
+  "    }\n"
+  "    else {\n"
+  "      return mid;\n"
+  "    }\n"
+  "  }\n"
+  "  return lowerBound;\n"
+  "}\n\n\n";
+
+} // anonymous namespace
+
+
+
+// Visitor that walks a function body and picks the printed name of every Var and
+// tensor GetProperty; each distinct property of an input/output gets one name.
+class CodeGen_ISPC::FindVars : public IRVisitor {
+public:
+  map<Expr, string, ExprCompare> varMap;  // Var / GetProperty -> printed name
+
+  // the variables for which we need to add declarations
+  map<Expr, string, ExprCompare> varDecls;
+
+  vector<Expr> localVars;  // vars bound by a VarDecl or a For, in first-seen order
+
+  // this maps from tensor, property, mode, index to the unique var
+  map<tuple<Expr, TensorProperty, int, int>, string> canonicalPropertyVar;
+
+  // this is for convenience, recording just the properties unpacked
+  // from the output tensor so we can re-save them at the end
+  map<tuple<Expr, TensorProperty, int, int>, string> outputProperties;
+
+  // TODO: should replace this with an unordered set
+  vector<Expr> outputTensors;
+  vector<Expr> inputTensors;
+
+  CodeGen_ISPC *codeGen;  // used for genUniqueName()
+
+  // registers every input and output Var in varMap under its own name
+  FindVars(vector<Expr> inputs, vector<Expr> outputs, CodeGen_ISPC *codeGen)
+  : codeGen(codeGen) {
+    for (auto v: inputs) {
+      auto var = v.as<Var>();
+      taco_iassert(var) << "Inputs must be vars in codegen";
+      taco_iassert(varMap.count(var)==0) << "Duplicate input found in codegen";
+      inputTensors.push_back(v);
+      varMap[var] = var->name;
+    }
+    for (auto v: outputs) {
+      auto var = v.as<Var>();
+      taco_iassert(var) << "Outputs must be vars in codegen";
+      taco_iassert(varMap.count(var)==0) << "Duplicate output found in codegen";
+      outputTensors.push_back(v);
+      varMap[var] = var->name;
+    }
+  }
+
+protected:
+  using IRVisitor::visit;
+
+  virtual void visit(const Var *op) {
+    if (varMap.count(op) == 0) {
+      varMap[op] = op->is_ptr? op->name : codeGen->genUniqueName(op->name);  // pointer vars keep their own name
+    }
+  }
+
+  virtual void visit(const VarDecl *op) {
+    if (!util::contains(localVars, op->var)) {
+      localVars.push_back(op->var);
+    }
+    op->var.accept(this);
+    op->rhs.accept(this);
+  }
+
+  virtual void visit(const For *op) {
+    if (!util::contains(localVars, op->var)) {
+      localVars.push_back(op->var);  // the loop variable counts as a local
+    }
+    op->var.accept(this);
+    op->start.accept(this);
+    op->end.accept(this);
+    op->increment.accept(this);
+    op->contents.accept(this);
+  }
+
+  virtual void visit(const GetProperty *op) {
+    if (!util::contains(inputTensors, op->tensor) &&
+        !util::contains(outputTensors, op->tensor)) {
+      // Don't create header unpacking code for temporaries
+      return;
+    }
+
+    if (varMap.count(op) == 0) {
+      auto key =
+              tuple<Expr,TensorProperty,int,int>(op->tensor,op->property,
+                                                 (size_t)op->mode,
+                                                 (size_t)op->index);
+      if (canonicalPropertyVar.count(key) > 0) {
+        varMap[op] = canonicalPropertyVar[key];  // same property reached through another node
+      } else {
+        auto unique_name = codeGen->genUniqueName(op->name);
+        canonicalPropertyVar[key] = unique_name;
+        varMap[op] = unique_name;
+        varDecls[op] = unique_name;  // only the first node of a property is declared
+        if (util::contains(outputTensors, op->tensor)) {
+          outputProperties[key] = unique_name;
+        }
+      }
+    }
+  }
+};
+
+
+// Walks a function body and collects the For loops tagged ParallelUnit::CPUSpmd
+// (threadFors) and the LoopKind::Init loops (initFors). It also records which
+// already-scoped variables and tensors are referenced once a CPUSpmd loop is seen.
+class CodeGen_ISPC::FunctionCollector : public IRVisitor {
+public:
+  vector<Stmt> threadFors; // the CPUSpmd For loops themselves
+  vector<Stmt> initFors;  // for loops to initialize statements
+  map<Expr, string, ExprCompare> scopeMap;
+
+  // the variables to pass to each device function
+  vector<vector<pair<string, Expr>>> functionParameters;
+  vector<pair<string, Expr>> currentParameters; // keep as vector so code generation is deterministic
+  set<Expr> currentParameterSet;
+
+  set<Expr> variablesDeclaredInKernel;
+
+  vector<pair<string, Expr>> threadIDVars;
+  vector<pair<string, Expr>> blockIDVars;
+  vector<pair<string, Expr>> warpIDVars;
+  vector<Expr> numThreads;
+  vector<Expr> numWarps;
+
+  CodeGen_ISPC *codeGen;
+  // registers every input and output Var in scopeMap under its own name
+  FunctionCollector(vector<Expr> inputs, vector<Expr> outputs, CodeGen_ISPC *codeGen) : codeGen(codeGen)  {
+    inDeviceFunction = false;
+    for (auto v: inputs) {
+      auto var = v.as<Var>();
+      taco_iassert(var) << "Inputs must be vars in codegen";
+      taco_iassert(scopeMap.count(var) == 0) <<
+                                             "Duplicate input found in codegen";
+      scopeMap[var] = var->name;
+    }
+    for (auto v: outputs) {
+      auto var = v.as<Var>();
+      taco_iassert(var) << "Outputs must be vars in codegen";
+      taco_iassert(scopeMap.count(var) == 0) <<
+                                             "Duplicate output found in codegen";
+
+      scopeMap[var] = var->name;
+    }
+  }
+
+protected:
+  bool inDeviceFunction;
+  using IRVisitor::visit;
+
+  virtual void visit(const For *op) {
+    if (op->parallel_unit == ParallelUnit::CPUSpmd) {
+      // Visit the loop variable with inDeviceFunction off, so it is only added
+      // to scopeMap and never to currentParameters.
+      inDeviceFunction = false;
+      op->var.accept(this);
+      inDeviceFunction = true;
+
+      threadFors.push_back(op);
+      // remember the loop variable and the trip count (end - start) / increment
+      threadIDVars.push_back(pair<string, Expr>(scopeMap[op->var], op->var));
+      Expr blockSize = ir::simplify(ir::Div::make(ir::Sub::make(op->end, op->start), op->increment));
+      numThreads.push_back(blockSize);
+      // NOTE(review): inDeviceFunction is never set back to false after this loop - confirm intended
+    }
+    else if (op->parallel_unit == ParallelUnit::CPUSimd) {
+      // nothing is recorded for CPUSimd loops; note the loop variable is not visited
+    }
+    else if (op->kind == LoopKind::Init) {
+      // Init loops are collected separately; the loop variable is not visited
+      initFors.push_back(op);
+    }
+    else{
+      op->var.accept(this);
+    }
+    op->start.accept(this);
+    op->end.accept(this);
+    op->increment.accept(this);
+    op->contents.accept(this);
+  }
+
+  virtual void visit(const Var *op) {
+    if (scopeMap.count(op) == 0) {
+      string name = codeGen->genUniqueName(op->name);  // a name is consumed even if it is not stored
+      if (!inDeviceFunction) {
+        scopeMap[op] = name;
+      }
+    }
+    else if (scopeMap.count(op) == 1 && inDeviceFunction && currentParameterSet.count(op) == 0
+            && (threadIDVars.empty() || op != threadIDVars.back().second)
+            && !variablesDeclaredInKernel.count(op)) {
+      currentParameters.push_back(pair<string, Expr>(scopeMap[op], op));
+      currentParameterSet.insert(op);
+    }
+  }
+
+  virtual void visit(const VarDecl *op) {
+    if (inDeviceFunction) {
+      variablesDeclaredInKernel.insert(op->var);
+    }
+    op->var.accept(this);
+    op->rhs.accept(this);
+  }
+
+  virtual void visit(const GetProperty *op) {
+    if (scopeMap.count(op->tensor) == 0 && !inDeviceFunction) {
+      // First sighting of this tensor while inDeviceFunction is off: reserve a
+      // unique name for it, derived from the property's name. scopeMap is keyed
+      // by the tensor, so further properties of the same tensor do not create
+      // another entry.
+      auto unique_name = codeGen->genUniqueName(op->name);
+      scopeMap[op->tensor] = unique_name;
+    }
+    else if (scopeMap.count(op->tensor) == 1 && inDeviceFunction && currentParameterSet.count(op->tensor) == 0) {
+      currentParameters.push_back(pair<string, Expr>(op->tensor.as<Var>()->name, op->tensor));
+      currentParameterSet.insert(op->tensor);
+    }
+  }
+};
+
+
+CodeGen_ISPC::CodeGen_ISPC(std::ostream &dest, OutputKind outputKind, bool simplify)
+    : CodeGen_C(dest, dest, outputKind, simplify) {}  // dest is passed as both streams
+
+CodeGen_ISPC::CodeGen_ISPC(std::ostream &dest, std::ostream &dest2, OutputKind outputKind, bool simplify)
+    : CodeGen_C(dest, dest2, outputKind, simplify) {}  // dest2 is passed as the second stream
+
+CodeGen_ISPC::~CodeGen_ISPC() {}
+
+// Compile a lowered statement. On the first call the C preamble goes to out,
+// and the ISPC preamble goes to out2 when that is a distinct stream.
+void CodeGen_ISPC::compile(Stmt stmt, bool isFirst) {
+  varMap = {};
+  localVars = {};
+  if (isFirst) {
+    out << cHeaders;
+
+    // skip the ISPC preamble when out and out2 are the same stream object
+    if (&out != &out2) {
+      out2 << ispcHeaders;
+    }
+  }
+  out << endl;
+  // generate code for the Stmt (no progress message is printed to stdout)
+  stmt.accept(this);
+}
+
+
+
+// Builds the statement that calls a generated ISPC function: two spaces of
+// indentation, then "__<funcName>(" followed by the names varMap holds for
+// sortedProps (comma separated, in the given order) and a closing ");\n".
+// Returns the statement as a string; nothing is written to a stream here.
+string CodeGen_ISPC::printCallISPCFunc(const std::string& funcName, map<Expr, string, ExprCompare> varMap,
+                                  vector<const GetProperty*> &sortedProps) {
+  std::stringstream ret;
+  ret << "  __" << funcName << "(";
+
+  // varMap is a by-value copy, so operator[] adding a missing key stays local;
+  // a missing key simply prints as an empty argument.
+  const char *separator = "";
+  for (const GetProperty *prop : sortedProps) {
+    ret << separator << varMap[prop];
+    separator = ", ";
+  }
+
+  ret << ");\n";
+  return ret.str();
+}
+
+// Writes the ISPC side of a kernel to out2: one task function per CPUSpmd loop
+// collected from the body, then the exported "__<name>" entry point. Both use the
+// parameter list assembled here from sortedProps, with names taken from varMap.
+void CodeGen_ISPC::printISPCFunc(const Function *func, map<Expr, string, ExprCompare> varMap,
+                                  vector<const GetProperty*> &sortedProps) {
+  FunctionCollector functionCollector(func->inputs, func->outputs, this);
+  func->body.accept(&functionCollector);
+
+  // funcVariables is not local to this call, so empty it first; otherwise a
+  // second kernel would be printed with the previous kernel's parameters as well.
+  funcVariables.str("");
+  vector<Expr> outputs = func->outputs;
+
+  for (unsigned long i=0; i < sortedProps.size(); i++) {
+    auto prop = sortedProps[i];
+    bool isOutputProp = (find(outputs.begin(), outputs.end(),
+                              prop->tensor) != outputs.end());
+    
+    auto var = prop->tensor.as<Var>();
+    if (var->is_parameter) {
+      if (isOutputProp) {
+        funcVariables << "  " << printTensorProperty(varMap[prop], prop, false) << ";" << endl;
+      } else {
+        break; 
+      }
+    } else {
+      funcVariables << getUnpackedTensorArgument(varMap[prop], prop, isOutputProp);
+    }
+
+    // ", " between parameters, and a line break after every even index
+    if (i!=sortedProps.size()-1) {
+      funcVariables << ", ";
+    }
+    if (i%2==0) {
+      funcVariables << "\n\t";
+    }
+  }
+
+  resetUniqueNameCounters();
+
+  // threadFors code generation
+  for (size_t i = 0; i < functionCollector.threadFors.size(); i++) {
+    const For *threadloop = to<For>(functionCollector.threadFors[i]);
+    taco_iassert(threadloop->parallel_unit == ParallelUnit::CPUSpmd);
+
+    // NOTE(review): every task function is named "__<name>__" - confirm one CPUSpmd loop at most
+    out2 << "\nstatic task void __" << func->name << "__ (";
+    out2 << funcVariables.str();
+    out2 << "\n) {\n\n";
+
+    indent++;
+    // output body of the threadloop
+    taskCode = true;
+    print(threadloop);
+    indent--;
+    out2 << "}\n\n";  
+
+  }
+
+  taskCode = false;
+  out2 << "export void __" << func->name << " (";
+  out2 << funcVariables.str();
+  out2 << "\n) {\n\n";
+
+  indent++;
+  // output body
+  print(func->body);
+  indent--;
+  out2 << "}\n";
+  
+}
+
+void CodeGen_ISPC::sendToStream(std::stringstream &stream) {
+  // buffered text goes to out2 only while the ISPC code stream is switched on
+  if (!is_ISPC_code_stream_enabled()) {
+    CodeGen_C::sendToStream(stream);
+    return;
+  }
+  this->out2 << stream.str();
+}
+
+void CodeGen_ISPC::visit(const Function* func) {
+  set_ISPC_code_stream_enabled(false);
+  // A function named "assemble" is printed whole to out with the ISPC stream off.
+  // Any other function gets a C wrapper in out and its ISPC implementation in out2.
+  if (func->name == "assemble") {
+    if (outputKind == HeaderGen) {  // in a header, guard the declaration
+      out << "#ifndef TACO_GENERATED_" << func->name << "\n";
+      out << "#define TACO_GENERATED_" << func->name << "\n";
+    }
+
+    int numYields = countYields(func);
+    emittingCoroutine = (numYields > 0);
+    funcName = func->name;
+    labelCount = 0;
+
+    resetUniqueNameCounters();
+    FindVars inputVarFinder(func->inputs, {}, this);
+    func->body.accept(&inputVarFinder);
+    FindVars outputVarFinder({}, func->outputs, this);
+    func->body.accept(&outputVarFinder);
+
+    // output function declaration
+    doIndent();
+    out << printFuncName(func, inputVarFinder.varDecls, outputVarFinder.varDecls);
+
+    // if we're just generating a header, this is all we need to do
+    if (outputKind == HeaderGen) {
+      out << ";\n";
+      out << "#endif\n";
+      return;
+    }
+
+    out << " {\n";
+
+    indent++;
+
+    // find all the vars that are not inputs or outputs and declare them
+    resetUniqueNameCounters();
+    FindVars varFinder(func->inputs, func->outputs, this);
+    func->body.accept(&varFinder);
+    varMap = varFinder.varMap;
+    localVars = varFinder.localVars;
+
+    // Print variable declarations
+    out << printDecls(varFinder.varDecls, func->inputs, func->outputs) << endl;
+
+    if (emittingCoroutine) {
+      out << printContextDeclAndInit(varMap, localVars, numYields, func->name)
+          << endl;
+    }
+
+    // output body
+    print(func->body);
+
+    // output repack only if we allocated memory
+    if (checkForAlloc(func))
+      out << endl << printPack(varFinder.outputProperties, func->outputs);
+
+    if (emittingCoroutine) {
+      out << printCoroutineFinish(numYields, funcName);
+    }
+
+    doIndent();
+    out << "return 0;\n";
+    indent--;
+
+    doIndent();
+    out << "}\n";
+    return;
+
+  }
+
+  // Every other function: C wrapper first, ISPC implementation afterwards.
+  if (outputKind == HeaderGen) {
+    out << "#ifndef TACO_GENERATED_" << func->name << "\n";
+    out << "#define TACO_GENERATED_" << func->name << "\n";
+  }
+
+  int numYields = countYields(func);
+  emittingCoroutine = (numYields > 0);
+  funcName = func->name;
+  labelCount = 0;
+
+  resetUniqueNameCounters();
+  FindVars inputVarFinder(func->inputs, {}, this);
+  func->body.accept(&inputVarFinder);
+  FindVars outputVarFinder({}, func->outputs, this);
+  func->body.accept(&outputVarFinder);
+
+  // output function declaration
+  doIndent();
+  out << printFuncName(func, inputVarFinder.varDecls, outputVarFinder.varDecls);
+
+  // if we're just generating a header, this is all we need to do
+  if (outputKind == HeaderGen) {
+    out << ";\n";
+    out << "#endif\n";
+    return;
+  }
+
+  out << " {\n";
+
+  indent++;
+
+  // find all the vars that are not inputs or outputs and declare them
+  resetUniqueNameCounters();
+  FindVars varFinder(func->inputs, func->outputs, this);
+  func->body.accept(&varFinder);
+  varMap = varFinder.varMap;
+  localVars = varFinder.localVars;
+
+  // Print variable declarations
+  out << printDecls(varFinder.varDecls, func->inputs, func->outputs) << endl;
+
+  sortedProps = {};  // not a local: visit(const For*) reads it when printing launch statements
+  vector<Expr> inputs = func->inputs;
+  vector<Expr> outputs = func->outputs;
+  getSortedProps(varFinder.varDecls, sortedProps, inputs, outputs);
+  out << printCallISPCFunc(func->name, varFinder.varDecls, sortedProps);  // the wrapper body is this one call
+
+  if (emittingCoroutine) {
+    out << printContextDeclAndInit(varMap, localVars, numYields, func->name)
+        << endl;
+  }
+
+  // output repack only if we allocated memory
+  if (checkForAlloc(func))
+    out << endl << printPack(varFinder.outputProperties, func->outputs);
+
+  if (emittingCoroutine) {
+    out << printCoroutineFinish(numYields, funcName);
+  }
+
+  doIndent();
+  out << "return 0;\n";
+  indent--;
+
+  doIndent();
+  out << "}\n\n";
+  // print the ISPC implementation with the ISPC stream switched on, then switch it off again
+  set_ISPC_code_stream_enabled(true);
+  printISPCFunc(func, varFinder.varDecls, sortedProps);
+  set_ISPC_code_stream_enabled(false);
+
+}
+
+// Prints a variable declaration. With the ISPC stream off the C printer handles
+// it. With the stream on, a coroutine body gets a bare "<var> = <rhs>;" assignment
+// written to stream2, and anything else uses IRPrinter's own VarDecl printing.
+void CodeGen_ISPC::visit(const VarDecl* op) {
+  if (!is_ISPC_code_stream_enabled()) {
+    CodeGen_C::visit(op);
+    return;
+  }
+  if (!emittingCoroutine) {
+    IRPrinter::visit(op);
+    return;
+  }
+
+  doIndent();
+  op->var.accept(this);
+  parentPrecedence = Precedence::TOP;
+  stream2 << " = ";
+  op->rhs.accept(this);
+  stream2 << ";";
+  stream2 << endl;
+}
+
+void CodeGen_ISPC::visit(const Yield* op) {
+  printYield(op, localVars, varMap, labelCount, funcName);  // hands the node and the current function's state to printYield
+}
+
+// Prints a Var under the name generated for it in varMap, since Vars are matched
+// by reference (not name). With the ISPC stream off the C printer handles the
+// node; otherwise the name goes to out2.
+void CodeGen_ISPC::visit(const Var* op) {
+  if (!is_ISPC_code_stream_enabled()) {
+    CodeGen_C::visit(op);
+    return;
+  }
+
+  taco_iassert(varMap.count(op) > 0) <<
+      "Var " << op->name << " not found in varMap";
+
+  // The name is written bare: no TACO_DEREF(...) wrapper is added around it,
+  // even when a coroutine is being emitted.
+  const string &printedName = varMap[op];
+  out2 << printedName;
+}
+
+
+// Returns the clang loop pragma text for a vectorized loop: interleaving is always
+// enabled; a zero width asks for vectorize(enable), otherwise vectorize_width(width).
+static string genVectorizePragma(int width) {
+  const string prefix = "#pragma clang loop interleave(enable) ";
+
+  if (width == 0) {
+    return prefix + "vectorize(enable)";
+  }
+  return prefix + "vectorize_width(" + std::to_string(width) + ")";
+}
+
+// static string getParallelizePragma(LoopKind kind) {
+//   stringstream ret;
+//   ret << "#pragma omp parallel for schedule";
+//   switch (kind) {
+//     case LoopKind::Static:
+//       ret << "(static, 1)";
+//       break;
+//     case LoopKind::Dynamic:
+//       ret << "(dynamic, 1)";
+//       break;
+//     case LoopKind::Runtime:
+//       ret << "(runtime)";
+//       break;
+//     case LoopKind::Static_Chunked:
+//       ret << "(static)";
+//       break;
+//     default:
+//       break;
+//   }
+//   return ret.str();
+// }
+
+// static string getUnrollPragma(size_t unrollFactor) {
+//   return "#pragma unroll " + std::to_string(unrollFactor);
+// }
+
+static string getAtomicPragma() {
+  return "#pragma omp atomic";  // fixed OpenMP atomic directive text, no trailing newline
+}
+
+// Prints a For loop. With the ISPC stream off this defers to CodeGen_C. Otherwise
+// a Mul_Thread loop becomes either a launch statement (outside task code) or a
+// per-task chunked for loop (inside task code), a Foreach loop becomes an ISPC
+// foreach, and every other loop becomes a plain counted for loop.
+// (taskIndex and taskCount in the emitted text are ISPC's built-in task variables.)
+void CodeGen_ISPC::visit(const For* op) {
+  if (!is_ISPC_code_stream_enabled()) {
+    CodeGen_C::visit(op);
+    return;
+  }
+  doIndent();
+  // NOTE(review): the task count in "launch[4]" below is hard-coded - confirm intended
+  if (op->kind == LoopKind::Mul_Thread) {
+    if (!taskCode) {
+      out2 << "launch[4] " << printCallISPCFunc(funcName+"__", varMap, sortedProps) << "\n";
+      return;
+    }
+    stream2 << "uniform unsigned int chunk_size = (";  // chunk_size = (end - start) / taskCount
+    op->end.accept(this);
+    stream2 << " - ";
+    op->start.accept(this);
+    stream2 << ") / taskCount;\n";
+    stream2 << "  uniform unsigned int modulo = (";  // modulo = (end - start) % taskCount
+    op->end.accept(this);
+    stream2 << " - ";
+    op->start.accept(this);
+    stream2 << ") % taskCount;\n";
+
+    stream2 << "  uniform unsigned int start = ";
+    op->start.accept(this);
+    stream2 << " + chunk_size * taskIndex;\n";
+
+    stream2 << "  if (taskIndex != 0) {\n";  // tasks after the first are shifted past the remainder
+    stream2 << "    start += modulo;\n";
+    stream2 << "  }\n";
+    
+    stream2 << "  uniform unsigned int end = start + chunk_size;\n";
+    stream2 << "  if (taskIndex == 0) {\n";  // the first task also takes the remainder iterations
+    stream2 << "    end += modulo;\n";
+    stream2 << "  }\n\n";
+        
+    stream2 << keywordString("  for") << " (";
+    if (!emittingCoroutine) {
+      if (op->var.type() == Int32) {
+          stream2 << "int32 ";
+      }
+      else if (op->var.type() == Int64) {
+          stream2 << "int64 ";
+      }
+      
+    }
+    op->var.accept(this);
+    stream2 << " = ";
+    stream2 << "start";  // the per-task bound computed above replaces op->start
+    // op->start.accept(this);
+    stream2 << keywordString("; ");
+    op->var.accept(this);
+    stream2 << " < ";
+    parentPrecedence = BOTTOM;
+    stream2 << "end";  // the per-task bound computed above replaces op->end
+    // op->end.accept(this);
+    stream2 << keywordString("; ");
+    op->var.accept(this);
+
+    auto lit = op->increment.as<Literal>();
+    if (lit != nullptr && ((lit->type.isInt()  && lit->equalsScalar(1)) ||
+                          (lit->type.isUInt() && lit->equalsScalar(1)))) {
+      stream2 << "++";
+    }
+    else {
+      stream2 << " += ";
+      op->increment.accept(this);
+    }
+
+  }
+
+  else if (op->kind == LoopKind::Foreach) {
+    stream2 << keywordString("foreach") << " (";  // header form: foreach (<var> = <start> ... <end>)
+
+    op->var.accept(this);
+    stream2 << " = ";
+    op->start.accept(this);
+    stream2 << keywordString(" ... ");
+    op->end.accept(this);
+
+  } else {
+    stream2 << keywordString("for") << " (";
+    if (!emittingCoroutine) {
+      if (op->var.type() == Int32) {
+          stream2 << "int32 ";
+      }
+      else if (op->var.type() == Int64) {
+          stream2 << "int64 ";
+      }
+      
+    }
+    op->var.accept(this);
+    stream2 << " = ";
+    op->start.accept(this);
+    stream2 << keywordString("; ");
+    op->var.accept(this);
+    stream2 << " < ";
+    parentPrecedence = BOTTOM;
+    op->end.accept(this);
+    stream2 << keywordString("; ");
+    op->var.accept(this);
+
+    auto lit = op->increment.as<Literal>();
+    if (lit != nullptr && ((lit->type.isInt()  && lit->equalsScalar(1)) ||
+                          (lit->type.isUInt() && lit->equalsScalar(1)))) {
+      stream2 << "++";  // a literal increment of 1 is printed as ++
+    }
+    else {
+      stream2 << " += ";
+      op->increment.accept(this);
+    }
+    
+  }
+
+  stream2 << ") {\n";  // all three loop forms share the closing of the header and the body
+  op->contents.accept(this);
+  doIndent();
+  stream2 << "}";
+  stream2 << endl;
+
+}
+
+// Prints a While loop by handing it to the C code generator; this backend has
+// no ISPC-specific form for it.
+//
+// No vectorization pragma is printed here. This override used to print
+// genVectorizePragma(op->vec_width) for Vectorized loops and then call
+// CodeGen_C::visit(op) as well. NOTE(review): that base printer is expected
+// to print the same pragma for Vectorized loops (this body was a copy of it
+// with the final call redirected) - confirm. With both in place the
+// directive appeared twice in front of the loop.
+void CodeGen_ISPC::visit(const While* op) {
+  CodeGen_C::visit(op);
+}
+
+// Prints the variable name registered for a tensor property in varMap, to out2
+// while the ISPC code stream is enabled and to out otherwise.
+void CodeGen_ISPC::visit(const GetProperty* op) {
+  taco_iassert(varMap.count(op) > 0) <<
+      "Property " << Expr(op) << " of " << op->tensor << " not found in varMap";
+  if (!is_ISPC_code_stream_enabled()) {
+    out << varMap[op];
+    return;
+  }
+  out2 << varMap[op];
+}
+
+// Prints Min as nested TACO_MIN(a,TACO_MIN(b,c)); one operand is printed bare.
+void CodeGen_ISPC::visit(const Min* op) {
+  const size_t nested = op->operands.size() - 1;  // count of TACO_MIN( openers
+  if (nested == 0) {
+    op->operands[0].accept(this);
+    return;
+  }
+  for (size_t k = 0; k != nested; ++k) {
+    stream << "TACO_MIN(";
+    op->operands[k].accept(this);
+    stream << ",";
+  }
+  op->operands.back().accept(this);
+  for (size_t k = nested; k != 0; --k) stream << ")";
+}
+
+// Prints Max as nested TACO_MAX(a,TACO_MAX(b,c)); one operand is printed bare.
+void CodeGen_ISPC::visit(const Max* op) {
+  const size_t nested = op->operands.size() - 1;  // count of TACO_MAX( openers
+  if (nested == 0) {
+    op->operands[0].accept(this);
+    return;
+  }
+  for (size_t k = 0; k != nested; ++k) {
+    stream << "TACO_MAX(";
+    op->operands[k].accept(this);
+    stream << ",";
+  }
+  op->operands.back().accept(this);
+  for (size_t k = nested; k != 0; --k) stream << ")";
+}
+
+// Emits the allocation statement for op->var.
+//
+// With the ISPC code stream disabled the node is printed by CodeGen_C. With it
+// enabled the statement is written to stream2 as one of:
+//   var = realloc(var, sizeof(T) * n);   when op->is_realloc
+//   var = calloc(1, sizeof(T) * n);      when op->clear
+//   var = new T[n];                      otherwise
+// where T is printCType(op->var.type(), false) and n is op->num_elements.
+//
+// The realloc/calloc forms previously reused the `T[n];` tail of the `new`
+// form, which printed an unterminated call such as `x = realloc(x, T[n];`.
+// NOTE(review): realloc/calloc must be declared to the ISPC compiler for
+// these two forms to build - confirm, or switch them to `new` plus a copy.
+void CodeGen_ISPC::visit(const Allocate* op) {
+  if (!is_ISPC_code_stream_enabled()) {
+    CodeGen_C::visit(op);
+    return;
+  }
+
+  string elementType = printCType(op->var.type(), false);
+  doIndent();
+  op->var.accept(this);
+  stream2 << " = ";
+
+  // A plain allocation uses the array `new` form; the C-style calls take a
+  // byte count instead and need their closing parenthesis.
+  const bool cStyleCall = op->is_realloc || op->clear;
+  if (op->is_realloc) {
+    stream2 << "realloc(";
+    op->var.accept(this);
+    stream2 << ", sizeof(" << elementType << ") * ";
+  } else if (op->clear) {
+    stream2 << "calloc(1, sizeof(" << elementType << ") * ";
+  } else {
+    stream2 << "new " << elementType << "[";
+  }
+  parentPrecedence = MUL;
+  op->num_elements.accept(this);
+  parentPrecedence = TOP;
+  stream2 << (cStyleCall ? ");" : "];") << endl;
+}
+
+void CodeGen_ISPC::visit(const Sqrt* op) {  // prints op as sqrt(<op->a>) on stream
+  taco_tassert(op->type.isFloat() && op->type.getNumBits() == 64) <<  // 64-bit floats only
+      "Codegen doesn't currently support non-double sqrt";
+  stream << "sqrt(";
+  op->a.accept(this);  // the operand prints itself between the parentheses
+  stream << ")";
+}
+
+void CodeGen_ISPC::visit(const Assign* op) {  // prints an assignment statement
+  if (is_ISPC_code_stream_enabled()) {  // ISPC path: the statement goes to stream2
+    doIndent();
+    op->lhs.accept(this);
+    parentPrecedence = Precedence::TOP;
+    bool printed = false;  // set once a compound form (++, +=, *=, |=) was printed
+    if (simplify) {
+      if (isa<ir::Add>(op->rhs)) {
+        auto add = to<Add>(op->rhs);
+        if (add->a == op->lhs) {  // lhs = lhs + b
+          const Literal* lit = add->b.as<Literal>();
+          if (lit != nullptr && ((lit->type.isInt()  && lit->equalsScalar(1)) ||
+                                (lit->type.isUInt() && lit->equalsScalar(1)))) {
+            stream2 << "++";
+          }
+          else {
+            if (op->use_atomics) {  // atomic add: the addend is wrapped in reduce_add()
+              stream2 << " += reduce_add(";
+              add->b.accept(this);
+              stream2 << ")";
+            }
+            else {
+              stream2 << " += ";
+              add->b.accept(this);
+            }
+          }
+          printed = true;
+        }
+      }
+      else if (isa<Mul>(op->rhs)) {
+        auto mul = to<Mul>(op->rhs);
+        if (mul->a == op->lhs) {  // lhs = lhs * b
+          stream2 << " *= ";
+          mul->b.accept(this);
+          printed = true;
+        }
+      }
+      else if (isa<BitOr>(op->rhs)) {
+        auto bitOr = to<BitOr>(op->rhs);
+        if (bitOr->a == op->lhs) {  // lhs = lhs | b
+          stream2 << " |= ";
+          bitOr->b.accept(this);
+          printed = true;
+        }
+      }
+    }
+    if (!printed) {  // no compound form applied: plain `lhs = rhs`
+      stream2 << " = ";
+      op->rhs.accept(this);
+    }
+    // Terminate the statement.
+    stream2 << ";";
+    stream2 << endl;
+    // NOTE(review): statement already printed above - confirm this does not repeat it.
+    IRPrinter::visit(op);
+  }
+  else {  // ISPC code stream disabled: printed by CodeGen_C
+    CodeGen_C::visit(op);
+
+  }
+
+
+}
+
+// Prints a store. Atomic stores are preceded by the atomic pragma, written to
+// stream2 while the ISPC code stream is enabled and to stream otherwise; the
+// store itself is always printed by IRPrinter.
+void CodeGen_ISPC::visit(const Store* op) {
+  const bool toIspc = is_ISPC_code_stream_enabled();
+  if (op->use_atomics) {
+    doIndent();
+    if (toIspc) {
+      stream2 << getAtomicPragma() << endl;
+    } else {
+      stream << getAtomicPragma() << endl;
+    }
+  }
+  IRPrinter::visit(op);
+}
+
+}
+}
diff --git a/src/codegen/codegen_ispc.h b/src/codegen/codegen_ispc.h
new file mode 100644
index 000000000..62d2897ca
--- /dev/null
+++ b/src/codegen/codegen_ispc.h
@@ -0,0 +1,68 @@
+#ifndef TACO_BACKEND_ISPC_H
+#define TACO_BACKEND_ISPC_H
+#include <map>
+#include <vector>
+#include <stdbool.h>
+
+#include "taco/ir/ir.h"
+#include "taco/ir/ir_printer.h"
+#include "codegen_c.h"
+
+namespace taco {
+namespace ir {
+
+
+class CodeGen_ISPC : public CodeGen_C {  // code generator for the ISPC backend
+public:
+  /// Initialize a code generator that generates code to an output stream
+  /// (dest). NOTE(review): the second form also takes dest2 - confirm its role.
+  CodeGen_ISPC(std::ostream &dest, OutputKind outputKind, bool simplify=true);
+  CodeGen_ISPC(std::ostream &dest, std::ostream &dest2, OutputKind outputKind, bool simplify=true);
+  ~CodeGen_ISPC();
+
+  /// Compile a lowered function
+  void compile(Stmt stmt, bool isFirst=false);
+
+  /// Generate shims that unpack an array of pointers representing
+  /// a mix of taco_tensor_t* and scalars into a function call
+  static void generateShim(const Stmt& func, std::stringstream &stream);
+
+protected:
+  using CodeGen_C::visit;
+  // Node printers declared by this class (CodeGen_C's stay visible via using).
+  void visit(const Function*);
+  void visit(const VarDecl*);
+  void visit(const Yield*);
+  void visit(const Var*);
+  void visit(const For*);
+  void visit(const While*);
+  void visit(const GetProperty*);
+  void visit(const Min*);
+  void visit(const Max*);
+  void visit(const Allocate*);
+  void visit(const Sqrt*);
+  void visit(const Store*);
+  void visit(const Assign*);
+
+  Stmt simplifyFunctionBodies(Stmt stmt);
+  std::string printCallISPCFunc(const std::string& funcName, std::map<Expr, std::string, ExprCompare> varMap,
+                                std::vector<const GetProperty*> &sortedProps);
+  void printISPCFunc(const Function *func, std::map<Expr, std::string, ExprCompare> varMap,
+                                  std::vector<const GetProperty*> &sortedProps);
+  /// Checked by visit(const For*): Mul_Thread loops print only a launch while false.
+  bool taskCode = false;
+  // sortedProps is passed to printCallISPCFunc by visit(const For*).
+  std::stringstream funcVariables;
+  std::vector<const GetProperty*> sortedProps;
+  // Nested helper classes, declared here and defined elsewhere.
+  class FindVars;
+  class FunctionCollector;
+
+private:
+  virtual std::string restrictKeyword() const { return "restrict"; }
+  void sendToStream(std::stringstream &stream);
+};
+
+} // namespace ir
+} // namespace taco
+#endif
diff --git a/src/codegen/module.cpp b/src/codegen/module.cpp
index bd0f487b1..c95999365 100644
--- a/src/codegen/module.cpp
+++ b/src/codegen/module.cpp
@@ -13,6 +13,7 @@
 #include "taco/util/strings.h"
 #include "taco/util/env.h"
 #include "codegen/codegen_c.h"
+#include "codegen/codegen_ispc.h"
 #include "codegen/codegen_cuda.h"
 #include "taco/cuda.h"
 
@@ -42,6 +43,7 @@ void Module::addFunction(Stmt func) {
 
 void Module::compileToSource(string path, string prefix) {
   if (!moduleFromUserSource) {
+    std::cout << "module not from user source\n";
   
     // create a codegen instance and add all the funcs
     bool didGenRuntime = false;
@@ -50,11 +52,13 @@ void Module::compileToSource(string path, string prefix) {
     header.clear();
     source.str("");
     source.clear();
+    additional_source.str("");
+    additional_source.clear();
 
     taco_tassert(target.arch == Target::C99) <<
         "Only C99 codegen supported currently";
     std::shared_ptr<CodeGen> sourcegen =
-        CodeGen::init_default(source, CodeGen::ImplementationGen);
+        CodeGen::init_default(source, additional_source, CodeGen::ImplementationGen);
     std::shared_ptr<CodeGen> headergen =
             CodeGen::init_default(header, CodeGen::HeaderGen);
 
@@ -68,8 +72,17 @@ void Module::compileToSource(string path, string prefix) {
   ofstream source_file;
   string file_ending = should_use_CUDA_codegen() ? ".cu" : ".c";
   source_file.open(path+prefix+file_ending);
+  if (should_use_ISPC_codegen()) {
+    source_file << "#include \"" << path+prefix+"_ispc.h\"\n";
+  }
   source_file << source.str();
   source_file.close();
+
+  ofstream additional_source_file;
+  string file_ending2 = ".ispc";
+  additional_source_file.open(path+prefix+file_ending2);
+  additional_source_file << additional_source.str();
+  additional_source_file.close();
   
   ofstream header_file;
   header_file.open(path+prefix+".h");
@@ -89,6 +102,9 @@ void writeShims(vector<Stmt> funcs, string path, string prefix) {
     if (should_use_CUDA_codegen()) {
       CodeGen_CUDA::generateShim(func, shims);
     }
+    // else if (should_use_ISPC_codegen()) {
+    //   CodeGen_ISPC::generateShim(func, shims);
+    // }
     else {
       CodeGen_C::generateShim(func, shims);
     }
@@ -98,6 +114,9 @@ void writeShims(vector<Stmt> funcs, string path, string prefix) {
   if (should_use_CUDA_codegen()) {
     shims_file.open(path+prefix+"_shims.cpp");
   }
+  // else if (should_use_ISPC_codegen()) {
+  //   shims_file.open(path+prefix+".c", ios::app);
+  // }
   else {
     shims_file.open(path+prefix+".c", ios::app);
   }
@@ -109,6 +128,7 @@ void writeShims(vector<Stmt> funcs, string path, string prefix) {
 } // anonymous namespace
 
 string Module::compile() {
+  std::cout << "Module::compile\n";
   string prefix = tmpdir+libname;
   string fullpath = prefix + ".so";
   
@@ -123,6 +143,13 @@ string Module::compile() {
     file_ending = ".cu";
     shims_file = prefix + "_shims.cpp";
   }
+  // else if (should_use_ISPC_codegen()) {
+  //   cc = util::getFromEnv("TACO_ISPC", "ispc");
+  //   cflags = util::getFromEnv("TACO_ISPC_FLAGS",
+  //   " --target=sse2-i32x4,sse4-i32x8,avx1-i32x8,avx2-i32x8,avx512knl-i32x16,avx512skx-i32x16 --pic -O3 --addressing=64 --arch=x86-64"
+  //   ) + " ";
+
+  // }
   else {
     cc = util::getFromEnv(target.compiler_env, target.compiler);
     cflags = util::getFromEnv("TACO_CFLAGS",
@@ -137,17 +164,55 @@ string Module::compile() {
   string cmd = cc + " " + cflags + " " +
     prefix + file_ending + " " + shims_file + " " + 
     "-o " + fullpath + " -lm";
+  std::cout << "--------------------------------------------------------------------------------tmpdir: " << tmpdir << std::endl;
+  std::cout << "--------------------------------------------------------------------------------libname: " << libname << std::endl;
+  std::cout << "--------------------------------------------------------------------------------prefix: " << prefix << std::endl;
+  std::cout << "--------------------------------------------------------------------------------fullpath: " << fullpath << std::endl;
+  std::cout << "--------------------------------------------------------------------------------cmd: " << cmd << std::endl;
 
   // open the output file & write out the source
   compileToSource(tmpdir, libname);
+
   
   // write out the shims
   writeShims(funcs, tmpdir, libname);
+  for (auto &statement : funcs) {
+    std::cout << "----- statement --------" << std::endl;
+    // std::cout << statement;
+    std::cout << std::endl;
+  }
+  std::cout << tmpdir << std::endl << libname << std::endl;
   
-  // now compile it
-  int err = system(cmd.data());
-  taco_uassert(err == 0) << "Compilation command failed:\n" << cmd
-    << "\nreturned " << err;
+  if (should_use_ISPC_codegen()) {
+    string ispc = util::getFromEnv("TACO_ISPC", "ispc");
+    string ispcflags = util::getFromEnv("TACO_ISPC_FLAGS",
+    " --target=sse2-i32x4,sse4-i32x8,avx1-i32x8,avx2-i32x8,avx512knl-i32x16,avx512skx-i32x16 --pic -O3 --addressing=64 --arch=x86-64"
+    ) + " ";
+    string cmd = ispc + " " + ispcflags + " -o " + prefix + ".ispc.o " + " --emit-obj " + prefix + ".ispc " + "-h " + prefix + "_ispc.h";
+
+    // now compile the ispc file to generate the object file and the ispc header file
+    std::cout << "--------------------------------------------------------------------------------cmd: " << cmd << std::endl;
+    int err = system(cmd.data());
+    taco_uassert(err == 0) << "Compilation command failed:\n" << cmd
+      << "\nreturned " << err;
+
+    string ispc_object_file = " " + prefix + ".ispc.o ";
+    string ispc_object_files_for_diff_targets = " " + prefix + ".ispc_* ";
+    cmd = cc + " " + cflags + " " +
+      prefix + file_ending + " " + ispc_object_file + ispc_object_files_for_diff_targets + shims_file + " " + 
+      "-o " + fullpath + " -lm -lrt ";
+
+    // now compile the c file linking the ispc object file. ispc header is added to the top of the c file
+    std::cout << "--------------------------------------------------------------------------------cmd: " << cmd << std::endl;
+    err = system(cmd.data());
+    taco_uassert(err == 0) << "Compilation command failed:\n" << cmd
+      << "\nreturned " << err;
+  } else {
+    // now compile it
+    int err = system(cmd.data());
+    taco_uassert(err == 0) << "Compilation command failed:\n" << cmd
+      << "\nreturned " << err;
+  }
 
   // use dlsym() to open the compiled library
   if (lib_handle) {
diff --git a/src/cuda.cpp b/src/cuda.cpp
index 059c60105..68e49fe98 100644
--- a/src/cuda.cpp
+++ b/src/cuda.cpp
@@ -7,6 +7,25 @@
 
 using namespace std;
 namespace taco {
+
+// ISPC code generation switch; starts out as the ISPC_BUILT build setting.
+static bool ISPC_codegen_enabled = ISPC_BUILT;
+bool should_use_ISPC_codegen() {
+  return ISPC_codegen_enabled;
+}
+void set_ISPC_codegen_enabled(bool enabled) {
+  ISPC_codegen_enabled = enabled;
+}
+
+// ISPC code stream switch, off until a caller turns it on.
+static bool ISPC_code_stream_enabled = false;
+bool is_ISPC_code_stream_enabled() {
+  return ISPC_code_stream_enabled;
+}
+void set_ISPC_code_stream_enabled(bool enabled) {
+  ISPC_code_stream_enabled = enabled;
+}
+
 /// Functions used by taco to interface with CUDA (especially unified memory)
 static bool CUDA_codegen_enabled = CUDA_BUILT;
 static bool CUDA_unified_memory_enabled = CUDA_BUILT;
diff --git a/src/index_notation/index_notation.cpp b/src/index_notation/index_notation.cpp
index 51fb8770c..d3483c2d6 100644
--- a/src/index_notation/index_notation.cpp
+++ b/src/index_notation/index_notation.cpp
@@ -2438,6 +2438,7 @@ bool isConcreteNotation(IndexStmt stmt, std::string* reason) {
   return isConcrete;
 }
 
+// make reduction notation
 Assignment makeReductionNotation(Assignment assignment) {
   IndexExpr expr = assignment.getRhs();
   std::vector<IndexVar> free = assignment.getLhs().getIndexVars();
@@ -2513,7 +2514,10 @@ IndexStmt makeReductionNotation(IndexStmt stmt) {
   return makeReductionNotation(to<Assignment>(stmt));
 }
 
+// make concrete notation
 IndexStmt makeConcreteNotation(IndexStmt stmt) {
+  std::cout << "concrete notation original assignment: " << stmt << std::endl;
+
   std::string reason;
   taco_iassert(isReductionNotation(stmt, &reason))
       << "Not reduction notation: " << stmt << std::endl << reason;
@@ -2521,6 +2525,7 @@ IndexStmt makeConcreteNotation(IndexStmt stmt) {
 
   // Free variables and reductions covering the whole rhs become top level loops
   vector<IndexVar> freeVars = to<Assignment>(stmt).getFreeVars();
+  std::cout << "free vars: " << freeVars << std::endl;
 
   struct RemoveTopLevelReductions : IndexNotationRewriter {
     using IndexNotationRewriter::visit;
@@ -2535,12 +2540,17 @@ IndexStmt makeConcreteNotation(IndexStmt stmt) {
         topLevelReductions.push_back(reduction.getVar());
         rhs = reduction.getExpr();
       }
+      std::cout << "top level reductions: " << topLevelReductions << std::endl;
 
       if (rhs != node->rhs) {
-        stmt = Assignment(node->lhs, rhs, Add());
+        stmt = Assignment(node->lhs, rhs, Add()); // write with add
+        int idx = 0;
         for (auto& i : util::reverse(topLevelReductions)) {
+          std::cout << idx << ": " << stmt << std::endl;
+          idx++;
           stmt = forall(i, stmt);
         }
+        std::cout << idx << ": " << stmt << std::endl;
       }
       else {
         stmt = node;
@@ -2548,11 +2558,18 @@ IndexStmt makeConcreteNotation(IndexStmt stmt) {
     }
   };
   stmt = RemoveTopLevelReductions().rewrite(stmt);
+  std::cout << "after remove top level reductions: " << stmt << std::endl;
 
+  // now we form the stmt in reverse order of freeVars
+  int idx = 0;
   for (auto& i : util::reverse(freeVars)) {
+    std::cout << idx << ": " << stmt << std::endl;
     stmt = forall(i, stmt);
+    idx++;
   }
+  std::cout << idx << ": " << stmt << std::endl;
 
+  std::cout << "replacing reductions with whereas statements\n";
   // Replace other reductions with where and forall statements
   struct ReplaceReductionsWithWheres : IndexNotationRewriter {
     using IndexNotationRewriter::visit;
diff --git a/src/index_notation/index_notation_printer.cpp b/src/index_notation/index_notation_printer.cpp
index 0b41615ad..d7ee998ae 100644
--- a/src/index_notation/index_notation_printer.cpp
+++ b/src/index_notation/index_notation_printer.cpp
@@ -224,9 +224,9 @@ void IndexNotationPrinter::visit(const YieldNode* op) {
 void IndexNotationPrinter::visit(const ForallNode* op) {
   os << "forall(" << op->indexVar << ", ";
   op->stmt.accept(this);
-  if (op->parallel_unit != ParallelUnit::NotParallel) {
+  // if (op->parallel_unit != ParallelUnit::NotParallel) {
     os << ", " << ParallelUnit_NAMES[(int) op->parallel_unit] << ", " << OutputRaceStrategy_NAMES[(int) op->output_race_strategy];
-  }
+  // }
   os << ")";
 }
 
diff --git a/src/index_notation/transformations.cpp b/src/index_notation/transformations.cpp
index 47fc1dd55..37a89e617 100644
--- a/src/index_notation/transformations.cpp
+++ b/src/index_notation/transformations.cpp
@@ -1,9 +1,16 @@
 #include "taco/index_notation/transformations.h"
 
+#include "lower/iteration_graph.h"
+#include "lower/tensor_path.h"
+#include "taco/cuda.h"
 #include "taco/index_notation/index_notation.h"
+#include "taco/index_notation/index_notation_nodes_abstract.h"
 #include "taco/index_notation/index_notation_rewriter.h"
 #include "taco/index_notation/index_notation_nodes.h"
+#include "taco/index_notation/index_notation_printer.h"
 #include "taco/error/error_messages.h"
+#include "taco/index_notation/intrinsic.h"
+#include "taco/type.h"
 #include "taco/util/collections.h"
 #include "taco/lower/iterator.h"
 #include "taco/lower/merge_lattice.h"
@@ -305,6 +312,7 @@ IndexStmt Precompute::apply(IndexStmt stmt, std::string* reason) const {
         IndexExpr e = precompute.getExpr();
         IndexVar iw = precompute.getiw();
 
+        // these lines of code looks interesting when creating the producer consumer relationship
         IndexStmt consumer = forall(i, replace(s, {{e, ws(i)}}));
         IndexStmt producer = forall(iw, Assignment(ws(iw), replace(e, {{i,iw}}), 
                                                    assign.getOperator()));
@@ -592,7 +600,10 @@ IndexStmt Parallelize::apply(IndexStmt stmt, std::string* reason) const {
     std::string reason = "";
 
     IndexStmt rewriteParallel(IndexStmt stmt) {
+      std::cout << "1 rewriting IndexStmt to support parallelize schedule directive\n--------------------------------------------\n";
+      // std::cout << stmt << std::endl;
       provGraph = ProvenanceGraph(stmt);
+      std::cout << "2 rewriting IndexStmt to support parallelize schedule directive\n--------------------------------------------\n";
 
       const auto reductionVars = getReductionVars(stmt);
       reductionIndexVars.clear();
@@ -607,15 +618,22 @@ IndexStmt Parallelize::apply(IndexStmt stmt, std::string* reason) const {
       tensorVars = createIRTensorVars(stmt);
 
       assembledByUngroupedInsert.clear();
+      std::cout << "3 rewriting IndexStmt to support parallelize schedule directive\n--------------------------------------------\n";
       for (const auto& result : getAssembledByUngroupedInsertion(stmt)) {
         assembledByUngroupedInsert.push_back(tensorVars[result]);
       }
 
+      std::cout << "4 rewriting IndexStmt to support parallelize schedule directive\n--------------------------------------------\n";
+      // std::cout << stmt << std::endl;
       return rewrite(stmt);
     }
 
     void visit(const ForallNode* node) {
+      std::cout << "transformations.cpp void visit(const ForallNode* node)\n";
+      std::cout << "node: \n" << node << std::endl;
       Forall foralli(node);
+      std::cout << "foralli: \n" << foralli << std::endl;
+      std::cout << "before stmt update stmt: \n" << stmt << std::endl;
       IndexVar i = parallelize.geti();
 
       definedIndexVars.insert(foralli.getIndexVar());
@@ -632,6 +650,7 @@ IndexStmt Parallelize::apply(IndexStmt stmt, std::string* reason) const {
         Iterators iterators(foralli, tensorVars);
         MergeLattice lattice = MergeLattice::make(foralli, iterators, provGraph, 
                                                   definedIndexVars);
+        std::cout << "iter: " << i << ", lattice: \n" << lattice << std::endl;
 
         // Precondition 2: No coiteration of modes (i.e., merge lattice has 
         //                 only one iterator)
@@ -660,6 +679,7 @@ IndexStmt Parallelize::apply(IndexStmt stmt, std::string* reason) const {
         MergeLattice underivedLattice = MergeLattice::make(underivedForall, 
                                                            iterators, provGraph, 
                                                            definedIndexVars);
+        std::cout << "iter: " << i << ", underivedLattice: \n" << lattice << std::endl;
 
         // Precondition 3: Every result iterator must have insert capability
         for (Iterator iterator : underivedLattice.results()) {
@@ -721,6 +741,7 @@ IndexStmt Parallelize::apply(IndexStmt stmt, std::string* reason) const {
             // build consumer that writes from temporary to output, mark consumer as parallel reduction
             ParallelUnit reductionUnit = ParallelUnit::CPUThreadGroupReduction;
             if (should_use_CUDA_codegen()) {
+              std::cout << "should_use_CUDA_codegen() true\n";
               if (parentParallelUnits.count(ParallelUnit::GPUWarp)) {
                 reductionUnit = ParallelUnit::GPUWarpReduction;
               }
@@ -728,6 +749,9 @@ IndexStmt Parallelize::apply(IndexStmt stmt, std::string* reason) const {
                 reductionUnit = ParallelUnit::GPUBlockReduction;
               }
             }
+            else {
+              std::cout << "should_use_CUDA_codegen() false\n";
+            }
             IndexStmt consumer = forall(i, Assignment(assignment->lhs, w(i), assignment->op), reductionUnit, OutputRaceStrategy::ParallelReduction);
             precomputed_stmt = where(consumer, producer);
           }
@@ -746,8 +770,9 @@ IndexStmt Parallelize::apply(IndexStmt stmt, std::string* reason) const {
           return;
         }
 
-
+        std::cout << "updated stmt: \n";
         stmt = forall(i, foralli.getStmt(), parallelize.getParallelUnit(), parallelize.getOutputRaceStrategy(), foralli.getUnrollFactor());
+        std::cout << stmt << std::endl;
         return;
       }
 
@@ -1181,6 +1206,7 @@ std::ostream& operator<<(std::ostream& os,
 
 IndexStmt parallelizeOuterLoop(IndexStmt stmt) {
   // get outer ForAll
+  std::cout << "get outer ForAll ----------------- \n";
   Forall forall;
   bool matched = false;
   match(stmt,
@@ -1215,7 +1241,19 @@ IndexStmt parallelizeOuterLoop(IndexStmt stmt) {
     }
     return parallelized256;
   }
+  else if (should_use_ISPC_codegen()) {
+    std::cout << "outer loop parallelization for ISPC codegen\n";
+    // IndexStmt parallelized = Parallelize(forall.getIndexVar(), ParallelUnit::CPUSpmd, OutputRaceStrategy::NoRaces).apply(stmt, &reason);
+    // if (parallelized == IndexStmt()) {
+    //   // can't parallelize
+    //   return stmt;
+    // }
+    // return parallelized;
+
+    return stmt;
+  }
   else {
+    std::cout << "outer loop parallelization for CPU codgen index statement\n";
     IndexStmt parallelized = Parallelize(forall.getIndexVar(), ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces).apply(stmt, &reason);
     if (parallelized == IndexStmt()) {
       // can't parallelize
@@ -1320,8 +1358,669 @@ topologicallySort(map<IndexVar,set<IndexVar>> hardDeps,
   return sortedVars;
 }
 
+// Tests the "fission from the back" condition on the LAST index variable of
+// the result path and, when it holds, fills in the output parameters.
+//
+// The condition holds when no tensor path other than the last one visits the
+// result's last index variable, and the last tensor path ends in that variable.
+//
+//   resultTensorPath   path whose last variable is tested.
+//   tensorPaths        paths examined; the last entry is the removal candidate.
+//   removedAccessNode  out: name of the tensor variable accessed by the last path.
+//   producerVars       out: sortedAllIndexes without the result's last variable.
+//   consumerVars       out: entries of sortedAllIndexes that occur in
+//                      modifiedResultIndexesAccessed or in the last path.
+//   modifiedResultIndexesAccessed
+//                      out: entries of sortedAllIndexes, other than the result's
+//                      last variable, visited by the result or by the last path.
+//   sortedAllIndexes   index variables, in the order used to fill the outputs.
+//
+// Output vectors are appended to (never cleared) and, like removedAccessNode,
+// are written only when the function returns true.
+// Returns false when the condition does not hold. That includes an empty
+// tensorPaths and paths without variables: those inputs used to reach back()
+// on an empty vector (undefined behaviour) or make at() throw.
+// Progress is traced on std::cout.
+bool checkFromBack(const TensorPath& resultTensorPath, 
+                  const vector<TensorPath>& tensorPaths, 
+                  string& removedAccessNode, 
+                  vector<IndexVar>& producerVars, 
+                  vector<IndexVar>& consumerVars,
+                  vector<IndexVar>& modifiedResultIndexesAccessed, 
+                  vector<IndexVar>& sortedAllIndexes) {
+
+  std::cout << "check from back function execution\n";
+
+  const std::vector<IndexVar>& resultIndexesVisited = resultTensorPath.getVariables();
+  if (tensorPaths.empty() || resultIndexesVisited.empty()) {
+    return false;  // nothing to inspect
+  }
+  IndexVar lastVisitedIndexVar = resultIndexesVisited.back();
+
+  std::cout << "last visited index variable: " << lastVisitedIndexVar << std::endl;
+
+  // True when `var` occurs in `vars`.
+  auto contains = [](const vector<IndexVar>& vars, const IndexVar& var) {
+    return find(vars.begin(), vars.end(), var) != vars.end();
+  };
+
+  // Every path except the last must avoid the result's last index variable.
+  // The scan is not cut short so that the trace output stays complete.
+  bool onlyLastTensorContainLastIndexOfOutput = true;
+  for (size_t i = 0; i + 1 < tensorPaths.size(); i++) {
+    const TensorPath& otherIndexPaths = tensorPaths.at(i);
+    cout << "index paths: " << otherIndexPaths << endl;
+    for (const auto& index : otherIndexPaths.getVariables()) {
+      cout << "checking " << index << " " << lastVisitedIndexVar << endl;
+      if (index == lastVisitedIndexVar) {
+        onlyLastTensorContainLastIndexOfOutput = false;
+      }
+    }
+  }
+  if (!onlyLastTensorContainLastIndexOfOutput) {
+    return false;
+  }
+
+  // The last path is the candidate; it must end in that same variable.
+  const TensorPath& otherIndexPaths = tensorPaths.back();
+  const vector<IndexVar>& indexesVisited = otherIndexPaths.getVariables();
+  cout << "index paths: " << otherIndexPaths << endl;
+
+  cout << "index variable maybe removed from the back\n";
+  if (indexesVisited.empty()) {
+    return false;  // the candidate has no last variable to compare
+  }
+  auto lastTensorLastVisited = indexesVisited.back();
+  cout << "last index last visited " << lastTensorLastVisited << endl;
+  if (!(lastTensorLastVisited == lastVisitedIndexVar)) {
+    return false;
+  }
+
+  cout << "we can diffuse from the back\n";
+  removedAccessNode = otherIndexPaths.getAccess().getTensorVar().getName();
+  cout << "removed access node " << removedAccessNode << endl;
+
+  // Producer: every index variable except the result's last one.
+  for (const auto& indexVar : sortedAllIndexes) {
+    if (indexVar != lastVisitedIndexVar) {
+      std::cout << "producer vars: " << indexVar << std::endl;
+      producerVars.push_back(indexVar);
+    }
+  }
+
+  // Intermediate result: the same variables, restricted to those that the
+  // result or the removed tensor visits.
+  for (const auto& indexVar : sortedAllIndexes) {
+    if (indexVar != lastVisitedIndexVar &&
+        (contains(resultIndexesVisited, indexVar) ||
+         contains(indexesVisited, indexVar))) {
+      modifiedResultIndexesAccessed.push_back(indexVar);
+    }
+  }
+  for (const auto& idx : modifiedResultIndexesAccessed) {
+    std::cout << "modifiedResultIndexesAccessed: " << idx << std::endl;
+  }
+
+  // Consumer: variables of the intermediate result or of the removed tensor.
+  for (const auto& indexVar : sortedAllIndexes) {
+    if (contains(modifiedResultIndexesAccessed, indexVar) ||
+        contains(indexesVisited, indexVar)) {
+      std::cout << "consumer var: " << indexVar << std::endl;
+      consumerVars.emplace_back(indexVar);
+    }
+  }
+
+  return true;
+}
+
+/// Decides whether the first operand can be separated from the rest of the
+/// expression ("fission from the front") and, if so, records the loop
+/// variables of the resulting producer and consumer.
+///
+/// Fission is possible when the first variable of the result path is visited
+/// by no operand path other than tensorPaths.front(), and that first path
+/// itself starts with it. A result path or first operand path without
+/// variables, or an empty tensorPaths, never qualifies.
+///
+/// On success removedAccessNode is set to the first operand's tensor name and
+/// producerVars, modifiedResultIndexesAccessed and consumerVars are appended
+/// to (they are not cleared first), in the order given by sortedAllIndexes.
+/// Returns true when fission from the front is possible.
+bool checkFromFront(const TensorPath& resultTensorPath,
+                  const vector<TensorPath>& tensorPaths,
+                  string& removedAccessNode,
+                  vector<IndexVar>& producerVars,
+                  vector<IndexVar>& consumerVars,
+                  vector<IndexVar>& modifiedResultIndexesAccessed,
+                  vector<IndexVar>& sortedAllIndexes) {
+  std::cout << "check from front function execution\n";
+
+  const std::vector<IndexVar>& resultIndexesVisited = resultTensorPath.getVariables();
+  // front() on an empty vector is undefined behaviour, so reject a scalar
+  // result or an expression without operands before touching either.
+  if (resultIndexesVisited.empty() || tensorPaths.empty()) {
+    std::cout << "fission from the front is not possible\n";
+    return false;
+  }
+  IndexVar firstVisitedIndexVar = resultIndexesVisited.front();
+
+  std::cout << "first fisited index variable: " << firstVisitedIndexVar << std::endl;
+  std::cout << "tensor path size: " << tensorPaths.size() << std::endl;
+
+  const auto contains = [](const vector<IndexVar>& vars, const IndexVar& var) {
+    return find(vars.begin(), vars.end(), var) != vars.end();
+  };
+
+  bool onlyFirstTensorContainFirstIndexOfOutput = true;
+  bool fissionFromFront = false;
+
+  // Every operand path except the first (index 0) must avoid the first
+  // result variable; index 0 is examined separately below.
+  for (long i=tensorPaths.size()-1; i>0; i--) {
+    std::cout << "i: " << i << std::endl;
+    const TensorPath& otherIndexPaths = tensorPaths.at(i);
+    const vector<IndexVar>& indexesVisited = otherIndexPaths.getVariables();
+    cout << "index paths: " << otherIndexPaths << endl;
+
+    for (auto index : indexesVisited) {
+      cout << "checking " << index << " " << firstVisitedIndexVar << endl;
+      if (index == firstVisitedIndexVar) {
+        onlyFirstTensorContainFirstIndexOfOutput = false;
+      }
+    }
+  }
+
+  if (onlyFirstTensorContainFirstIndexOfOutput) {
+    const TensorPath& otherIndexPaths = tensorPaths.front();
+    const vector<IndexVar>& indexesVisited = otherIndexPaths.getVariables();
+    cout << "index paths: " << otherIndexPaths << endl;
+
+    cout << "index variable maybe removed from the front\n";
+    // A first operand without variables cannot start with that variable.
+    if (indexesVisited.empty()) {
+      std::cout << "fission from the front is not possible\n";
+      return false;
+    }
+    auto firstTensorFirstVisited = indexesVisited.front();
+    cout << "first index first visited " << firstTensorFirstVisited << endl;
+
+    if (firstTensorFirstVisited == firstVisitedIndexVar) {
+      cout << "we can diffuse from the front\n";
+      fissionFromFront = true;
+      removedAccessNode = otherIndexPaths.getAccess().getTensorVar().getName();
+      cout << "removed access node " << removedAccessNode << endl;
+
+      // Producer: every variable except the first result variable. The
+      // intermediate result keeps those of them that the result or the first
+      // operand visits.
+      for (auto indexVar : sortedAllIndexes) {
+        if (indexVar == firstVisitedIndexVar) {
+          continue;
+        }
+        producerVars.emplace_back(indexVar);
+        if (contains(resultIndexesVisited, indexVar) ||
+            contains(indexesVisited, indexVar)) {
+          modifiedResultIndexesAccessed.push_back(indexVar);
+        }
+      }
+
+      for (auto& idx : modifiedResultIndexesAccessed) {
+        std::cout << "modifiedResultIndexesAccessed: " << idx << std::endl;
+      }
+
+      // mark consumer accessed index variables
+      for (auto indexVar : sortedAllIndexes) {
+        if (contains(modifiedResultIndexesAccessed, indexVar) ||
+            contains(indexesVisited, indexVar)) {
+          consumerVars.emplace_back(indexVar);
+        }
+      }
+    }
+  } else {
+    std::cout << "fission from the front is not possible\n";
+  }
+
+  return fissionFromFront;
+}
+
+
+/// Splits the assignment nested in `stmt` into a producer and a consumer that
+/// communicate through a temporary tensor, and recombines them under the
+/// loops both of them share. Progress is traced on std::cout.
+///
+/// The operand to separate is picked by `side`:
+///  - "b": fission from the back, validated by checkFromBack();
+///  - "f": fission from the front, validated by checkFromFront().
+/// Any other value, or a failed validation, returns `stmt` unchanged.
+///
+/// The producer multiplies every right-hand-side access except the separated
+/// one into a temporary named "t" + <result name>; the consumer combines that
+/// temporary with the separated access and stores into the original result.
+/// Accesses are told apart by tensor name. Loop variables forming a common
+/// prefix of the producer and consumer variables (in forall-nest order) stay
+/// fused around a where statement; the other loops are rebuilt per side.
+///
+/// stmt:       forall nest whose innermost statement is the assignment.
+/// assignment: the assignment to split; supplies lhs, rhs and the iteration
+///             graph.
+/// side:       "b" or "f", see above.
+/// iters:      remaining applications. Values below 1 return `stmt`
+///             unchanged; otherwise a producer with more than one
+///             multiplication is split again with iters - 1.
+/// Returns the rewritten statement, or `stmt` when nothing was changed.
+IndexStmt loopFusionOverFission(IndexStmt stmt, Assignment assignment,
+  std::string side, int iters) {
+  std::cout << "executing travese operation written by me\n";
+
+  if (iters < 1) {
+    return stmt;
+  }
+
+  // Collects the index variables of the forall nest from the outermost loop
+  // inwards, the scheduling attributes of every forall, and the innermost
+  // assignment. Removing one variable from that order gives the producer order.
+  struct SortedIndexVars : public IndexNotationVisitor {
+    using IndexNotationVisitor::visit;
+    // Per-forall attributes, re-applied when the loops are rebuilt.
+    map <IndexVar, ParallelUnit> forallParallelUnit;
+    map <IndexVar, OutputRaceStrategy> forallOutputRaceStrategy;
+    vector<IndexVar> sortedIndexes;
+    // Assignment found directly inside a forall.
+    Assignment innerBody;
+
+    SortedIndexVars() {}
+
+    void visit(const ForallNode* node) {
+      Forall forallNode(node);
+      IndexVar i = forallNode.getIndexVar();
+      std::cout << forallNode << std::endl;
+
+      sortedIndexes.push_back(i);
+      forallParallelUnit[i] = forallNode.getParallelUnit();
+      forallOutputRaceStrategy[i] = forallNode.getOutputRaceStrategy();
+
+      if (isa<Assignment>(forallNode.getStmt())) {
+        cout << "assignment node found: " << forallNode.getStmt() << endl;
+        innerBody = to<Assignment>(forallNode.getStmt());
+        return; // stop descending: the body is the assignment itself
+      }
+
+      IndexNotationVisitor::visit(node);
+    }
+  };
+
+  std::cout << "traversing through the index statement\n";
+  SortedIndexVars sortedIndexVars;
+  stmt.accept(&sortedIndexVars);
+  std::cout << std::endl;
+
+  struct IndexExprBuilder : public IndexNotationVisitor {
+
+    using IndexNotationVisitor::visit;
+    // Accesses of the visited expression in visiting order.
+    vector<Access> accessLeftToRight;
+    // For every index variable, the (dimension, tensor type) of each
+    // access mode it indexes.
+    map<IndexVar, vector<pair<Dimension,Type>>> indexDimensionsMap;
+
+    void visit(const AccessNode* node) {
+      Access accessNode(node);
+      std::cout << "access node: " << accessNode << std::endl;
+      accessLeftToRight.push_back(accessNode);
+
+      TensorVar tensorVar = accessNode.getTensorVar();
+
+      for (unsigned long i=0; i < accessNode.getIndexVars().size(); i++) {
+        auto var = accessNode.getIndexVars()[i];
+        // operator[] creates the entry on first use, so no lookup is needed.
+        indexDimensionsMap[var].emplace_back(
+          tensorVar.getType().getShape().getDimension(i), tensorVar.getType());
+      }
+    }
+
+  };
+
+  IndexExpr rhsExpr = assignment.getRhs();
+  Access lhsAccess = to<Access>(assignment.getLhs());
+  std::cout << "right hand side expression: " << rhsExpr << std::endl;
+  IndexExprBuilder indexExprBuilder;
+  rhsExpr.accept(&indexExprBuilder);
+  TensorVar resultVar = lhsAccess.getTensorVar();
+
+  for (auto item : indexExprBuilder.indexDimensionsMap) {
+    auto indexVar = item.first;
+    cout << "var: " << indexVar << " ";
+    for (auto elem : item.second) {
+      cout << elem.first << " " << elem.second << " " ;
+    }
+    cout << endl;
+  }
+
+
+  // The iteration graph provides the index variable path of every tensor.
+  IterationGraph iterationGraph = IterationGraph::make(assignment);
+  std::cout << "/*******************************************/\n";
+  std::cout << "/********** ITERATION GRAPH ****************/\n";
+  std::cout << "/*******************************************/\n";
+  std::cout << iterationGraph << std::endl;
+
+  const TensorPath& resultTensorPath = iterationGraph.getResultTensorPath();
+  const std::vector<TensorPath>& tensorPaths = iterationGraph.getTensorPaths();
+
+
+  string removedAccessNode;
+  vector<IndexVar> producerVars; // producer accessed index variables
+  vector<IndexVar> consumerVars; // consumer accessed index variables
+  vector<IndexVar> fusedVars;
+  vector<IndexVar> modifiedResultIndexesAccessed;
+  bool fissionFromBack = false;
+  if (side == "b") {
+    fissionFromBack = true;
+  }
+
+  if (fissionFromBack) {
+    fissionFromBack = checkFromBack(resultTensorPath, tensorPaths,
+      removedAccessNode, producerVars, consumerVars,
+      modifiedResultIndexesAccessed, sortedIndexVars.sortedIndexes
+    );
+  }
+
+  bool fissionFromFront = false;
+  if (side == "f") {
+    fissionFromFront = true;
+  }
+  if (fissionFromBack == false && fissionFromFront) {
+      fissionFromFront = checkFromFront(resultTensorPath, tensorPaths,
+        removedAccessNode, producerVars, consumerVars,
+        modifiedResultIndexesAccessed, sortedIndexVars.sortedIndexes
+      );
+  }
+
+  // Built after both checks, since either one fills modifiedResultIndexesAccessed;
+  // at() throws for a variable that no right-hand-side access uses.
+  vector<Dimension> newAccessDims{};
+  for (auto var : modifiedResultIndexesAccessed) {
+    const auto& item = indexExprBuilder.indexDimensionsMap.at(var);
+    cout << "shared vars: " << var << endl;
+    newAccessDims.emplace_back(item.front().first);
+  }
+  TensorVar newAccessVar(resultVar.getName() + "_inner",
+              Type(resultVar.getType().getDataType(), newAccessDims));
+  Access newResultAccess(newAccessVar, modifiedResultIndexesAccessed);
+  cout << "new access variable for iterative apply: " << newResultAccess << std::endl;
+
+  if (!fissionFromBack && !fissionFromFront) {
+    cout << "fission operation cannot be performed from the requested side\n";
+    return stmt;
+  }
+
+  if (fissionFromBack) {
+    std::cout << "fission from the back is possible\n";
+  }
+  if (fissionFromFront) {
+    std::cout << "fission from the front is possible\n";
+  }
+
+  // Report the loop variables assigned to each side.
+  cout << "\n\nProducer accessed index variables\n";
+  for (const auto& var : producerVars) {
+    cout << var << endl;
+  }
+  cout << "\n\nConsumer accessed index variables\n";
+  for (const auto& var : consumerVars) {
+    cout << var << endl;
+  }
+  cout << endl << endl;
+
+  // check common vars that can be fused: the leading run of loop variables
+  // that both the producer and the consumer use.
+  for (auto var : sortedIndexVars.sortedIndexes) {
+    if (find(producerVars.begin(), producerVars.end(), var) != producerVars.end() &&
+    find(consumerVars.begin(), consumerVars.end(), var) != consumerVars.end()) {
+      fusedVars.emplace_back(var);
+    }
+    else {
+      break;
+    }
+  }
+
+  for (auto& fv : fusedVars) {
+    std::cout << "fusable vars: " << fv << std::endl;
+  }
+
+  // Variables used by both sides but not fused index the temporary tensor.
+  vector<IndexVar> sharedVars;
+  for (auto var : sortedIndexVars.sortedIndexes) {
+    if (find(fusedVars.begin(), fusedVars.end(), var) == fusedVars.end() &&
+      find(producerVars.begin(), producerVars.end(), var) != producerVars.end() &&
+      find(consumerVars.begin(), consumerVars.end(), var) != consumerVars.end()
+    ) {
+      sharedVars.emplace_back(var);
+    }
+  }
+
+  for (auto& sv : sharedVars) {
+    std::cout << "shared vars: " << sv << std::endl;
+  }
+
+  vector<Dimension> sharedDims{};
+  for (auto var : sharedVars) {
+    const auto& item = indexExprBuilder.indexDimensionsMap.at(var);
+    cout << "shared vars: " << var << endl;
+    sharedDims.emplace_back(item.front().first);
+  }
+
+
+  // "ws" copies the type and format of the result tensor; it is only printed.
+  const Type& type = resultTensorPath.getAccess().getTensorVar().getType();
+  const Format& format = resultTensorPath.getAccess().getTensorVar().getFormat();
+  TensorVar intermediateTensor("ws", type, format);
+  cout << intermediateTensor << endl;
+
+  // Temporary handed from the producer to the consumer: one mode per shared
+  // variable, with the element type of the result.
+  TensorVar tempVar("t" + resultVar.getName(),
+                Type(resultVar.getType().getDataType(), sharedDims));
+  cout << "tensor order: " << tempVar.getOrder() << endl;
+  cout << "tensor format: " << tempVar.getFormat() << endl;
+  cout << "format order: " << tempVar.getFormat().getOrder() << endl;
+
+  // Access to the temporary, indexed by the shared variables.
+  Access workspace(tempVar, sharedVars);
+  std::cout << "workspace access tensor: " << workspace << std::endl;
+
+
+
+  // construct producer expression right hand side: the product of every
+  // access except the removed one, in left-to-right order
+  cout << "generating consumer expression\n";
+  IndexExpr producerExpr;
+  int num_muls = 0;
+  for (Access accessNode : indexExprBuilder.accessLeftToRight) {
+    std::cout << "accessNodes: " << accessNode << endl;
+    if (removedAccessNode != accessNode.getTensorVar().getName()) {
+      if (!producerExpr.defined()) {
+        std::cout << "index expression is null";
+        producerExpr = accessNode;
+        std::cout << "producerExpr: " << producerExpr << std::endl;
+      } else {
+        num_muls++;
+        producerExpr = producerExpr * accessNode;
+        std::cout << "producerExpr: " << producerExpr << std::endl;
+      }
+    }
+  }
+  std::cout << producerExpr << std::endl;
+  Assignment producerAssignment(newResultAccess,
+    producerExpr);
+  std::cout << "new inner assignment statement: " << producerAssignment << std::endl;
+  Assignment producerInnerBody(workspace,
+    producerExpr,
+    sortedIndexVars.innerBody.getOperator()
+  );
+  std::cout << "producerInnerBody: " << producerInnerBody << std::endl;
+
+  // construct consumer expression right hand side: the temporary combined
+  // with the removed access, keeping the original operand order
+  IndexExpr consumerExpr;
+  if (fissionFromBack) {
+    consumerExpr = workspace;
+  }
+  cout << "generating consumer expression: " << consumerExpr << std::endl;
+  for (Access accessNode : indexExprBuilder.accessLeftToRight) {
+    // accesses are matched by tensor name, as when building the producer
+    std::cout << "accessNodes: " << accessNode << endl;
+    if (removedAccessNode == accessNode.getTensorVar().getName()) {
+      if (!consumerExpr.defined()) {
+        std::cout << "index expression is null";
+        consumerExpr = accessNode;
+        std::cout << "consumerExpr: " << consumerExpr << std::endl;
+      } else {
+        consumerExpr = consumerExpr * accessNode;
+        std::cout << "consumerExpr: " << consumerExpr << std::endl;
+      }
+    }
+  }
+  if (fissionFromFront) {
+    consumerExpr = consumerExpr * workspace;
+  }
+  Assignment consumerInnerBody(lhsAccess,
+    consumerExpr,
+    sortedIndexVars.innerBody.getOperator()
+  );
+
+  cout << "Producer inner body: " << producerInnerBody << endl;
+  cout << "Consumer inner body: " << consumerInnerBody << endl;
+
+  // Rebuilds the loop nest for one side (producer or consumer): the innermost
+  // assignment becomes innerBody and only the non-fused loops over
+  // producerConsumerVars are kept.
+  struct ProducerConsumerRewriter : public IndexNotationRewriter {
+    using IndexNotationRewriter::visit;
+
+    const vector<IndexVar>& producerConsumerVars;
+    const vector<IndexVar>& fusedVars;
+    IndexStmt innerBody;
+    const map <IndexVar, ParallelUnit> forallParallelUnit;
+    const map <IndexVar, OutputRaceStrategy> forallOutputRaceStrategy;
+
+    ProducerConsumerRewriter(const vector<IndexVar>& producerConsumerVars,
+                    const vector<IndexVar>& fusedVars, IndexStmt innerBody,
+                    const map <IndexVar, ParallelUnit> forallParallelUnit,
+                    const map <IndexVar, OutputRaceStrategy> forallOutputRaceStrategy)
+        : producerConsumerVars(producerConsumerVars), fusedVars(fusedVars), innerBody(innerBody),
+        forallParallelUnit(forallParallelUnit), forallOutputRaceStrategy(forallOutputRaceStrategy)  {
+    }
+
+    void visit(const ForallNode* node) {
+      Forall foralli(node);
+      IndexVar i = foralli.getIndexVar();
+      cout << "going through var: " << i << endl;
+
+      // Rewrite the body first so the nest is rebuilt from the inside out.
+      stmt = rewrite(foralli.getStmt());
+      cout << "after rewrite statement: " << stmt << endl;
+
+      // omit the index variables in the fusedVar list
+      if (find(fusedVars.begin(), fusedVars.end(), i) == fusedVars.end() &&
+          find(producerConsumerVars.begin(), producerConsumerVars.end(), i) != producerConsumerVars.end()) {
+        stmt = forall(i, stmt, forallParallelUnit.at(i), forallOutputRaceStrategy.at(i), foralli.getUnrollFactor());
+      }
+    }
+
+    // Every assignment reached is replaced by innerBody.
+    void visit (const AssignmentNode* node) {
+      cout << "assignment node: " << node << endl;
+      stmt = innerBody;
+      cout << "producerStmt: " << innerBody << endl;
+      cout << "stmt: " << stmt << endl;
+    }
+
+  };
+  ProducerConsumerRewriter producerRewriter(producerVars, fusedVars,
+              producerInnerBody,
+              sortedIndexVars.forallParallelUnit,
+              sortedIndexVars.forallOutputRaceStrategy);
+  IndexStmt producerStmt = producerRewriter.rewrite(stmt);
+  std::cout << "\nAfter Producer rewriter\n";
+  std::cout << producerStmt << std::endl;
+  // A producer that still multiplies more than two operands is split again.
+  if (num_muls > 1) {
+    producerStmt = loopFusionOverFission(producerStmt, producerInnerBody,
+      side, iters-1);
+  }
+
+
+  ProducerConsumerRewriter consumerRewriter(consumerVars, fusedVars,
+              consumerInnerBody,
+              sortedIndexVars.forallParallelUnit,
+              sortedIndexVars.forallOutputRaceStrategy);
+  IndexStmt consumerStmt = consumerRewriter.rewrite(stmt);
+  std::cout << "\nAfter Consumer rewriter\n";
+  std::cout << consumerStmt << std::endl;
+
+
+  // Keeps the fused loops of the original nest and replaces everything below
+  // them by where(consumerStmt, producerStmt).
+  struct CombineProducerConsumerRewriter : public IndexNotationRewriter {
+
+    const vector<IndexVar>& fusedVars;
+    IndexStmt consumerStmt;
+    IndexStmt producerStmt;
+    const map <IndexVar, ParallelUnit> forallParallelUnit;
+    const map <IndexVar, OutputRaceStrategy> forallOutputRaceStrategy;
+
+    CombineProducerConsumerRewriter(const vector<IndexVar>& fusedVars,
+      IndexStmt producerStmt, IndexStmt consumerStmt,
+      const map <IndexVar, ParallelUnit> forallParallelUnit,
+      const map <IndexVar, OutputRaceStrategy> forallOutputRaceStrategy)
+      : fusedVars(fusedVars), consumerStmt(consumerStmt), producerStmt(producerStmt),
+      forallParallelUnit(forallParallelUnit),
+      forallOutputRaceStrategy(forallOutputRaceStrategy) {}
+
+    using IndexNotationRewriter::visit;
+
+    void visit(const ForallNode* node) {
+      Forall foralli(node);
+      IndexVar i = foralli.getIndexVar();
+      cout << "going through var: " << i << endl;
+
+      // fused loops are kept; the first non-fused loop becomes the where
+      if (find(fusedVars.begin(), fusedVars.end(), i) != fusedVars.end()) {
+        cout << "fused var in stmt\n";
+        stmt = rewrite(foralli.getStmt());
+        cout << "rewritten stmt: " << stmt << endl;
+        stmt = forall(i, stmt, forallParallelUnit.at(i), forallOutputRaceStrategy.at(i), foralli.getUnrollFactor());
+      }
+      else {
+        cout << "fused var not in  stmt\n";
+        cout << "producerStmt: " << producerStmt << endl;
+        cout << "consumerStmt: " << consumerStmt << endl;
+        stmt = where(consumerStmt, producerStmt);
+        cout << "where stmt: " << stmt << endl;
+      }
+
+      cout << "after rewrite statement: " << stmt << endl;
+    }
+
+  };
+
+  CombineProducerConsumerRewriter combineRewriter(fusedVars,
+              producerStmt, consumerStmt,
+              sortedIndexVars.forallParallelUnit,
+              sortedIndexVars.forallOutputRaceStrategy);
+  IndexStmt combinedStmt = combineRewriter.rewrite(stmt);
+  std::cout << "\nAfter Combine rewriter\n";
+  std::cout << combinedStmt << std::endl;
+
+
+  return combinedStmt;
+
+}
+
 
 IndexStmt reorderLoopsTopologically(IndexStmt stmt) {
+  std::cout << "executing reorderLoopsTopologically\n";
   // Collect tensorLevelVars which stores the pairs of IndexVar and tensor
   // level that each tensor is accessed at
   struct DAGBuilder : public IndexNotationVisitor {
@@ -1382,8 +2081,11 @@ IndexStmt reorderLoopsTopologically(IndexStmt stmt) {
   };
 
   Iterators iterators(stmt);
+  std::cout << "DAG builder with iterators" << std::endl;
   DAGBuilder dagBuilder(iterators);
   stmt.accept(&dagBuilder);
+  std::cout << "After DAGBuilder\n";
+  std::cout << stmt << std::endl;
 
   // Construct tensor dependencies (sorted list of IndexVars) from tensorLevelVars
   map<string, vector<pair<IndexVar, bool>>> tensorVarOrders;
@@ -1391,6 +2093,7 @@ IndexStmt reorderLoopsTopologically(IndexStmt stmt) {
     tensorVarOrders[tensorLevelVar.first] = 
         varOrderFromTensorLevels(tensorLevelVar.second);
   }
+  // hard dependencies
   const auto hardDeps = depsFromVarOrders(tensorVarOrders);
 
   struct CollectSoftDependencies : public IndexNotationVisitor {
@@ -1412,12 +2115,17 @@ IndexStmt reorderLoopsTopologically(IndexStmt stmt) {
       }
     }
   };
+  // soft dependencies
   CollectSoftDependencies collectSoftDeps;
   stmt.accept(&collectSoftDeps);
+  std::cout << "After CollectSoftDependencies\n";
+  std::cout << stmt << std::endl;
 
+  // topological sort
   const auto sortedVars = topologicallySort(hardDeps, collectSoftDeps.softDeps, 
                                             dagBuilder.indexVarOriginalOrder);
 
+  // rewrite indexstmt
   // Reorder Foralls use a rewriter in case new nodes introduced outside of Forall
   struct TopoReorderRewriter : public IndexNotationRewriter {
     using IndexNotationRewriter::visit;
@@ -1440,7 +2148,9 @@ IndexStmt reorderLoopsTopologically(IndexStmt stmt) {
 
       // first forall must be in collected variables
       taco_iassert(util::contains(sortedVars, i));
+      std::cout << "\ninner body of the statement\n" << innerBody;
       stmt = innerBody;
+      // done in reverse order?
       for (auto it = sortedVars.rbegin(); it != sortedVars.rend(); ++it) {
         stmt = forall(*it, stmt, forallParallelUnit.at(*it), forallOutputRaceStrategy.at(*it), foralli.getUnrollFactor());
       }
@@ -1450,7 +2160,11 @@ IndexStmt reorderLoopsTopologically(IndexStmt stmt) {
   };
   TopoReorderRewriter rewriter(sortedVars, dagBuilder.innerBody, 
                                dagBuilder.forallParallelUnit, dagBuilder.forallOutputRaceStrategy);
-  return rewriter.rewrite(stmt);
+  IndexStmt stmtChanged = rewriter.rewrite(stmt);
+  std::cout << "After TopoReorderRewriter\n";
+  std::cout << stmtChanged << std::endl;
+
+  return stmtChanged;
 }
 
 IndexStmt scalarPromote(IndexStmt stmt, ProvenanceGraph provGraph, 
@@ -1478,6 +2192,7 @@ IndexStmt scalarPromote(IndexStmt stmt, ProvenanceGraph provGraph,
 
     void visit(const ForallNode* node) {
       Forall foralli(node);
+      std::cout << "scalar promote: " << foralli << std::endl;
       IndexVar i = foralli.getIndexVar();
 
       // Don't allow hoisting out of forall's for GPU warp and block reduction
diff --git a/src/ir/ir_printer.cpp b/src/ir/ir_printer.cpp
index a1997a9b7..eddca3f29 100644
--- a/src/ir/ir_printer.cpp
+++ b/src/ir/ir_printer.cpp
@@ -1,6 +1,7 @@
 #include <sstream>
 #include <iostream>
 
+#include "taco/cuda.h"
 #include "taco/ir/ir.h"
 #include "taco/ir/ir_printer.h"
 #include "taco/ir/simplify.h"
@@ -34,7 +35,11 @@ IRPrinter::IRPrinter(ostream &s) : IRPrinter(s, false, false) {
 }
 
 IRPrinter::IRPrinter(ostream &s, bool color, bool simplify)
-    : stream(s), indent(0), color(color), simplify(simplify) {
+    : stream(s), stream2(s), indent(0), color(color), simplify(simplify) {
+}
+
+IRPrinter::IRPrinter(ostream &s, ostream &s2, bool color, bool simplify)
+    : stream(s), stream2(s2), indent(0), color(color), simplify(simplify) {
 }
 
 IRPrinter::~IRPrinter() {
@@ -59,79 +64,169 @@ void IRPrinter::print(Stmt stmt) {
 }
 
 void IRPrinter::visit(const Literal* op) {
-  if (color) {
-    stream << blue ;
-  }
-
-  switch (op->type.getKind()) {
-    case Datatype::Bool:
-      stream << op->getValue<bool>();
-    break;
-    case Datatype::UInt8:
-      stream << static_cast<uint16_t>(op->getValue<uint8_t>());
-    break;
-    case Datatype::UInt16:
-      stream << op->getValue<uint16_t>();
-    break;
-    case Datatype::UInt32:
-      stream << op->getValue<uint32_t>();
-    break;
-    case Datatype::UInt64:
-      stream << op->getValue<uint64_t>();
-    break;
-    case Datatype::UInt128:
-      taco_not_supported_yet;
-    break;
-    case Datatype::Int8:
-      stream << static_cast<int16_t>(op->getValue<int8_t>());
-    break;
-    case Datatype::Int16:
-      stream << op->getValue<int16_t>();
-    break;
-    case Datatype::Int32:
-      stream << op->getValue<int32_t>();
-    break;
-    case Datatype::Int64:
-      stream << op->getValue<int64_t>();
-    break;
-    case Datatype::Int128:
-      taco_not_supported_yet;
-    break;
-    case Datatype::Float32:
-      stream << ((op->getValue<float>() != 0.0)
-                 ? util::toString(op->getValue<float>()) : "0.0");
-    break;
-    case Datatype::Float64:
-      stream << ((op->getValue<double>()!=0.0)
-                 ? util::toString(op->getValue<double>()) : "0.0");
-    break;
-    case Datatype::Complex64: {
-      std::complex<float> val = op->getValue<std::complex<float>>();
-      stream << val.real() << " + I*" << val.imag();
-    }
-    break;
-    case Datatype::Complex128: {
-      std::complex<double> val = op->getValue<std::complex<double>>();
-      stream << val.real() << " + I*" << val.imag();
+  if (is_ISPC_code_stream_enabled()) {
+    if (color) {
+        stream2 << blue ;
+      }
+
+      // ISPC code stream enabled: print the literal to stream2, formatted by its datatype.
+      // NOTE(review): this switch duplicates the one in the else branch below; only the target stream differs.
+      switch (op->type.getKind()) {
+        case Datatype::Bool:
+          stream2 << op->getValue<bool>();
+        break;
+        case Datatype::UInt8:
+          stream2 << static_cast<uint16_t>(op->getValue<uint8_t>());
+        break;
+        case Datatype::UInt16:
+          stream2 << op->getValue<uint16_t>();
+        break;
+        case Datatype::UInt32:
+          stream2 << op->getValue<uint32_t>();
+        break;
+        case Datatype::UInt64:
+          stream2 << op->getValue<uint64_t>();
+        break;
+        case Datatype::UInt128:
+          taco_not_supported_yet;
+        break;
+        case Datatype::Int8:
+          stream2 << static_cast<int16_t>(op->getValue<int8_t>());
+        break;
+        case Datatype::Int16:
+          stream2 << op->getValue<int16_t>();
+        break;
+        case Datatype::Int32:
+          stream2 << op->getValue<int32_t>();
+        break;
+        case Datatype::Int64:
+          stream2 << op->getValue<int64_t>();
+        break;
+        case Datatype::Int128:
+          taco_not_supported_yet;
+        break;
+        case Datatype::Float32:
+          stream2 << ((op->getValue<float>() != 0.0)
+                    ? util::toString(op->getValue<float>()) : "0.0");
+        break;
+        case Datatype::Float64:
+          stream2 << ((op->getValue<double>()!=0.0)
+                    ? util::toString(op->getValue<double>()) : "0.0");
+        break;
+        case Datatype::Complex64: {
+          std::complex<float> val = op->getValue<std::complex<float>>();
+          stream2 << val.real() << " + I*" << val.imag();
+        }
+        break;
+        case Datatype::Complex128: {
+          std::complex<double> val = op->getValue<std::complex<double>>();
+          stream2 << val.real() << " + I*" << val.imag();
+        }
+        break;
+        case Datatype::Undefined:
+          taco_ierror << "Undefined type in IR";
+        break;
+      }
+
+      if (color) {
+        stream2 << nc;
+      }
     }
-    break;
-    case Datatype::Undefined:
-      taco_ierror << "Undefined type in IR";
-    break;
-  }
 
-  if (color) {
-    stream << nc;
+
+
+  else {
+
+    if (color) {
+        stream << blue ;
+      }
+
+      // ISPC code stream disabled: same formatting as the branch above, written to stream.
+      // 8-bit integers are cast to 16-bit types before printing (presumably so they print as numbers, not characters).
+      switch (op->type.getKind()) {
+        case Datatype::Bool:
+          stream << op->getValue<bool>();
+        break;
+        case Datatype::UInt8:
+          stream << static_cast<uint16_t>(op->getValue<uint8_t>());
+        break;
+        case Datatype::UInt16:
+          stream << op->getValue<uint16_t>();
+        break;
+        case Datatype::UInt32:
+          stream << op->getValue<uint32_t>();
+        break;
+        case Datatype::UInt64:
+          stream << op->getValue<uint64_t>();
+        break;
+        case Datatype::UInt128:
+          taco_not_supported_yet;
+        break;
+        case Datatype::Int8:
+          stream << static_cast<int16_t>(op->getValue<int8_t>());
+        break;
+        case Datatype::Int16:
+          stream << op->getValue<int16_t>();
+        break;
+        case Datatype::Int32:
+          stream << op->getValue<int32_t>();
+        break;
+        case Datatype::Int64:
+          stream << op->getValue<int64_t>();
+        break;
+        case Datatype::Int128:
+          taco_not_supported_yet;
+        break;
+        case Datatype::Float32:
+          stream << ((op->getValue<float>() != 0.0)
+                    ? util::toString(op->getValue<float>()) : "0.0");
+        break;
+        case Datatype::Float64:
+          stream << ((op->getValue<double>()!=0.0)
+                    ? util::toString(op->getValue<double>()) : "0.0");
+        break;
+        case Datatype::Complex64: {
+          std::complex<float> val = op->getValue<std::complex<float>>();
+          stream << val.real() << " + I*" << val.imag();
+        }
+        break;
+        case Datatype::Complex128: {
+          std::complex<double> val = op->getValue<std::complex<double>>();
+          stream << val.real() << " + I*" << val.imag();
+        }
+        break;
+        case Datatype::Undefined:
+          taco_ierror << "Undefined type in IR";
+        break;
+      }
+
+      if (color) {
+        stream << nc;
+      }
+
+    
   }
+  
 }
 
 void IRPrinter::visit(const Var* op) {
-  if (varNames.contains(op)) {
-    stream << varNames.get(op);
+  if (is_ISPC_code_stream_enabled()) { // ISPC code stream enabled: the name is written to stream2
+    if (varNames.contains(op)) {
+      stream2 << varNames.get(op); // name recorded in varNames for this variable
+    }
+    else {
+      stream2 << op->name; // no entry in varNames: use the name stored in the node
+    }
   }
   else {
-    stream << op->name;
+    if (varNames.contains(op)) { // same lookup as above, written to stream
+      stream << varNames.get(op);
+    }
+    else {
+      stream << op->name;
+    }
   }
+
 }
 
 void IRPrinter::visit(const Neg* op) {
@@ -238,51 +333,60 @@ void IRPrinter::visit(const Cast* op) {
 }
 
 void IRPrinter::visit(const Call* op) {
-  stream << op->func << "(";
+  // Prints a call expression. Functions statically added to the ispc file
+  // carry a "__" prefix, so ISPC output (stream2) prepends it to the name.
+  const bool ispc = is_ISPC_code_stream_enabled();
+  std::ostream& out = ispc ? stream2 : stream;
+  if (ispc) {
+    out << "__";
+  }
+  out << op->func << "(";
   parentPrecedence = Precedence::CALL;
-  acceptJoin(this, stream, op->args, ", ");
-  stream << ")";
+  acceptJoin(this, out, op->args, ", ");
+  out << ")";
 }
 
 void IRPrinter::visit(const IfThenElse* op) {
   taco_iassert(op->cond.defined());
   taco_iassert(op->then.defined());
   doIndent();
-  stream << keywordString("if ");
-  stream << "(";
+  // Both targets print identical text; only the destination stream differs.
+  std::ostream& out = is_ISPC_code_stream_enabled() ? stream2 : stream;
+  out << keywordString("if ");
+  out << "(";
   parentPrecedence = Precedence::TOP;
   op->cond.accept(this);
-  stream << ")";
+  out << ")";
 
   Stmt scopedStmt = Stmt(to<Scope>(op->then)->scopedStmt);
   if (isa<Block>(scopedStmt)) {
-    stream << " {" << endl;
+    out << " {" << endl;
     op->then.accept(this);
     doIndent();
-    stream << "}";
+    out << "}";
   }
   else if (isa<Assign>(scopedStmt)) {
     int tmp = indent;
     indent = 0;
-    stream << " ";
+    out << " ";
     scopedStmt.accept(this);
     indent = tmp;
   }
   else {
-    stream << endl;
+    out << endl;
     op->then.accept(this);
   }
 
   if (op->otherwise.defined()) {
-    stream << "\n";
+    out << "\n";
     doIndent();
-    stream << keywordString("else");
-    stream << " {\n";
+    out << keywordString("else");
+    out << " {\n";
     op->otherwise.accept(this);
     doIndent();
-    stream << "}";
+    out << "}";
   }
-  stream << endl;
+  out << endl;
 }
 
 void IRPrinter::visit(const Case* op) {
@@ -345,12 +490,14 @@ void IRPrinter::visit(const Switch* op) {
 }
 
 void IRPrinter::visit(const Load* op) {
+  // Prints `arr[loc]` on the currently active stream (stream2 for ISPC).
+  std::ostream& out = is_ISPC_code_stream_enabled() ? stream2 : stream;
   parentPrecedence = Precedence::LOAD;
   op->arr.accept(this);
-  stream << "[";
+  out << "[";
   parentPrecedence = Precedence::LOAD;
   op->loc.accept(this);
-  stream << "]";
+  out << "]";
 }
 
 void IRPrinter::visit(const Malloc* op) {
@@ -367,66 +522,80 @@ void IRPrinter::visit(const Sizeof* op) {
 }
 
 void IRPrinter::visit(const Store* op) {
+  // Prints `arr[loc] = data;` on its own line on the active stream.
+  std::ostream& out = is_ISPC_code_stream_enabled() ? stream2 : stream;
   doIndent();
   op->arr.accept(this);
-  stream << "[";
+  out << "[";
   parentPrecedence = Precedence::TOP;
   op->loc.accept(this);
-  stream << "] = ";
+  out << "] = ";
   parentPrecedence = Precedence::TOP;
   op->data.accept(this);
-  stream << ";";
-  stream << endl;
+  out << ";";
+  out << endl;
 }
 
 void IRPrinter::visit(const For* op) {
+  // Prints `for (type var = start; var < end; var++ or var += inc) {...}`.
+  std::ostream& out = is_ISPC_code_stream_enabled() ? stream2 : stream;
   doIndent();
-  stream << keywordString("for") << " (" 
-         << keywordString(util::toString(op->var.type())) << " ";
+  out << keywordString("for") << " ("
+      << keywordString(util::toString(op->var.type())) << " ";
   op->var.accept(this);
-  stream << " = ";
+  out << " = ";
   op->start.accept(this);
-  stream << keywordString("; ");
+  out << keywordString("; ");
   op->var.accept(this);
-  stream << " < ";
+  out << " < ";
   parentPrecedence = BOTTOM;
   op->end.accept(this);
-  stream << keywordString("; ");
+  out << keywordString("; ");
   op->var.accept(this);
 
   auto lit = op->increment.as<Literal>();
   if (lit != nullptr && ((lit->type.isInt()  && lit->equalsScalar(1)) ||
                          (lit->type.isUInt() && lit->equalsScalar(1)))) {
-    stream << "++";
+    out << "++";
   }
   else {
-    stream << " += ";
+    out << " += ";
     op->increment.accept(this);
   }
-  stream << ") {\n";
+  out << ") {\n";
 
   op->contents.accept(this);
   doIndent();
-  stream << "}";
-  stream << endl;
+  out << "}";
+  out << endl;
 }
 
+void IRPrinter::sendToStream(std::stringstream &stream) {
+  // The parameter `stream` shadows the member of the same name; its buffered
+  // text is appended to whichever member stream is currently active.
+  std::ostream& out = is_ISPC_code_stream_enabled() ? this->stream2 : this->stream;
+  out << stream.str();
+}
+
 void IRPrinter::visit(const While* op) {
+  // Prints `while (cond) {...}` on the active stream.
+  std::ostream& out = is_ISPC_code_stream_enabled() ? stream2 : stream;
   doIndent();
-  stream << keywordString("while ");
-  stream << "(";
+  out << keywordString("while ");
+  out << "(";
   parentPrecedence = Precedence::TOP;
   op->cond.accept(this);
-  stream << ")";
-  stream << " {\n";
+  out << ")";
+  out << " {\n";
   op->contents.accept(this);
   doIndent();
-  stream << "}";
-  stream << endl;
+  out << "}";
+  out << endl;
 }
 
 void IRPrinter::visit(const Block* op) {
-  acceptJoin(this, stream, op->contents, "");
+  std::ostream& out = is_ISPC_code_stream_enabled() ? stream2 : stream;
+  acceptJoin(this, out, op->contents, "");
 }
 
 void IRPrinter::visit(const Scope* op) {
@@ -438,85 +676,104 @@ void IRPrinter::visit(const Scope* op) {
 }
 
 void IRPrinter::visit(const Function* op) {
-  stream << keywordString("void ") << op->name;
-  stream << "(";
-  if (op->outputs.size() > 0) stream << "Tensor ";
-  acceptJoin(this, stream, op->outputs, ", Tensor ");
-  if (op->outputs.size() > 0 && op->inputs.size()) stream << ", ";
-  if (op->inputs.size() > 0) stream << "Tensor ";
-  acceptJoin(this, stream, op->inputs, ", Tensor ");
-  stream << ") {" << endl;
+  // Prints `void name(Tensor out..., Tensor in...) { body }`.
+  std::ostream& out = is_ISPC_code_stream_enabled() ? stream2 : stream;
+  out << keywordString("void ") << op->name;
+  out << "(";
+  if (op->outputs.size() > 0) out << "Tensor ";
+  acceptJoin(this, out, op->outputs, ", Tensor ");
+  if (op->outputs.size() > 0 && op->inputs.size()) out << ", ";
+  if (op->inputs.size() > 0) out << "Tensor ";
+  acceptJoin(this, out, op->inputs, ", Tensor ");
+  out << ") {" << endl;
 
   resetNameCounters();
   op->body.accept(this);
 
   doIndent();
-  stream << "}";
+  out << "}";
 }
 
 void IRPrinter::visit(const VarDecl* op) {
+  // Prints `type[*] name = rhs;` and registers a unique name for the variable.
+  const bool ispc = is_ISPC_code_stream_enabled();
+  std::ostream& out = ispc ? stream2 : stream;
   doIndent();
-  stream << keywordString(util::toString(op->var.type()));
+  // ISPC output spells the Int32/Int64 types as `int32`/`int64`.
+  if (ispc && op->var.type() == Int32) {
+    out << keywordString("int32");
+  }
+  else if (ispc && op->var.type() == Int64) {
+    out << keywordString("int64");
+  }
+  else {
+    out << keywordString(util::toString(op->var.type()));
+  }
   taco_iassert(isa<Var>(op->var));
   if (to<Var>(op->var)->is_ptr) {
-    stream << "* restrict";
+    // The `restrict` qualifier is emitted for non-ISPC output only.
+    out << (ispc ? "* " : "* restrict");
   }
-  stream << " ";
+  out << " ";
   string varName = varNameGenerator.getUniqueName(util::toString(op->var));
   varNames.insert({op->var, varName});
   op->var.accept(this);
   parentPrecedence = Precedence::TOP;
-  stream << " = ";
+  out << " = ";
   op->rhs.accept(this);
-  stream << ";";
-  stream << endl;
+  out << ";";
+  out << endl;
 }
 
 void IRPrinter::visit(const Assign* op) {
+  // Prints `lhs = rhs;` (or the `++`, `+=`, `*=`, `|=` short forms when
+  // `simplify` is set). ISPC output previously printed nothing here, which
+  // silently dropped the statement; it now goes to stream2 like all others.
+  std::ostream& out = is_ISPC_code_stream_enabled() ? stream2 : stream;
   doIndent();
   op->lhs.accept(this);
   parentPrecedence = Precedence::TOP;
   bool printed = false;
   if (simplify) {
     if (isa<ir::Add>(op->rhs)) {
       auto add = to<Add>(op->rhs);
       if (add->a == op->lhs) {
         const Literal* lit = add->b.as<Literal>();
         if (lit != nullptr && ((lit->type.isInt()  && lit->equalsScalar(1)) ||
                                (lit->type.isUInt() && lit->equalsScalar(1)))) {
-          stream << "++";
+          out << "++";
         }
         else {
-          stream << " += ";
+          out << " += ";
           add->b.accept(this);
         }
         printed = true;
       }
     }
     else if (isa<Mul>(op->rhs)) {
       auto mul = to<Mul>(op->rhs);
       if (mul->a == op->lhs) {
-        stream << " *= ";
+        out << " *= ";
         mul->b.accept(this);
         printed = true;
       }
     }
     else if (isa<BitOr>(op->rhs)) {
       auto bitOr = to<BitOr>(op->rhs);
       if (bitOr->a == op->lhs) {
-        stream << " |= ";
+        out << " |= ";
         bitOr->b.accept(this);
         printed = true;
       }
     }
   }
   if (!printed) {
-    stream << " = ";
+    out << " = ";
     op->rhs.accept(this);
   }
 
-  stream << ";";
-  stream << endl;
+  out << ";";
+  out << endl;
 }
 
 void IRPrinter::visit(const Yield* op) {
@@ -544,12 +837,15 @@ void IRPrinter::visit(const Allocate* op) {
 }
 
 void IRPrinter::visit(const Free* op) {
+  // ISPC output releases memory with `delete[] x;`, C output with `free(x);`.
+  const bool ispc = is_ISPC_code_stream_enabled();
+  std::ostream& out = ispc ? stream2 : stream;
   doIndent();
-  stream << "free(";
+  out << (ispc ? "delete[] " : "free(");
   parentPrecedence = Precedence::TOP;
   op->var.accept(this);
-  stream << ");";
-  stream << endl;
+  out << (ispc ? ";" : ");");
+  out << endl;
 }
 
 void IRPrinter::visit(const Comment* op) {
@@ -559,17 +862,21 @@ void IRPrinter::visit(const Comment* op) {
 }
 
 void IRPrinter::visit(const BlankLine*) {
-  stream << endl;
+  // Emits an empty line on the currently active stream.
+  std::ostream& out = is_ISPC_code_stream_enabled() ? stream2 : stream;
+  out << endl;
 }
 
 void IRPrinter::visit(const Continue*) {
   doIndent();
-  stream << "continue;" << endl;
+  std::ostream& out = is_ISPC_code_stream_enabled() ? stream2 : stream;
+  out << "continue;" << endl;
 }
 
 void IRPrinter::visit(const Break*) {
   doIndent();
-  stream << "break;" << endl;
+  std::ostream& out = is_ISPC_code_stream_enabled() ? stream2 : stream;
+  out << "break;" << endl;
 }
 
 void IRPrinter::visit(const Print* op) {
@@ -585,7 +903,9 @@ void IRPrinter::visit(const Print* op) {
 }
 
 void IRPrinter::visit(const GetProperty* op) {
-  stream << op->name;
+  // Prints the property's name on the currently active stream.
+  std::ostream& out = is_ISPC_code_stream_enabled() ? stream2 : stream;
+  out << op->name;
 }
 
 void IRPrinter::visit(const Sort* op) {
@@ -643,23 +966,27 @@ void IRPrinter::resetNameCounters() {
 }
 
 void IRPrinter::doIndent() {
+  // Writes two spaces per indentation level to the active stream.
+  std::ostream& out = is_ISPC_code_stream_enabled() ? stream2 : stream;
   for (int i=0; i<indent; i++)
-    stream << "  ";
+    out << "  ";
 }
 
 void IRPrinter::printBinOp(Expr a, Expr b, string op, Precedence precedence) {
+  // Prints `a op b`, parenthesized when needsParentheses(precedence) says so.
+  std::ostream& out = is_ISPC_code_stream_enabled() ? stream2 : stream;
   bool parenthesize = needsParentheses(precedence);
   if (parenthesize) {
-    stream << "(";
+    out << "(";
   }
   parentPrecedence = precedence;
   a.accept(this);
-  stream << " " << op << " ";
+  out << " " << op << " ";
   parentPrecedence = precedence;
   b.accept(this);
   if (parenthesize) {
-    stream << ")";
+    out << ")";
   }
 }
 
 bool IRPrinter::needsParentheses(Precedence precedence) {
diff --git a/src/ir/ir_rewriter.cpp b/src/ir/ir_rewriter.cpp
index eed6f2bab..2e4827497 100644
--- a/src/ir/ir_rewriter.cpp
+++ b/src/ir/ir_rewriter.cpp
@@ -292,6 +292,7 @@ void IRRewriter::visit(const Store* op) {
 }
 
 void IRRewriter::visit(const For* op) {
+  // Rewrite each component of the loop (var, start, end, ...) through this rewriter.
   Expr var       = rewrite(op->var);
   Expr start     = rewrite(op->start);
   Expr end       = rewrite(op->end);
diff --git a/src/ir_tags.cpp b/src/ir_tags.cpp
index af3dbd775..e7365d6c2 100644
--- a/src/ir_tags.cpp
+++ b/src/ir_tags.cpp
@@ -2,7 +2,13 @@
 
 namespace taco {
 
-const char *ParallelUnit_NAMES[] = {"NotParallel", "DefaultUnit", "GPUBlock", "GPUWarp", "GPUThread", "CPUThread", "CPUVector", "CPUThreadGroupReduction", "GPUBlockReduction", "GPUWarpReduction"};
+// Printable names of the parallel units, now including CPUSimd and CPUSpmd.
+// NOTE(review): presumably indexed by the ParallelUnit enum value - keep order.
+const char *ParallelUnit_NAMES[] = {
+  "NotParallel", "DefaultUnit", "GPUBlock", "GPUWarp", "GPUThread",
+  "CPUThread", "CPUVector", "CPUThreadGroupReduction", "GPUBlockReduction",
+  "GPUWarpReduction", "CPUSimd", "CPUSpmd"
+};
 const char *OutputRaceStrategy_NAMES[] = {"IgnoreRaces", "NoRaces", "Atomics", "Temporary", "ParallelReduction"};
 const char *BoundType_NAMES[] = {"MinExact", "MinConstraint", "MaxExact", "MaxConstraint"};
 const char *AssembleStrategy_NAMES[] = {"Append", "Insert"};
diff --git a/src/lower/iteration_graph.cpp b/src/lower/iteration_graph.cpp
index 77735a8d2..482d84aae 100644
--- a/src/lower/iteration_graph.cpp
+++ b/src/lower/iteration_graph.cpp
@@ -48,6 +48,8 @@ struct IterationGraph::Content {
 IterationGraph::IterationGraph() {
 }
 
+// remember that iteration graph does not have an ordering
+// I got the ordering from topologically reorder index Ryan wrote
 IterationGraph IterationGraph::make(Assignment assignment) {
   TensorVar tensor = assignment.getLhs().getTensorVar();
   IndexExpr expr = assignment.getRhs();
@@ -64,8 +66,9 @@ IterationGraph IterationGraph::make(Assignment assignment) {
     oldToSplitVar.insert({indexVar, indexVar});
   }
 
+  // Visit every tensor access on the right-hand side of the assignment.
   match(expr,
     function<void(const AccessNode*)>([&](const AccessNode* op) {
       auto type = op->tensorVar.getType();
       taco_iassert((size_t)type.getShape().getOrder() == op->indexVars.size())
           << "Tensor access " << IndexExpr(op) << " but tensor format only has "
diff --git a/src/lower/iterator.cpp b/src/lower/iterator.cpp
index 0f0c024c5..eb3d8ac3b 100644
--- a/src/lower/iterator.cpp
+++ b/src/lower/iterator.cpp
@@ -569,6 +569,9 @@ void Iterators::createAccessIterators(Access access, Format format, Expr tensorI
                                       ProvenanceGraph provGraph,
                                       const map<TensorVar, Expr> &tensorVars) {
   TensorVar tensorConcrete = access.getTensorVar();
+  // The accessed tensor and the format describing its storage must have the
+  // same order. A mismatch is reported by the assertion below, which already
+  // prints the tensor and the format, so nothing is logged on the happy path.
   taco_iassert(tensorConcrete.getOrder() == format.getOrder())
       << tensorConcrete << ", Format" << format;
   Shape shape = tensorConcrete.getType().getShape();
diff --git a/src/lower/lowerer_impl_imperative.cpp b/src/lower/lowerer_impl_imperative.cpp
index b4c9ea710..cce8f2166 100644
--- a/src/lower/lowerer_impl_imperative.cpp
+++ b/src/lower/lowerer_impl_imperative.cpp
@@ -1,4 +1,6 @@
 #include <taco/lower/mode_format_compressed.h>
+#include "taco/cuda.h"
+#include "taco/ir_tags.h"
 #include "taco/lower/lowerer_impl_imperative.h"
 #include "taco/lower/lowerer_impl.h"
 
@@ -26,6 +28,7 @@ class LowererImplImperative::Visitor : public IndexNotationVisitorStrict {
 public:
   Visitor(LowererImplImperative* impl) : impl(impl) {}
   Stmt lower(IndexStmt stmt) {
+    // Clear the previous result and call accessibleIterators.scope() before visiting.
     this->stmt = Stmt();
     impl->accessibleIterators.scope();
     IndexStmtVisitorStrict::visit(stmt);
@@ -200,6 +203,7 @@ static std::set<Expr> hasSparseInserts(IndexStmt stmt, Iterators iterators,
   return ret;
 }
 
+
 Stmt
 LowererImplImperative::lower(IndexStmt stmt, string name,
                    bool assemble, bool compute, bool pack, bool unpack)
@@ -414,6 +418,7 @@ LowererImplImperative::lower(IndexStmt stmt, string name,
 
 Stmt LowererImplImperative::lowerAssignment(Assignment assignment)
 {
+  // An assignment is only lowered when assemble or compute code is being generated.
   taco_iassert(generateAssembleCode() || generateComputeCode());
 
   Stmt computeStmt;
@@ -421,7 +426,7 @@ Stmt LowererImplImperative::lowerAssignment(Assignment assignment)
   Expr var = getTensorVar(result);
 
   const bool needComputeAssign = util::contains(needCompute, result);
-
+  // The right-hand side is lowered only when the result needs a compute assignment.
   Expr rhs;
   if (needComputeAssign) {
     rhs = lower(assignment.getRhs());
@@ -429,20 +434,51 @@ Stmt LowererImplImperative::lowerAssignment(Assignment assignment)
 
   // Assignment to scalar variables.
   if (isScalar(result.getType())) {
+    // Scalars are updated through a plain or a compound assignment to `var`.
     if (needComputeAssign) {
+      // No operator: plain assignment. Otherwise the operator must be Add.
       if (!assignment.getOperator().defined()) {
+        // Plain assignment: store the lowered right-hand side in the
+        // scalar variable.
         computeStmt = Assign::make(var, rhs);
       }
       else {
         taco_iassert(isa<taco::Add>(assignment.getOperator()));
-        bool useAtomics = markAssignsAtomicDepth > 0 &&
-                          !util::contains(whereTemps, result);
+        // A where-temporary normally needs no atomics. The exception is a
+        // temporary updated inside a CPUSimd loop that is deeper than the
+        // depth recorded for it: that is treated as a reduction.
+        //
+        // `result` may have no entry in whereTempsWithLoopDepth, in which case
+        // find() returns end() and dereferencing it is undefined behaviour.
+        // Fall back to the current depth then: the range test below becomes
+        // empty, so no reduction is reported for unregistered tensors.
+        const auto tempDepthIt = whereTempsWithLoopDepth.find(result);
+        const int tempVarInitLoopDepth =
+            (tempDepthIt != whereTempsWithLoopDepth.end()) ? tempDepthIt->second
+                                                           : loopDepth;
+
+        bool reduction = false;
+        for (const auto& forUnit : forUnits) {
+          if (forUnit.first <= loopDepth &&
+              forUnit.first > tempVarInitLoopDepth &&
+              forUnit.second == ParallelUnit::CPUSimd) {
+            reduction = true;
+            break;
+          }
+        }
+        bool useAtomics = markAssignsAtomicDepth > 0 &&
+                          (!util::contains(whereTemps, result) || reduction);
         computeStmt = compoundAssign(var, rhs, useAtomics, atomicParallelUnit);
+        // atomicParallelUnit names the parallel unit the atomic update is for.
       }
     }
+    else {
+      // No compute assignment needed: computeStmt is not assigned here.
+    }
   }
   // Assignments to tensor variables (non-scalar).
   else {
+    // Tensor results are written through their values array at `loc`.
     Expr values = getValuesArray(result);
     Expr loc = generateValueLocExpr(assignment.getLhs());
 
@@ -476,6 +512,7 @@ Stmt LowererImplImperative::lowerAssignment(Assignment assignment)
     }
 
     if (needComputeAssign && values.defined()) {
+      // Without an operator the value is stored directly (plain assignment).
       if (!assignment.getOperator().defined()) {
         computeStmt = Store::make(values, loc, rhs);
       }
@@ -586,19 +623,39 @@ LowererImplImperative::splitAppenderAndInserters(const vector<Iterator>& results
 }
 
 
+/// Lowers a forall statement to imperative loop IR.
+///
+/// While a forall is being lowered, `loopDepth` holds its nesting depth and
+/// `forUnits` maps that depth to the forall's parallel unit. Assignment
+/// lowering reads both when deciding whether a scalar update must be atomic.
+///
+/// @param forall  The forall statement to lower.
+/// @return The statement produced for the loop.
 Stmt LowererImplImperative::lowerForall(Forall forall)
 {
   bool hasExactBound = provGraph.hasExactBound(forall.getIndexVar());
   bool forallNeedsUnderivedGuards = !hasExactBound && emitUnderivedGuards;
+
+  // This path returns early, so it has to come before the bookkeeping below:
+  // registering the loop first would leave loopDepth incremented and a stale
+  // forUnits entry behind on return, skewing every later depth comparison.
+  // NOTE(review): lowerForallCloned is assumed to lower the forall through
+  // lowerForall again, which then registers the loop itself - confirm.
   if (!ignoreVectorize && forallNeedsUnderivedGuards &&
       (forall.getParallelUnit() == ParallelUnit::CPUVector ||
        forall.getUnrollFactor() > 0)) {
     return lowerForallCloned(forall);
   }
 
+  // Register this loop. The matching erase/decrement is performed further
+  // down in this function, just before the final return.
+  loopDepth++;
+  forUnits.insert(std::pair<int, ParallelUnit>(loopDepth, forall.getParallelUnit()));
+
+  // Count how many of the foralls currently being lowered are parallel.
   if (forall.getParallelUnit() != ParallelUnit::NotParallel) {
     inParallelLoopDepth++;
   }
 
   // Recover any available parents that were not recoverable previously
   vector<Stmt> recoverySteps;
@@ -786,19 +843,23 @@ Stmt LowererImplImperative::lowerForall(Forall forall)
     }
 
     if (!isWhereProducer && hasPosDescendant && underivedAncestors.size() > 1 && provGraph.isPosVariable(iterator.getIndexVar()) && posDescendant == forall.getIndexVar()) {
+      // Emit fused position iteration loop
       loops = lowerForallFusedPosition(forall, iterator, locators,
                                          inserters, appenders, reducedAccesses, recoveryStmt);
     }
     else if (canAccelWithSparseIteration) {
+      // Emit dense-acceleration loop (canAccelWithSparseIteration holds)
       loops = lowerForallDenseAcceleration(forall, locators, inserters, appenders, reducedAccesses, recoveryStmt);
     }
     // Emit dimension coordinate iteration loop
     else if (iterator.isDimensionIterator()) {
+      // The iterator is a dimension iterator; only the lattice point's locators are passed on.
       loops = lowerForallDimension(forall, point.locators(),
                                    inserters, appenders, reducedAccesses, recoveryStmt);
     }
     // Emit position iteration loop
     else if (iterator.hasPosIter()) {
+      // The iterator supports position iteration.
       loops = lowerForallPosition(forall, iterator, locators,
                                     inserters, appenders, reducedAccesses, recoveryStmt);
     }
@@ -816,6 +877,10 @@ Stmt LowererImplImperative::lowerForall(Forall forall)
     loops = lowerMergeLattice(lattice, underivedAncestors[0],
                               forall.getStmt(), reducedAccesses);
   }
+
+  // At this point `loops` holds whatever the selected lowering produced.
+  // NOTE(review): the definedness check below is commented out, so `loops`
+  // may be undefined here - confirm that later uses tolerate that.
 //  taco_iassert(loops.defined());
 
   if (!generateComputeCode() && !hasStores(loops)) {
@@ -832,6 +897,9 @@ Stmt LowererImplImperative::lowerForall(Forall forall)
     parallelUnitIndexVars.erase(forall.getParallelUnit());
     parallelUnitSizes.erase(forall.getParallelUnit());
   }
+  // Undo the per-loop bookkeeping (forUnits entry, loopDepth) set up earlier in this function.
+  forUnits.erase(loopDepth);
+  loopDepth--;
   return Block::blanks(preInitValues,
                        temporaryValuesInitFree[0],
                        loops,
@@ -1136,13 +1204,22 @@ Stmt LowererImplImperative::lowerForallDimension(Forall forall,
                                        set<Access> reducedAccesses,
                                        ir::Stmt recoveryStmt)
 {
+  // Lowers a forall over a dimension: the body is lowered first, then wrapped in a For loop.
+  // markAssignsAtomicDepth is raised below for parallel loops that use the Atomics strategy.
   Expr coordinate = getCoordinateVar(forall.getIndexVar());
 
   if (forall.getParallelUnit() != ParallelUnit::NotParallel && forall.getOutputRaceStrategy() == OutputRaceStrategy::Atomics) {
     markAssignsAtomicDepth++;
+    // Assignment lowering checks markAssignsAtomicDepth > 0 to decide on atomics;
+    // remember which parallel unit those atomics are for.
     atomicParallelUnit = forall.getParallelUnit();
   }
+  else {
+    // Not parallel, or a race strategy other than Atomics: nothing to record.
+  }
 
+  // Lower the loop body now, so that it is lowered with the atomic state
+  // set above.
   Stmt body = lowerForallBody(coordinate, forall.getStmt(),
                               locators, inserters, appenders, reducedAccesses);
 
@@ -1158,7 +1235,18 @@ Stmt LowererImplImperative::lowerForallDimension(Forall forall,
   std::vector<ir::Expr> bounds = provGraph.deriveIterBounds(forall.getIndexVar(), definedIndexVarsOrdered, underivedBounds, indexVarToExprMap, iterators);
 
   LoopKind kind = LoopKind::Serial;
-  if (forall.getParallelUnit() == ParallelUnit::CPUVector && !ignoreVectorize) {
+  if (should_use_ISPC_codegen()) {
+    // std::cout << "Foreach compatible loop\n";
+    if (forall.getParallelUnit() == ParallelUnit::CPUSimd) {
+      kind = LoopKind::Foreach;
+    }
+    else if (forall.getParallelUnit() == ParallelUnit::CPUSpmd 
+            && forall.getOutputRaceStrategy() != OutputRaceStrategy::ParallelReduction
+    ) {
+      kind = LoopKind::Mul_Thread;
+    }
+  } 
+  else if (forall.getParallelUnit() == ParallelUnit::CPUVector && !ignoreVectorize) {
     kind = LoopKind::Vectorized;
   }
   else if (forall.getParallelUnit() != ParallelUnit::NotParallel
@@ -1166,6 +1254,7 @@ Stmt LowererImplImperative::lowerForallDimension(Forall forall,
     kind = LoopKind::Runtime;
   }
 
+  // std::cout << "2 Stmt LowererImplImperative::lowerForallDimension\n";
   return Block::blanks(For::make(coordinate, bounds[0], bounds[1], 1, body,
                                  kind,
                                  ignoreVectorize ? ParallelUnit::NotParallel : forall.getParallelUnit(), ignoreVectorize ? 0 : forall.getUnrollFactor()),
@@ -1179,6 +1268,7 @@ Stmt LowererImplImperative::lowerForallDimension(Forall forall,
                                                  set<Access> reducedAccesses,
                                                  ir::Stmt recoveryStmt)
   {
+    // std::cout << "1 Stmt LowererImplImperative::lowerForallDenseAcceleration\n";
     taco_iassert(locators.size() == 1) << "Optimizing a dense workspace is only supported when the consumer is the only RHS tensor";
     taco_iassert(provGraph.isFullyDerived(forall.getIndexVar())) << "Sparsely accelerating a dense workspace only works with fully derived index vars";
     taco_iassert(forall.getParallelUnit() == ParallelUnit::NotParallel) << "Sparsely accelerating a dense workspace only works within serial loops";
@@ -1204,6 +1294,8 @@ Stmt LowererImplImperative::lowerForallDimension(Forall forall,
     }
 
     Stmt declareVar = VarDecl::make(coordinate, Load::make(indexList, loopVar));
+    // std::cout << "original forall : " << forall << std::endl;
+    // std::cout << "inside IndexStmt: " << forall.getStmt() << std::endl;
     Stmt body = lowerForallBody(coordinate, forall.getStmt(), locators, inserters, appenders, reducedAccesses);
     Stmt resetGuard = ir::Store::make(bitGuard, coordinate, ir::Literal::make(false), markAssignsAtomicDepth > 0, atomicParallelUnit);
 
@@ -1216,7 +1308,12 @@ Stmt LowererImplImperative::lowerForallDimension(Forall forall,
     Stmt posAppend = generateAppendPositions(appenders);
 
     LoopKind kind = LoopKind::Serial;
-    if (forall.getParallelUnit() == ParallelUnit::CPUVector && !ignoreVectorize) {
+    if (should_use_ISPC_codegen()) {
+      if (forall.getParallelUnit() == ParallelUnit::CPUSimd) {
+        kind = LoopKind::Foreach;
+      }
+    }
+    else if (forall.getParallelUnit() == ParallelUnit::CPUVector && !ignoreVectorize) {
       kind = LoopKind::Vectorized;
     }
     else if (forall.getParallelUnit() != ParallelUnit::NotParallel
@@ -1224,6 +1321,7 @@ Stmt LowererImplImperative::lowerForallDimension(Forall forall,
       kind = LoopKind::Runtime;
     }
 
+    // std::cout << "2 Stmt LowererImplImperative::lowerForallDenseAcceleration\n";
     return Block::blanks(For::make(loopVar, 0, indexListSize, 1, body, kind,
                                          ignoreVectorize ? ParallelUnit::NotParallel : forall.getParallelUnit(),
                                          ignoreVectorize ? 0 : forall.getUnrollFactor()),
@@ -1247,6 +1345,8 @@ Stmt LowererImplImperative::lowerForallPosition(Forall forall, Iterator iterator
                                       set<Access> reducedAccesses,
                                       ir::Stmt recoveryStmt)
 {
+  // std::cout << "1 Stmt LowererImplImperative::lowerForallPosition\n" << std::endl;
+
   Expr coordinate = getCoordinateVar(forall.getIndexVar());
   Stmt declareCoordinate = Stmt();
   Stmt strideGuard = Stmt();
@@ -1278,6 +1378,11 @@ Stmt LowererImplImperative::lowerForallPosition(Forall forall, Iterator iterator
     markAssignsAtomicDepth++;
   }
 
+  // We are inside a forall, e.g. forall(i, forall(j, y(i) += A(i,j) * x(j))).
+  // Calling forall.getStmt() returns forall(j, y(i) += A(i,j) * x(j)), i.e. the
+  // IndexStmt nested inside the forall IndexStmt.
+  // std::cout << "original forall : " << forall << std::endl;
+  // std::cout << "inside IndexStmt: " << forall.getStmt() << std::endl;
   Stmt body = lowerForallBody(coordinate, forall.getStmt(),
                               locators, inserters, appenders, reducedAccesses);
 
@@ -1339,6 +1444,7 @@ Stmt LowererImplImperative::lowerForallPosition(Forall forall, Iterator iterator
     kind = LoopKind::Runtime;
   }
 
+  // std::cout << "2 Stmt LowererImplImperative::lowerForallPosition\n" << std::endl;
   // Loop with preamble and postamble
   return Block::blanks(
                        boundsCompute,
@@ -1357,6 +1463,7 @@ Stmt LowererImplImperative::lowerForallFusedPosition(Forall forall, Iterator ite
                                       set<Access> reducedAccesses,
                                       ir::Stmt recoveryStmt)
 {
+  // std::cout << "1 Stmt LowererImplImperative::lowerForallFusedPosition\n" << std::endl;
   Expr coordinate = getCoordinateVar(forall.getIndexVar());
   Stmt declareCoordinate = Stmt();
   if (provGraph.isCoordVariable(forall.getIndexVar())) {
@@ -1447,6 +1554,8 @@ Stmt LowererImplImperative::lowerForallFusedPosition(Forall forall, Iterator ite
     markAssignsAtomicDepth++;
   }
 
+  // std::cout << "original forall : " << forall << std::endl;
+  // std::cout << "inside IndexStmt: " << forall.getStmt() << std::endl;
   Stmt body = lowerForallBody(coordinate, forall.getStmt(),
                               locators, inserters, appenders, reducedAccesses);
 
@@ -1503,6 +1612,8 @@ Stmt LowererImplImperative::lowerForallFusedPosition(Forall forall, Iterator ite
            && forall.getOutputRaceStrategy() != OutputRaceStrategy::ParallelReduction && !ignoreVectorize) {
     kind = LoopKind::Runtime;
   }
+
+  // std::cout << "2 Stmt LowererImplImperative::lowerForallFusedPosition\n" << std::endl;
   // Loop with preamble and postamble
   return Block::blanks(boundsCompute,
                        Block::make(Block::make(searchForUnderivedStart),
@@ -1765,6 +1876,9 @@ Stmt LowererImplImperative::lowerForallBody(Expr coordinate, IndexStmt stmt,
                                   vector<Iterator> inserters,
                                   vector<Iterator> appenders,
                                   const set<Access>& reducedAccesses) {
+
+  // std::cout << "lowering a forall body----------------------------------------------------\n";
+  
   Stmt initVals = resizeAndInitValues(appenders, reducedAccesses);
 
   // Inserter positions
@@ -1780,6 +1894,7 @@ Stmt LowererImplImperative::lowerForallBody(Expr coordinate, IndexStmt stmt,
 
   // Code of loop body statement
   Stmt body = lower(stmt);
+  // std::cout << "\nBefore: [" << stmt << "]\nAfter : [" << body << "]\n";
 
   // Code to append coordinates
   Stmt appendCoords = appendCoordinate(appenders, coordinate);
@@ -1889,6 +2004,7 @@ vector<Stmt> LowererImplImperative::codeToInitializeDenseAcceleratorArrays(Where
     Expr p = Var::make("p" + temporary.getName(), Int());
     Stmt guardZeroInit = Store::make(alreadySetArr, p, ir::Literal::zero(bitGuardType));
 
+    // std::cout << "vector<Stmt> LowererImplImperative::codeToInitializeDenseAcceleratorArrays\n" << std::endl;
     Stmt zeroInitLoop = For::make(p, 0, bitGuardSize, 1, guardZeroInit, LoopKind::Serial);
     Stmt inits = Block::make(alreadySetDecl, indexListDecl, allocateAlreadySet, allocateIndexList, zeroInitLoop);
     return {inits, freeTemps};
@@ -2144,6 +2260,7 @@ vector<Stmt> LowererImplImperative::codeToInitializeTemporary(Where where) {
 }
 
 Stmt LowererImplImperative::lowerWhere(Where where) {
+  // std::cout << "\n--------------------------------------- lowering where statement: " << where << "\n\n\n";
   TensorVar temporary = where.getTemporary();
   bool accelerateDenseWorkSpace, sortAccelerator;
   std::tie(accelerateDenseWorkSpace, sortAccelerator) =
@@ -2180,6 +2297,7 @@ Stmt LowererImplImperative::lowerWhere(Where where) {
         })
   );
 
+  // std::cout << "\ninitiating lowering of where consumer: " << where.getConsumer() << std::endl;
   Stmt consumer = lower(where.getConsumer());
   if (accelerateDenseWorkSpace && sortAccelerator) {
     // We need to sort the indices array
@@ -2203,11 +2321,13 @@ Stmt LowererImplImperative::lowerWhere(Where where) {
                                 true, false);
     Expr size = getTemporarySize(where);
     Stmt zeroInit = Store::make(values, p, ir::Literal::zero(temporary.getType().getDataType()));
+    // std::cout << "Stmt LowererImplImperative::lowerWhere\n";
     Stmt loopInit = For::make(p, 0, size, 1, zeroInit, LoopKind::Serial);
     initializeTemporary = Block::make(initializeTemporary, loopInit);
   }
 
   whereConsumers.push_back(consumer);
+  // std::cout << "\nwhere temporaries: " << where.getTemporary() << std::endl;
   whereTemps.push_back(where.getTemporary());
   captureNextLocatePos = true;
 
@@ -2218,6 +2338,9 @@ Stmt LowererImplImperative::lowerWhere(Where where) {
     restoreAtomicDepth = true;
   }
 
+  whereTempsWithLoopDepth.insert(std::pair<TensorVar, int>(where.getTemporary(), loopDepth));
+
+  // std::cout << "\ninitiating lowering of where producer: " << where.getProducer() << std::endl;
   Stmt producer = lower(where.getProducer());
   if (accelerateDenseWorkSpace) {
     const Expr indexListSizeExpr = tempToIndexListSize.at(temporary);
@@ -2225,6 +2348,8 @@ Stmt LowererImplImperative::lowerWhere(Where where) {
     initializeTemporary = Block::make(indexListSizeDecl, initializeTemporary);
   }
 
+  whereTempsWithLoopDepth.erase(where.getTemporary());
+
   if (restoreAtomicDepth) {
     markAssignsAtomicDepth++;
   }
@@ -2334,6 +2459,7 @@ Stmt LowererImplImperative::lowerAssemble(Assemble assemble) {
                   resultModeOrdering[iter.getMode().getLevel() - 1]);
               Expr pos = iter.getPosVar();
               Stmt initPos = VarDecl::make(pos, iter.locate(locateCoords)[0]);
+              // std::cout << "Stmt LowererImplImperative::lowerAssemble\n";
               insertEdgeLoop = For::make(coords.back(), 0, dim, 1,
                                          Block::make(initPos, insertEdgeLoop));
             } else {
@@ -2371,7 +2497,7 @@ Stmt LowererImplImperative::lowerAssemble(Assemble assemble) {
         initAssembleStmts.push_back(initValues);
       }
     } else if (zeroInit) {
-      initAssembleStmts.push_back(zeroInitValues(resultTensorVar, 0, prevSize));
+      initAssembleStmts.push_back(zeroInitValues(resultTensorVar, 0, prevSize)); // init values
     }
   }
   Stmt initAssemble = Block::make(initAssembleStmts);
@@ -2415,6 +2541,7 @@ Stmt LowererImplImperative::lowerMulti(Multi multi) {
 }
 
 Stmt LowererImplImperative::lowerSuchThat(SuchThat suchThat) {
+  // Lower the statement wrapped by the SuchThat node and return it as a Block.
   Stmt stmt = lower(suchThat.getStmt());
   return Block::make(stmt);
 }
@@ -2744,7 +2871,7 @@ Stmt LowererImplImperative::initResultArrays(vector<Access> writes,
       // iteration of all the iterators is not full. We can check this by seeing if we can recover a
       // full iterator from our set of iterators.
       Expr size = generateAssembleCode() ? getCapacityVar(tensor) : parentSize;
-      result.push_back(zeroInitValues(tensor, 0, size));
+      result.push_back(zeroInitValues(tensor, 0, size)); // init values
     }
   }
   return result.empty() ? Stmt() : Block::blanks(result);
@@ -2895,7 +3022,7 @@ Stmt LowererImplImperative::initResultArrays(IndexVar var, vector<Access> writes
             util::contains(reducedAccesses, write)) {
           // Zero-initialize values array if might not assign to every element
           // in values array during compute
-          result.push_back(zeroInitValues(tensor, resultParentPos, stride));
+          result.push_back(zeroInitValues(tensor, resultParentPos, stride)); // init values
         }
       }
     }
@@ -2942,6 +3069,7 @@ Stmt LowererImplImperative::resizeAndInitValues(const std::vector<Iterator>& app
 
 
 Stmt LowererImplImperative::zeroInitValues(Expr tensor, Expr begin, Expr size) {
+  // std::cout << "1 Stmt LowererImplImperative::zeroInitValues\n";
   Expr lower = simplify(ir::Mul::make(begin, size));
   Expr upper = simplify(ir::Mul::make(ir::Add::make(begin, 1), size));
   Expr p = Var::make("p" + util::toString(tensor), Int());
@@ -2954,6 +3082,11 @@ Stmt LowererImplImperative::zeroInitValues(Expr tensor, Expr begin, Expr size) {
     return ir::VarDecl::make(ir::Var::make("status", Int()),
                                     ir::Call::make("cudaMemset", {values, ir::Literal::make(0, Int()), ir::Mul::make(ir::Sub::make(upper, lower), ir::Literal::make(values.type().getNumBytes()))}, Int()));
   }
+  // std::cout << "2 Stmt LowererImplImperative::zeroInitValues\n";
+  // if generating ispc code, we will keep the LoopKind as Init so that we can initialize it if tasks are used
+  if (should_use_ISPC_codegen()) {
+    return For::make(p, lower, upper, 1, zeroInit, LoopKind::Init);
+  }
   return For::make(p, lower, upper, 1, zeroInit, parallel);
 }
 
diff --git a/src/lower/tensor_path.h b/src/lower/tensor_path.h
index 4f5dc49af..da52fb782 100644
--- a/src/lower/tensor_path.h
+++ b/src/lower/tensor_path.h
@@ -2,6 +2,7 @@
 #define TACO_TENSOR_PATH_H
 
 #include <memory>
+#include <ostream>
 #include <vector>
 
 #include "taco/util/comparable.h"
@@ -47,14 +48,13 @@ class TensorPath : public util::Comparable<TensorPath> {
 
   friend bool operator==(const TensorPath&, const TensorPath&);
   friend bool operator<(const TensorPath&, const TensorPath&);
+  friend std::ostream& operator<<(std::ostream&, const TensorPath&);
 
 private:
   struct Content;
   std::shared_ptr<Content> content;
 };
 
-std::ostream& operator<<(std::ostream&, const TensorPath&);
-
 
 /// A step along a tensor path.
 class TensorPathStep : public util::Comparable<TensorPathStep> {
diff --git a/src/tensor.cpp b/src/tensor.cpp
index fab437ff1..1c95851c5 100644
--- a/src/tensor.cpp
+++ b/src/tensor.cpp
@@ -10,6 +10,7 @@
 #include <utility>
 #include <mutex>
 
+#include "../test/util.h"
 #include "taco/cuda.h"
 #include "taco/format.h"
 #include "taco/taco_tensor_t.h"
@@ -278,6 +279,7 @@ static size_t unpackTensorData(const taco_tensor_t& tensorData,
 
 /// Pack coordinates into a data structure given by the tensor format.
 void TensorBase::pack() {
+  std::cout << "TensorBase::Pack() method\n";
   if (!needsPack()) {
     return;
   }
@@ -346,6 +348,7 @@ void TensorBase::pack() {
   taco_iassert((content->coordinateBufferUsed % content->coordinateSize) == 0);
   const size_t numCoordinates = content->coordinateBufferUsed / content->coordinateSize;
 
+  std::cout << "call helperFuncs\n";
   const auto helperFuncs = getHelperFunctions(getFormat(), getComponentType(),
                                               dimensions);
 
@@ -619,10 +622,12 @@ void TensorBase::compile() {
   IndexStmt stmt = makeConcreteNotation(makeReductionNotation(assignment));
   stmt = reorderLoopsTopologically(stmt);
   stmt = insertTemporaries(stmt);
+  std::cout << "calling parallelizeOuterLoop(stmt)\n";
   stmt = parallelizeOuterLoop(stmt);
   compile(stmt, content->assembleWhileCompute);
 }
 void TensorBase::compile(taco::IndexStmt stmt, bool assembleWhileCompute) {
+  std::cout << "TensorBase::compile\n";
   if (!needsCompile()) {
     return;
   }
@@ -804,9 +809,9 @@ void TensorBase::assemble() {
 
 void TensorBase::compute() {
   taco_uassert(!needsCompile()) << error::compute_without_compile;
-  if (!needsCompute()) {
-    return;
-  }
+  // if (!needsCompute()) {
+  //   return;
+  // }
   setNeedsCompute(false);
   // Sync operand tensors if needed.
   auto operands = getTensors(getAssignment().getRhs());
@@ -816,7 +821,12 @@ void TensorBase::compute() {
   }
 
   auto arguments = packArguments(*this);
-  this->content->module->callFuncPacked("compute", arguments.data());
+
+  taco::util::TimeResults timevalue;
+  bool time                = true;
+  TOOL_BENCHMARK_TIMER(this->content->module->callFuncPacked("compute", arguments.data()), 
+      "\n\nkernel execution time: ", timevalue);
+  // this->content->module->callFuncPacked("compute", arguments.data());
 
   if (content->assembleWhileCompute) {
     setNeedsAssemble(false);
@@ -934,6 +944,7 @@ TensorBase::getHelperFunctions(const Format& format, Datatype ctype,
   };
   const auto dims = util::map(dimensions, getDim);
 
+  set_ISPC_code_stream_enabled(false);
   if (format.getOrder() > 0) {
     const Format bufferFormat = COO(format.getOrder(), false, true, false,
                                     format.getModeOrdering());
@@ -951,6 +962,7 @@ TensorBase::getHelperFunctions(const Format& format, Datatype ctype,
     }
 
     // Lower packing and iterator code.
+    std::cout << "1 Lower packing and iterator code\n";
     helperModule->addFunction(lower(packStmt, "pack", true, true));
     helperModule->addFunction(lower(iterateStmt, "iterate", false, true));
   } else {
@@ -964,12 +976,14 @@ TensorBase::getHelperFunctions(const Format& format, Datatype ctype,
     IndexVar indexVar;
     IndexStmt assignment = (packedScalar() = bufferVector(indexVar));
     IndexStmt packStmt= makeConcreteNotation(makeReductionNotation(assignment));
+    std::cout << "2 Lower packing and iterator code\n";
     helperModule->addFunction(lower(packStmt, "pack", true, true));
 
     // Define and lower iterator code.
     IndexStmt iterateStmt = Yield({}, packedScalar());
     helperModule->addFunction(lower(iterateStmt, "iterate", false, true));
   }
+  std::cout << "Compiling the helperModule\n";
   helperModule->compile();
 
   helperFunctionsMutex.lock();
diff --git a/taco-uml.wsd b/taco-uml.wsd
new file mode 100644
index 000000000..4b8e39802
--- /dev/null
+++ b/taco-uml.wsd
@@ -0,0 +1,411 @@
+@startuml taco
+scale 1
+
+
+class IntrusivePtr {
+    +T *ptr
+}
+class Uncopyable {}
+
+class IRNode {
+    +virtual void accept(IRVisitorStrict *v) const = 0
+    +virtual IRNodeType type_info() const = 0;
+}
+
+class BaseStmtNode {}
+class BaseExprNode {
+    +Datatype type
+}
+
+class StmtNode {
+    +void accept(IRVisitorStrict *v) const
+}
+class ExprNode {
+    +void accept(IRVisitorStrict *v) const
+}
+
+Uncopyable <|-- IRNode
+IRNode <|-- BaseStmtNode
+IRNode <|-- BaseExprNode
+BaseStmtNode <|-- StmtNode
+BaseExprNode <|-- ExprNode
+
+class IRHandle {
+    +void accept(IRVisitorStrict *v) const
+}
+class Expr {}
+class Stmt {}
+
+IntrusivePtr <|-- IRHandle
+IRHandle <|-- Expr
+IRHandle <|-- Stmt
+
+IRHandle "1" *-- "1" IRNode : contains
+
+
+
+' this class is abstract but plantuml version does not support interface keyword
+interface IRVisitorStrict {
+    +virtual void visit(const IRNode*) const = 0
+}
+
+/' 
+IRVisitor is not an interface or abstract class because it
+has no pure virtual methods
+'/
+class IRVisitor {
+    +virtual void visit(const IRNode*)
+}
+
+class IRRewriter {
+    ' protected fields and methods
+    #Expr expr 
+    #Stmt stmt
+
+    #virtual void visit(const ExprNode* op)
+    #virtual void visit(const StmtNode* op)
+
+    ' public fields and methods
+    +Expr rewrite(Expr)
+    +Stmt rewrite(Stmt)
+}
+class IRPrinter {
+    #std::ostream &stream
+    #std::ostream &stream2
+    #int indent
+    #bool color
+    #bool simplify
+    #enum Precedence
+    #Precedence parentPrecedence = BOTTOM
+    #NameGenerator varNameGenerator
+    #scopedMap<Expr, std::String> varNames
+
+    #void doIndent()
+    #void printBinOp(Expr a, Expr b, std::string op, Precedence precedence)
+    #void fewMoreMethods()
+    
+    #virtual void visit(const ExprNode*)
+    #virtual void visit(const StmtNode*)
+
+    +setColor(bool color)
+    +print(Stmt)
+}
+class IRVerifier {}
+
+IRVisitorStrict <|-- IRVisitor
+IRVisitorStrict <|-- IRPrinter
+IRVisitorStrict <|-- IRRewriter
+IRVisitor <|-- IRVerifier
+
+' Inheritance from IRRewriter
+' simplifier for ir::Expr
+class ExpressionSimplifier {}
+IRRewriter <|-- ExpressionSimplifier
+
+' simplifiers for ir::Stmt
+class RemoveRedundantStatements {}
+class RemoveRedundantLoops {}
+class RemoveDuplicateBody {}
+
+IRRewriter <|-- RemoveRedundantStatements
+IRRewriter <|-- RemoveRedundantLoops
+IRRewriter <|-- RemoveDuplicateBody
+
+
+' Inheritance from IRPrinter
+class CodeGen {}
+class CodeGen_C {}
+class CodeGen_CUDA {}
+class CodeGen_ISPC {
+    -class FindVars
+}
+
+class FindVars {}
+
+IRPrinter <|-- CodeGen
+CodeGen <|-- CodeGen_C
+CodeGen <|-- CodeGen_ISPC
+CodeGen <|-- CodeGen_CUDA
+
+IRVisitor <|-- FindVars
+CodeGen_ISPC +-- FindVars
+
+class Manageable {}
+class IndexStmtNode {
+    -virtual void accept(IndexStmtVisitorStrict*) const = 0
+}
+class IndexExprNode {
+    -virtual void accept(IndexStmtVisitorStrict*) const = 0
+}
+
+
+Manageable <|-- IndexStmtNode
+Uncopyable <|-- IndexStmtNode
+Manageable <|-- IndexExprNode
+Uncopyable <|-- IndexExprNode
+
+class IndexStmt {}
+class IndexExpr {}
+
+IntrusivePtr <|-- IndexStmt
+IndexStmt "1" *-- "1" IndexStmtNode
+IntrusivePtr <|-- IndexExpr
+IndexExpr "1" *-- "1" IndexExprNode
+
+
+abstract class IndexExprVisitorStrict {
+    +void visit(const IndexStmt&)
+    +virtual void visit(const AccessNode*) = 0
+    +virtual void visit(const LiteralNode*) = 0
+    +virtual void visit(const NegNode*) = 0
+    +virtual void visit(const AddNode*) = 0
+    +virtual void visit(const SubNode*) = 0
+    +virtual void visit(const MulNode*) = 0
+    +virtual void visit(const DivNode*) = 0
+    +virtual void visit(const SqrtNode*) = 0
+    +virtual void visit(const CastNode*) = 0
+    +virtual void visit(const CallIntrinsicNode*) = 0
+    +virtual void visit(const ReductionNode*) = 0
+}
+abstract class IndexStmtVisitorStrict {
+    +void visit(const IndexStmt&)
+    +virtual void visit(const AssignmentNode*) = 0
+    +virtual void visit(const YieldNode*) = 0
+    +virtual void visit(const ForallNode*) = 0
+    +virtual void visit(const WhereNode*) = 0
+    +virtual void visit(const SequenceNode*) = 0
+    +virtual void visit(const AssembleNode*) = 0
+    +virtual void visit(const MultiNode*) = 0
+    +virtual void visit(const SuchThatNode*) = 0
+}
+
+abstract class IndexNotationVisitorStrict {}
+class IndexNotationPrinter {
+    +void print(const IndexExpr& expr)
+    +void print(const IndexStmt& expr)
+
+    ' Index Expressions visit()
+    +void visit(const AccessNode* node)
+    +void visit(const LiteralNode* node)
+    + void visit(const NegNode* node)
+    + void visit(const AddNode* node)
+    + void visit(const SubNode* node)
+    + void visit(const MulNode* node)
+    + void visit(const DivNode* node)
+    + void visit(const SqrtNode* node)
+    + void visit(const CastNode* node)
+    + void visit(const CallIntrinsicNode* node)
+    + void visit(const UnaryExprNode* node)
+    + void visit(const BinaryExprNode* node)
+    + void visit(const ReductionNode* node)
+
+    ' Index Statement visit()
+    + void visit(const AssignmentNode* node)
+    + void visit(const YieldNode* node)
+    + void visit(const ForallNode* node)
+    + void visit(const WhereNode* node)
+    + void visit(const SequenceNode* node)
+    + void visit(const AssembleNode* node)
+    + void visit(const MultiNode* node)
+    + void visit(const SuchThatNode* node)
+}
+class IndexNotationVisitor {
+    ' Index Expressions visit()
+    +virtual void visit(const AccessNode* node)
+    +virtual void visit(const LiteralNode* node)
+    +virtual void visit(const NegNode* node)
+    +virtual void visit(const AddNode* node)
+    +virtual void visit(const SubNode* node)
+    +virtual void visit(const MulNode* node)
+    +virtual void visit(const DivNode* node)
+    +virtual void visit(const SqrtNode* node)
+    +virtual void visit(const CastNode* node)
+    +virtual void visit(const CallIntrinsicNode* node)
+    +virtual void visit(const UnaryExprNode* node)
+    +virtual void visit(const BinaryExprNode* node)
+    +virtual void visit(const ReductionNode* node)
+
+    ' Index Statement visit()
+    +virtual void visit(const AssignmentNode* node)
+    +virtual void visit(const YieldNode* node)
+    +virtual void visit(const ForallNode* node)
+    +virtual void visit(const WhereNode* node)
+    +virtual void visit(const SequenceNode* node)
+    +virtual void visit(const AssembleNode* node)
+    +virtual void visit(const MultiNode* node)
+    +virtual void visit(const SuchThatNode* node)
+}
+class Matcher {
+
+}
+
+abstract class IndexExprRewriterStrict {
+    +IndexExpr rewrite(IndexExpr)
+
+    #IndexExpr expr
+
+    #virtual void visit(const AccessNode* op) = 0
+    #virtual void visit(const LiteralNode* op) = 0
+    #virtual void visit(const NegNode* op) = 0
+    #virtual void visit(const SqrtNode* op) = 0
+    #virtual void visit(const AddNode* op) = 0
+    #virtual void visit(const SubNode* op) = 0
+    #virtual void visit(const MulNode* op) = 0
+    #virtual void visit(const DivNode* op) = 0
+    #virtual void visit(const CastNode* op) = 0
+    #virtual void visit(const CallIntrinsicNode* op) = 0
+    #virtual void visit(const ReductionNode* op) = 0
+}
+abstract class IndexStmtRewriterStrict {
+    +IndexStmt rewrite(IndexStmt)
+
+    #IndexStmt stmt
+
+    #virtual void visit(const AssignmentNode* op) = 0
+    #virtual void visit(const YieldNode* op) = 0
+    #virtual void visit(const ForallNode* op) = 0
+    #virtual void visit(const WhereNode* op) = 0
+    #virtual void visit(const SequenceNode* op) = 0
+    #virtual void visit(const AssembleNode* op) = 0
+    #virtual void visit(const MultiNode* op) = 0
+    #virtual void visit(const SuchThatNode* op) = 0
+}
+abstract class IndexNotationRewriterStrict {}
+class IndexNotationRewriter {
+    ' Index Expressions visit()
+    +virtual void visit(const AccessNode* node)
+    +virtual void visit(const LiteralNode* node)
+    +virtual void visit(const NegNode* node)
+    +virtual void visit(const AddNode* node)
+    +virtual void visit(const SubNode* node)
+    +virtual void visit(const MulNode* node)
+    +virtual void visit(const DivNode* node)
+    +virtual void visit(const SqrtNode* node)
+    +virtual void visit(const CastNode* node)
+    +virtual void visit(const CallIntrinsicNode* node)
+    +virtual void visit(const UnaryExprNode* node)
+    +virtual void visit(const BinaryExprNode* node)
+    +virtual void visit(const ReductionNode* node)
+
+    ' Index Statement visit()
+    +virtual void visit(const AssignmentNode* node)
+    +virtual void visit(const YieldNode* node)
+    +virtual void visit(const ForallNode* node)
+    +virtual void visit(const WhereNode* node)
+    +virtual void visit(const SequenceNode* node)
+    +virtual void visit(const AssembleNode* node)
+    +virtual void visit(const MultiNode* node)
+    +virtual void visit(const SuchThatNode* node)
+}
+
+
+IndexExprVisitorStrict <|-- IndexNotationVisitorStrict
+IndexStmtVisitorStrict <|-- IndexNotationVisitorStrict
+IndexNotationVisitorStrict <|-- IndexNotationVisitor
+IndexNotationVisitorStrict <|-- IndexNotationPrinter
+IndexNotationVisitor <|-- Matcher
+
+IndexExprVisitorStrict <|-- IndexExprRewriterStrict
+IndexStmtVisitorStrict <|-- IndexStmtRewriterStrict
+IndexExprRewriterStrict <|-- IndexNotationRewriterStrict
+IndexStmtRewriterStrict <|-- IndexNotationRewriterStrict
+
+IndexNotationRewriterStrict <|-- IndexNotationRewriter
+
+' - private
+' # protected
+' ~ package private
+' + public
+
+' {static}
+' {abstract} virtual methods
+
+' lowering part -- conversion from IndexExpr and IndexStmt to ir::Expr and ir::Stmt
+class Lowerer {
+    +std::shared_ptr<LowererImpl> impl;
+}
+abstract class LowererImpl {
+    ' protected fields and methods
+    #class Visitor;
+    #friend class Visitor;
+    #std::shared_ptr<Visitor> visitor;
+
+    #virtual ir::Stmt lower(IndexStmt stmt);
+    #virtual ir::Expr lower(IndexExpr expr);
+
+    #virtual ir::Expr lowerExpr(IndexExpr expr) = 0;
+    #virtual ir::Stmt lowerStmt(IndexStmt stmt) = 0;
+
+    ' public fields and methods
+    +virtual ir::Stmt lower(IndexStmt stmt, std::string name, 
+                 bool assemble, bool compute, bool pack, bool unpack) = 0;
+}
+
+class LowererImplImperative {
+    ' private fields and methods
+    -class Visitor
+    -friend class Visitor
+    -std::shared_ptr<Visitor> visitor
+    -bool assemble
+    -bool compute
+    -vars a_bunch_of_other_fields
+
+    ' protected fields and methods
+    #virtual ir::Stmt lowerExpr(IndexExpr expr);
+    #virtual ir::Stmt lowerStmt(IndexStmt stmt);
+
+    ' public fields and methods
+    +ir::Stmt lower(IndexStmt stmt, std::string name, 
+                 bool assemble, bool compute, bool pack, bool unpack)
+
+}
+note bottom of LowererImplImperative : Stmt LowererImplImperative::lower(IndexStmt stmt) {\n  return visitor->lower(stmt);\n}
+
+Uncopyable <|-- LowererImpl
+Lowerer "1" *-- "1" LowererImpl : contains
+
+
+' visitor that does the lowering
+class Visitor {
+    ' private fields and methods
+    -LowererImpl* impl
+    -Expr expr
+    -Stmt stmt
+
+    -void visit(const AssignmentNode* node)
+    -void visit(const YieldNode* node)
+    -void visit(const ForallNode* node) 
+    -void visit(const WhereNode* node) 
+    -void visit(const MultiNode* node) 
+    -void visit(const SuchThatNode* node) 
+    -void visit(const SequenceNode* node) 
+    -void visit(const AssembleNode* node) 
+    -void visit(const AccessNode* node) 
+    -void visit(const LiteralNode* node) 
+    -void visit(const NegNode* node) 
+    -void visit(const AddNode* node) 
+    -void visit(const SubNode* node) 
+    -void visit(const MulNode* node) 
+    -void visit(const DivNode* node) 
+    -void visit(const SqrtNode* node) 
+    -void visit(const CastNode* node) 
+    -void visit(const CallIntrinsicNode* node) 
+    -void visit(const ReductionNode* node) 
+
+    ' public fields and methods
+    +Visitor(LowererImplImperative* impl)
+    +Stmt lower(IndexStmt stmt)
+    +Expr lower(IndexExpr expr)
+}
+
+note bottom of Visitor:   Stmt lower(IndexStmt stmt) {\n  this->stmt = Stmt();\n  impl->accessibleIterators.scope();\n  IndexStmtVisitorStrict::visit(stmt);\n  impl->accessibleIterators.unscope();\n  return this->stmt;\n}
+
+IndexNotationVisitorStrict <|-- Visitor
+LowererImpl "1" +-- "1" Visitor : contains
+Visitor "1" *-- "1" LowererImpl : contains
+
+LowererImpl <|-- LowererImplImperative
+LowererImplImperative "1" +-- "1" Visitor : contains
+Visitor "1" *-- "1" LowererImplImperative : contains
+
+@enduml
\ No newline at end of file
diff --git a/test/test.cpp b/test/test.cpp
index a49f10ff7..851493b7f 100644
--- a/test/test.cpp
+++ b/test/test.cpp
@@ -38,6 +38,20 @@ void ASSERT_TENSOR_EQ(TensorBase expected, TensorBase actual) {
   ASSERT_TRUE(equals(expected, actual));
 }
 
+// void ASSERT_TENSOR_VAL(TensorBase expected, TensorBase actual) {
+//   std::cout << "order: " << expected.getOrder();
+//   std::vector<int> modes{};
+//   for (int mode = 0; mode < expected.getOrder(); mode++) {
+//     if (expected.getDimension(mode) != actual.getDimension(mode)) {
+//       ASSERT_TRUE(false);
+//     }
+
+//     for (int i=0; i<expected.getDimension(mode); i++) {
+//       std::cout << expected(i) << " " << actual(i) << std::endl;
+//     }
+//   }
+// }
+
 std::string testDirectory() {
   return TO_STRING(TACO_TEST_DIR);
 }
diff --git a/test/test.h b/test/test.h
index 3302bf81f..1c8f5172e 100644
--- a/test/test.h
+++ b/test/test.h
@@ -61,6 +61,7 @@ void ASSERT_VECTOR_EQ(std::vector<T> expected,
 
 void ASSERT_STORAGE_EQ(TensorStorage expected, TensorStorage actual);
 void ASSERT_TENSOR_EQ(TensorBase expected, TensorBase actual);
+// void ASSERT_TENSOR_VAL(TensorBase expected, TensorBase actual);
 
 template <typename T>
 void ASSERT_COMPONENTS_EQUALS(vector<vector<vector<int>>> expectedIndices,
diff --git a/test/tests-indexstmt.cpp b/test/tests-indexstmt.cpp
index e2a972430..123bea3e6 100644
--- a/test/tests-indexstmt.cpp
+++ b/test/tests-indexstmt.cpp
@@ -1,10 +1,13 @@
+#include "taco/index_notation/kernel.h"
+#include "taco/type.h"
 #include "test.h"
 #include "test_tensors.h"
 #include "taco/tensor.h"
 #include "taco/index_notation/index_notation.h"
+#include "taco/index_notation/transformations.h"
 
 using namespace taco;
-const IndexVar i("i"), j("j"), k("k");
+const IndexVar i("i"), j("j"), k("k"), l("l"), m("m");
 
 TEST(indexstmt, assignment) {
   Type t(type<double>(), {3});
@@ -84,4 +87,193 @@ TEST(indexstmt, spmm) {
 }
 
 
+TEST(indexstmt, sddmm) {
+  // NOTE(review): despite the test name, the statement built here has the shape
+  // A(i,j) = sum_k B(i,k) * C(k,j), staged through the workspace w -- confirm
+  // that this is the intended kernel.
+  const Type matrixType(type<double>(), {3,3});
+  const Type vectorType(type<double>(), {3});
+  TensorVar A("A", matrixType, {Sparse, Dense});
+  TensorVar B("B", matrixType, {Sparse, Dense});
+  TensorVar C("C", matrixType, {Dense, Dense});
+  TensorVar w("w", vectorType, Dense);
+
+  // Concrete index notation. where() takes (consumer, producer): the producer
+  // accumulates into w and the consumer copies w into A.
+  IndexStmt consumer = forall(j, A(i,j) = w(j));
+  IndexStmt producer = forall(j, w(j) += B(i,k)*C(k,j));
+  IndexStmt loops = forall(i, forall(k, where(consumer, producer)));
+
+  // Print the statement before and after topological loop reordering.
+  std::cout << loops << std::endl;
+  loops = reorderLoopsTopologically(loops);
+  std::cout << "topologically reordered loops statement: " << loops << std::endl;
+
+  // Compile the reordered statement and invoke the compute kernel. No result
+  // values are checked.
+  Kernel kernel = compile(loops);
+  kernel.compute();
+}
+
+TEST(indexstmt, sddmmPlusSpmm) {
+  // Compiles Y(i,l) += B(i,j) * C(i,k) * D(j,k) * E(j,l) as one loop nest and as a where() form.
+  // Y(i,l) = B(i,j)*C(i,k)*D(k,j) * F(j,l);
+  // indexstmt order i, j, k, l
+  //topologically reordered loops statement: forall(i, forall(k, forall(j, forall(l, Y(i,l) += B(i,j) * C(i,k) * D(k,j) * F(j,l), NotParallel, IgnoreRaces), NotParallel, IgnoreRaces), NotParallel, IgnoreRaces), NotParallel, IgnoreRaces)
+  // NOTE(review): the three comment lines above use D(k,j) and F, unlike the code below (D(j,k), E).
+  Type t(type<double>(), {3,3});
+  TensorVar Y("Y", t, {Dense, Dense});
+  TensorVar B("B", t, {Dense, Sparse});
+  TensorVar C("C", t, {Dense, Dense});
+  TensorVar D("D", t, {Dense, Dense});
+  TensorVar E("E", t, {Dense, Dense});
+  // A is declared with a default-constructed Type; it is the temporary of the where() in fused2.
+  // TensorVar A("A", Type(type<double>(),{3}), );
+  TensorVar A("A", Type());
+
+  IndexStmt fused1 = 
+  forall(i,
+    forall(j,
+      forall(k,
+        forall(l, Y(i,l) += B(i,j) * C(i,k) * D(j,k) * E(j,l))
+      )
+    )
+  );
+
+  std::cout << "before topological sort" << fused1 << std::endl;
+  fused1 = reorderLoopsTopologically(fused1);
+  std::cout << "after topological sort" << fused1 << std::endl;
+
+  Kernel kernel = compile(fused1);
+
+  // Second form: the k-reduction is accumulated into A, then A is multiplied by E(j,l).
+  IndexStmt fused2 =
+  forall(i,
+    forall(j,
+      where(
+        forall(l, Y(i,l) += A * E(j,l)), // consumer
+        forall(k, A += B(i,j)*C(i,k)*D(j,k)) // producer
+      )
+    )
+  );
+
+  Kernel kernel2 = compile(fused2);
+  // Only compilation is exercised; neither kernel is run and no values are asserted.
+} 
+
+
+
+TEST(indexstmt, mttkrpPlusSpmm) {
+  // Compiles A(i,m) += B(i,k,l)*C(k,j)*D(l,j)*E(j,m) as one loop nest and as a where() form.
+  // ./bin/taco "A(i,m)=B(i,k,l)*C(k,j)*D(l,j)*E(j,m)" -f=A:dd:0,1 -f=B:sss:0,1,2 -f=C:dd:0,1 -f=D:dd:0,1 -f=E:dd:0,1
+
+  // Dimension sizes: i = 11, k = 5, l = 7, j = 8, m = 6.
+  long unsigned int idim = 11, kdim = 5, ldim = 7, jdim = 8, mdim = 6;
+
+  Type atype(type<double>(), {idim, mdim});
+  Type btype(type<double>(), {idim, kdim, ldim});
+  Type ctype(type<double>(), {kdim, jdim});
+  Type dtype(type<double>(), {ldim, jdim});
+  Type etype(type<double>(), {jdim, mdim});
+
+  TensorVar A("A", atype, {Dense, Dense});
+  TensorVar B("B", btype, {Sparse, Sparse, Sparse});
+  TensorVar C("C", ctype, {Dense, Dense});
+  TensorVar D("D", dtype, {Dense, Dense});
+  TensorVar E("E", etype, {Dense, Dense});
+  // ws is a vector of length jdim, declared without an explicit format; used only by fused2.
+  TensorVar ws("ws", Type(type<double>(), {jdim}) );
+  // fused1: a single loop nest over i, k, l, j, m.
+  IndexStmt fused1 = 
+  forall(i,
+    forall(k,
+      forall(l,
+        forall(j,
+          forall(m, A(i,m) += B(i,k,l) * C(k,j) * D(l,j) * E(j,m))
+        )
+      )
+    )
+  );
+
+  std::cout << "before topological sort" << fused1 << std::endl;
+  fused1 = reorderLoopsTopologically(fused1);
+  std::cout << "after topological sort" << fused1 << std::endl;
+
+  Kernel kernel = compile(fused1);
+  // fused2: where(consumer, producer) under forall(i); the producer accumulates ws(j), the consumer uses it.
+  IndexStmt fused2 =
+  forall(i,
+    where(
+      forall(j,
+        forall(m, 
+          A(i,m) += ws(j) * E(j,m)
+        )
+      )
+      ,
+      forall(k,
+        forall(l,
+          forall(j, 
+            ws(j) += B(i,k,l) * C(k,j) * D(l,j)
+          )
+        )
+      )
+    )
+  );
+
+  Kernel kernel2 = compile(fused2);
+  // Only compilation is exercised; neither kernel is run and no values are asserted.
+}
+
+// ./bin/taco "y(i)=A(i,j)*B(j,k)*v(k)" -f=y:d:0 -f=A:dd:0,1 -f=B:dd:0,1 -f=v:d:0
+TEST(indexstmt, mmPlusSpmv) {
+  // Compiles y(i) = A(i,j) * B(j,k) * v(k) as one loop nest (fused1) and with the
+  // B*v product staged through workspace ws (fused2). No values are checked.
+  long unsigned int idim = 11, jdim = 8, kdim = 5;
+
+  Type ytype(type<double>(), {idim});
+  Type atype(type<double>(), {idim, jdim});
+  Type btype(type<double>(), {jdim, kdim});
+  Type vtype(type<double>(), {kdim});
+
+  TensorVar y("y", ytype, {Dense});
+  TensorVar A("A", atype, {Dense, Dense});
+  TensorVar B("B", btype, {Dense, Dense});
+  TensorVar v("v", vtype, {Dense});
+
+  TensorVar ws("ws", Type(type<double>(), {jdim}) );
+
+  IndexStmt fused1 =
+  forall(i,
+    forall(j,
+      forall(k,
+        y(i) += A(i,j) * B(j,k) * v(k)
+      )
+    )
+  );
+
+  std::cout << "before topological sort" << fused1 << std::endl;
+  fused1 = reorderLoopsTopologically(fused1);
+  std::cout << "after topological sort" << fused1 << std::endl;
+
+  Kernel kernel = compile(fused1);
+
+  // where(consumer, producer): the producer accumulates ws(j), the consumer folds it into y(i).
+  IndexStmt fused2 =
+  where(
+    forall(i,
+      forall(j,
+        y(i) += A(i,j) * ws(j)
+      )
+    )
+    ,
+    forall(j,
+      forall(k,
+        ws(j) += B(j,k) * v(k)
+      )
+    )
+  );
+
+  Kernel kernel2 = compile(fused2);
+}
+
 
diff --git a/test/tests-scheduling-eval.cpp b/test/tests-scheduling-eval.cpp
index 52bd74ab4..29a7e512e 100644
--- a/test/tests-scheduling-eval.cpp
+++ b/test/tests-scheduling-eval.cpp
@@ -1,42 +1,8 @@
-#include <taco/index_notation/transformations.h>
-#include <codegen/codegen_c.h>
-#include <codegen/codegen_cuda.h>
-#include <fstream>
-#include "test.h"
-#include "test_tensors.h"
-#include "taco/tensor.h"
-#include "taco/index_notation/index_notation.h"
-#include "taco/index_notation/transformations.h"
-#include "codegen/codegen.h"
-#include "taco/lower/lower.h"
-
-using namespace taco;
+#include "util.h"
+
 const IndexVar i("i"), j("j"), k("k"), l("l"), m("m"), n("n");
 int WARP_SIZE = 32;
 
-void printToCout(IndexStmt stmt) {
-  std::shared_ptr<ir::CodeGen> codegen = ir::CodeGen::init_default(cout, ir::CodeGen::ImplementationGen);
-  ir::Stmt compute = lower(stmt, "compute", false, true);
-  codegen->compile(compute, true);
-}
-
-void printToFile(string filename, IndexStmt stmt) {
-  stringstream source;
-
-  string file_path = "eval_generated/";
-  mkdir(file_path.c_str(), 0777);
-
-  std::shared_ptr<ir::CodeGen> codegen = ir::CodeGen::init_default(source, ir::CodeGen::ImplementationGen);
-  ir::Stmt compute = lower(stmt, "compute",  false, true);
-  codegen->compile(compute, true);
-
-  ofstream source_file;
-  string file_ending = should_use_CUDA_codegen() ? ".cu" : ".c";
-  source_file.open(file_path + filename + file_ending);
-  source_file << source.str();
-  source_file.close();
-}
-
 IndexStmt scheduleSpMVCPU(IndexStmt stmt, int CHUNK_SIZE=16) {
   IndexVar i0("i0"), i1("i1"), kpos("kpos"), kpos0("kpos0"), kpos1("kpos1");
   return stmt.split(i, i0, i1, CHUNK_SIZE)
@@ -44,6 +10,14 @@ IndexStmt scheduleSpMVCPU(IndexStmt stmt, int CHUNK_SIZE=16) {
           .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces);
 }
 
+// ISPC SpMV schedule: split `i` into i0/i1 by CHUNK_SIZE and parallelize i0 as CPUSimd (NoRaces).
+IndexStmt scheduleSpMVISPC(IndexStmt stmt, int CHUNK_SIZE=16) {
+  IndexVar i0("i0"), i1("i1");
+  return stmt.split(i, i0, i1, CHUNK_SIZE)
+          .reorder({i0, i1, j})
+          .parallelize(i0, ParallelUnit::CPUSimd, OutputRaceStrategy::NoRaces);
+}
+
 IndexStmt scheduleSpMMCPU(IndexStmt stmt, Tensor<double> A, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) {
   IndexVar i0("i0"), i1("i1"), kbounded("kbounded"), k0("k0"), k1("k1"), jpos("jpos"), jpos0("jpos0"), jpos1("jpos1");
   return stmt.split(i, i0, i1, CHUNK_SIZE)
@@ -54,6 +28,80 @@ IndexStmt scheduleSpMMCPU(IndexStmt stmt, Tensor<double> A, int CHUNK_SIZE=16, i
           .parallelize(k, ParallelUnit::CPUVector, OutputRaceStrategy::IgnoreRaces);
 }
 
+// ISPC SpMM schedule 1: chunked `i`, position-space tiles over A's `j`, `k` parallelized as CPUSimd.
+IndexStmt scheduleSpMMISPC1(IndexStmt stmt, Tensor<double> A, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) {
+  IndexVar i0("i0"), i1("i1"), jpos("jpos"), jpos0("jpos0"), jpos1("jpos1");
+  return stmt.split(i, i0, i1, CHUNK_SIZE)
+          .pos(j, jpos, A(i,j))
+          .split(jpos, jpos0, jpos1, UNROLL_FACTOR)
+          .reorder({i0, i1, jpos0, k, jpos1})
+          .parallelize(k, ParallelUnit::CPUSimd, OutputRaceStrategy::IgnoreRaces);
+}
+
+// ISPC SpMM schedule with i0 parallelized as CPUSpmd (NoRaces) and `k` as CPUSimd (IgnoreRaces).
+IndexStmt scheduleSpMMISPCOMP1(IndexStmt stmt, Tensor<double> A, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) {
+  IndexVar i0("i0"), i1("i1"), jpos("jpos"), jpos0("jpos0"), jpos1("jpos1");
+  return stmt.split(i, i0, i1, CHUNK_SIZE).pos(j, jpos, A(i,j))
+          .split(jpos, jpos0, jpos1, UNROLL_FACTOR)
+          .reorder({i0, i1, jpos0, k, jpos1})
+          .parallelize(i0, ParallelUnit::CPUSpmd, OutputRaceStrategy::NoRaces)
+          .parallelize(k, ParallelUnit::CPUSimd, OutputRaceStrategy::IgnoreRaces);
+}
+
+// ISPC SpMM schedule 1_2: chunked `i`, position-space tiles over A's `j`, i0 parallelized as CPUSimd.
+IndexStmt scheduleSpMMISPC1_2(IndexStmt stmt, Tensor<double> A, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) {
+  IndexVar i0("i0"), i1("i1"), jpos("jpos"), jpos0("jpos0"), jpos1("jpos1");
+  return stmt.split(i, i0, i1, CHUNK_SIZE)
+          .pos(j, jpos, A(i,j))
+          .split(jpos, jpos0, jpos1, UNROLL_FACTOR)
+          .reorder({i0, i1, jpos0, k, jpos1})
+          .parallelize(i0, ParallelUnit::CPUSimd, OutputRaceStrategy::IgnoreRaces);
+}
+
+// ISPC SpMM schedule 1_3: chunked `i`, position-space tiles over A's `j`, i1 parallelized as CPUSimd.
+IndexStmt scheduleSpMMISPC1_3(IndexStmt stmt, Tensor<double> A, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) {
+  IndexVar i0("i0"), i1("i1"), jpos("jpos"), jpos0("jpos0"), jpos1("jpos1");
+  return stmt.split(i, i0, i1, CHUNK_SIZE)
+          .pos(j, jpos, A(i,j))
+          .split(jpos, jpos0, jpos1, UNROLL_FACTOR)
+          .reorder({i0, i1, jpos0, k, jpos1})
+          .parallelize(i1, ParallelUnit::CPUSimd, OutputRaceStrategy::IgnoreRaces);
+}
+
+// ISPC SpMM schedule 2: only parallelizes `k` as CPUSimd (IgnoreRaces).
+// A, CHUNK_SIZE and UNROLL_FACTOR are accepted but not used.
+IndexStmt scheduleSpMMISPC2(IndexStmt stmt, Tensor<double> A, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) {
+  return stmt.parallelize(k, ParallelUnit::CPUSimd, OutputRaceStrategy::IgnoreRaces);
+}
+
+// ISPC SpMM schedule 2_2: only parallelizes `i` as CPUSimd (IgnoreRaces).
+// A, CHUNK_SIZE and UNROLL_FACTOR are accepted but not used.
+IndexStmt scheduleSpMMISPC2_2(IndexStmt stmt, Tensor<double> A, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) {
+  return stmt.parallelize(i, ParallelUnit::CPUSimd, OutputRaceStrategy::IgnoreRaces);
+}
+
+// ISPC SpMM schedule 3: no split() or pos() is applied; the schedule only
+// calls reorder({j, k}) and then parallelizes `k` as CPUSimd with
+// IgnoreRaces.
+// A, CHUNK_SIZE and UNROLL_FACTOR are accepted but not used; they keep the
+// signature in line with the other SpMM schedule helpers in this file.
+IndexStmt scheduleSpMMISPC3(IndexStmt stmt, Tensor<double> A, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) {
+  return stmt
+          .reorder({j, k})
+          .parallelize(k, ParallelUnit::CPUSimd, OutputRaceStrategy::IgnoreRaces);
+}
+
+// ISPC SpMM schedule 3_2: no split() or pos() is applied; the schedule only
+// calls reorder({j, k}) and then parallelizes `i` as CPUSimd with
+// IgnoreRaces.
+// A, CHUNK_SIZE and UNROLL_FACTOR are accepted but not used; they keep the
+// signature in line with the other SpMM schedule helpers in this file.
+IndexStmt scheduleSpMMISPC3_2(IndexStmt stmt, Tensor<double> A, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) {
+  return stmt
+          .reorder({j, k})
+          .parallelize(i, ParallelUnit::CPUSimd, OutputRaceStrategy::IgnoreRaces);
+}
+
 IndexStmt scheduleSpGEMMCPU(IndexStmt stmt, bool doPrecompute) {
   Assignment assign = stmt.as<Forall>().getStmt().as<Forall>().getStmt()
                           .as<Forall>().getStmt().as<Assignment>();
@@ -107,6 +155,68 @@ IndexStmt scheduleSDDMMCPU(IndexStmt stmt, Tensor<double> B, int CHUNK_SIZE=16,
           .parallelize(kpos1, ParallelUnit::CPUVector, OutputRaceStrategy::ParallelReduction);
 }
 
+// Identity schedule: returns `stmt` unchanged; B, CHUNK_SIZE, UNROLL_FACTOR unused. Disabled draft kept below.
+IndexStmt scheduleSDDMMCSRCPU(IndexStmt stmt, Tensor<double> B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) {
+  return stmt;
+  // return stmt.split(i, i0, i1, CHUNK_SIZE)
+  //         .pos(k, kpos, B(i,k))
+  //         .split(kpos, kpos0, kpos1, UNROLL_FACTOR)
+  //         .reorder({i0, i1, kpos0, j, kpos1});
+          // .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces);
+          // .parallelize(k, ParallelUnit::CPUVector, OutputRaceStrategy::IgnoreRaces);
+}
+
+IndexStmt scheduleSDDMM2CPU(IndexStmt stmt, Tensor<double> B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) {
+  IndexVar i0("i0"), i1("i1"), jpos("jpos"), jpos0("jpos0"), jpos1("jpos1");
+  // Split `i`, iterate B's `j` by position in UNROLL_FACTOR tiles, then fix the loop order.
+  IndexStmt tiled = stmt.split(i, i0, i1, CHUNK_SIZE).pos(j, jpos, B(i,j))
+                        .split(jpos, jpos0, jpos1, UNROLL_FACTOR)
+                        .reorder({i0, i1, jpos0, k, jpos1});
+  return tiled.parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces)
+              .parallelize(jpos1, ParallelUnit::CPUVector, OutputRaceStrategy::ParallelReduction);
+}
+
+IndexStmt scheduleSDDMMISPC(IndexStmt stmt, Tensor<double> B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) {
+  // No CPUThread parallelism is applied here; only kpos1 is parallelized (CPUSimd).
+  IndexVar i0("i0"), i1("i1"), kpos("kpos"), kpos0("kpos0"), kpos1("kpos1");
+  IndexStmt tiled = stmt.split(i, i0, i1, CHUNK_SIZE)
+                        .pos(k, kpos, B(i,k))
+                        .split(kpos, kpos0, kpos1, UNROLL_FACTOR)
+                        .reorder({i0, i1, kpos0, j, kpos1});
+  return tiled.parallelize(kpos1, ParallelUnit::CPUSimd, OutputRaceStrategy::ParallelReduction);
+}
+
+IndexStmt scheduleSDDMM2ISPC(IndexStmt stmt, Tensor<double> B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) {
+  // No CPUThread parallelism is applied here; only jpos1 is parallelized (CPUSimd).
+  IndexVar i0("i0"), i1("i1"), jpos("jpos"), jpos0("jpos0"), jpos1("jpos1");
+  IndexStmt tiled = stmt.split(i, i0, i1, CHUNK_SIZE)
+                        .pos(j, jpos, B(i,j))
+                        .split(jpos, jpos0, jpos1, UNROLL_FACTOR)
+                        .reorder({i0, i1, jpos0, k, jpos1});
+  return tiled.parallelize(jpos1, ParallelUnit::CPUSimd, OutputRaceStrategy::ParallelReduction);
+}
+
+IndexStmt scheduleSDDMMISPC1(IndexStmt stmt, Tensor<double> B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) {
+  // Parallelizes i0 as CPUThread (NoRaces) and kpos1 as CPUSimd (ParallelReduction).
+  IndexVar i0("i0"), i1("i1"), kpos("kpos"), kpos0("kpos0"), kpos1("kpos1");
+  IndexStmt tiled = stmt.split(i, i0, i1, CHUNK_SIZE).pos(k, kpos, B(i,k))
+                        .split(kpos, kpos0, kpos1, UNROLL_FACTOR)
+                        .reorder({i0, i1, kpos0, j, kpos1});
+  return tiled.parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces)
+              .parallelize(kpos1, ParallelUnit::CPUSimd, OutputRaceStrategy::ParallelReduction);
+}
+
+// Identity schedule: returns `stmt` unchanged; B, CHUNK_SIZE, UNROLL_FACTOR unused. Disabled draft kept below.
+IndexStmt scheduleSDDMMISPC2(IndexStmt stmt, Tensor<double> B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) {
+  return stmt;
+          // .split(i, i0, i1, CHUNK_SIZE)
+          // .pos(k, kpos, B(i,k))
+          // .split(kpos, kpos0, kpos1, UNROLL_FACTOR)
+          // .reorder({i0, i1, kpos0, j, kpos1})
+          // .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces)
+          // .parallelize(kpos1, ParallelUnit::CPUSimd, OutputRaceStrategy::ParallelReduction);
+}
+
 IndexStmt scheduleTTVCPU(IndexStmt stmt, Tensor<double> B, int CHUNK_SIZE=16) {
   IndexVar f("f"), fpos("fpos"), chunk("chunk"), fpos2("fpos2");
   return stmt.fuse(i, j, f)
@@ -116,6 +226,16 @@ IndexStmt scheduleTTVCPU(IndexStmt stmt, Tensor<double> B, int CHUNK_SIZE=16) {
           .parallelize(chunk, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces);
 }
 
+IndexStmt scheduleTTVISPC(IndexStmt stmt, Tensor<double> B, int CHUNK_SIZE=16) {
+  // NOTE(review): `chunk` is parallelized as CPUThread, not CPUSimd, despite the ISPC name -- confirm.
+  IndexVar f("f"), fpos("fpos"), chunk("chunk"), fpos2("fpos2");
+  IndexStmt chunked = stmt.fuse(i, j, f)
+                          .pos(f, fpos, B(i,j,k))
+                          .split(fpos, chunk, fpos2, CHUNK_SIZE)
+                          .reorder({chunk, fpos2, k});
+  return chunked.parallelize(chunk, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces);
+}
+
 IndexStmt scheduleTTVCPUCSR(IndexStmt stmt) {
   TensorVar result = stmt.as<Forall>().getStmt().as<Forall>().getStmt()
                          .as<Forall>().getStmt().as<Assignment>().getLhs()
@@ -125,6 +245,25 @@ IndexStmt scheduleTTVCPUCSR(IndexStmt stmt) {
                           OutputRaceStrategy::NoRaces);
 }
 
+IndexStmt scheduleTTVCPUCSR_ST(IndexStmt stmt) {
+  // Reach the assignment under three nested foralls to find the result tensor; no parallelize().
+  IndexStmt body = stmt.as<Forall>().getStmt().as<Forall>().getStmt().as<Forall>().getStmt();
+  TensorVar result = body.as<Assignment>().getLhs().getTensorVar();
+  return stmt.assemble(result, AssembleStrategy::Insert);
+}
+
+IndexStmt scheduleTTVISPCCSR(IndexStmt stmt) {
+  // Reach the assignment under three nested foralls to find the result tensor.
+  IndexStmt body = stmt.as<Forall>().getStmt().as<Forall>().getStmt().as<Forall>().getStmt();
+  TensorVar result = body.as<Assignment>().getLhs().getTensorVar();
+  return stmt.assemble(result, AssembleStrategy::Insert)
+             .parallelize(i, ParallelUnit::CPUSimd, OutputRaceStrategy::NoRaces);
+}
+
+IndexStmt scheduleTTVISPCCSR2(IndexStmt stmt) {
+  return stmt;  // identity schedule: the statement is returned without any transformation
+}
+
 IndexStmt scheduleTTMCPU(IndexStmt stmt, Tensor<double> B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) {
   IndexVar f("f"), fpos("fpos"), chunk("chunk"), fpos2("fpos2"), kpos("kpos"), kpos1("kpos1"), kpos2("kpos2");
   return stmt.fuse(i, j, f)
@@ -149,12 +288,47 @@ IndexStmt scheduleMTTKRPCPU(IndexStmt stmt, Tensor<double> B, int CHUNK_SIZE=16,
           .parallelize(i1, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces);
 }
 
+IndexStmt scheduleMTTKRPCPU_ST(IndexStmt stmt, Tensor<double> B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) {
+  // No parallelize() is applied. Per the original author's note, parallelizing `j` as
+  // CPUVector gave an error when lowering with IgnoreRaces, NoRaces and Atomics.
+  IndexVar i1("i1"), i2("i2");
+  IndexStmt innermost = stmt.as<Forall>().getStmt().as<Forall>().getStmt()
+                            .as<Forall>().getStmt().as<Forall>().getStmt();
+  IndexExpr precomputeExpr = innermost.as<Assignment>().getRhs().as<Mul>().getA();
+  TensorVar w("w", Type(Float64, {Dimension(j)}), taco::dense);
+  return stmt.split(i, i1, i2, CHUNK_SIZE)
+          .reorder({i1, i2, k, l, j})
+          .precompute(precomputeExpr, j, j, w);
+}
+
+IndexStmt scheduleMTTKRPISPC(IndexStmt stmt, Tensor<double> B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) {
+  // Precompute the left operand of the innermost product into w, then parallelize `j` as CPUSimd.
+  IndexVar i1("i1"), i2("i2");
+  IndexStmt innermost = stmt.as<Forall>().getStmt().as<Forall>().getStmt()
+                            .as<Forall>().getStmt().as<Forall>().getStmt();
+  IndexExpr precomputeExpr = innermost.as<Assignment>().getRhs().as<Mul>().getA();
+  TensorVar w("w", Type(Float64, {Dimension(j)}), taco::dense);
+  return stmt.split(i, i1, i2, CHUNK_SIZE).reorder({i1, i2, k, l, j})
+          .precompute(precomputeExpr, j, j, w)
+          .parallelize(j, ParallelUnit::CPUSimd, OutputRaceStrategy::NoRaces);
+}
+
 IndexStmt scheduleMTTKRPPrecomputedCPU(IndexStmt stmt, Tensor<double> B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) {
   IndexVar i1("i1"), i2("i2"), j_pre("j_pre");
   return stmt.split(i, i1, i2, CHUNK_SIZE)
           .parallelize(i1, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces);
 }
 
+IndexStmt scheduleMTTKRPPrecomputedCPU_ST(IndexStmt stmt, Tensor<double> B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) {
+  IndexVar i1("i1"), i2("i2");  // the two loops produced by splitting `i`
+  return stmt.split(i, i1, i2, CHUNK_SIZE);  // no parallelize(): B and UNROLL_FACTOR are unused
+}
+
+// Only parallelizes `j` as CPUSimd (NoRaces); B, CHUNK_SIZE and UNROLL_FACTOR are not used.
+IndexStmt scheduleMTTKRPPrecomputedISPC_ST(IndexStmt stmt, Tensor<double> B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) {
+  return stmt.parallelize(j, ParallelUnit::CPUSimd, OutputRaceStrategy::NoRaces);
+}
+
 IndexStmt scheduleMTTKRP4CPU(IndexStmt stmt, Tensor<double> B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) {
   IndexVar i1("i1"), i2("i2");
   return stmt.split(i, i1, i2, CHUNK_SIZE)
@@ -162,6 +336,19 @@ IndexStmt scheduleMTTKRP4CPU(IndexStmt stmt, Tensor<double> B, int CHUNK_SIZE=16
           .parallelize(i1, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces);
 }
 
+IndexStmt scheduleMTTKRP4CPU_ST(IndexStmt stmt, Tensor<double> B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) {
+  // Split `i` by CHUNK_SIZE and fix the loop order; no parallelize() is applied.
+  IndexVar i1("i1"), i2("i2");
+  return stmt.split(i, i1, i2, CHUNK_SIZE).reorder({i1, i2, k, l, m, j});
+}
+
+IndexStmt scheduleMTTKRP4ISPC_ST(IndexStmt stmt, Tensor<double> B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) {
+  // Split `i` by CHUNK_SIZE, fix the loop order, then parallelize `j` as CPUSimd (NoRaces).
+  IndexVar i1("i1"), i2("i2");
+  IndexStmt ordered = stmt.split(i, i1, i2, CHUNK_SIZE).reorder({i1, i2, k, l, m, j});
+  return ordered.parallelize(j, ParallelUnit::CPUSimd, OutputRaceStrategy::NoRaces);
+}
+
 IndexStmt scheduleMTTKRP5CPU(IndexStmt stmt, Tensor<double> B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) {
   IndexVar i1("i1"), i2("i2");
   return stmt.split(i, i1, i2, CHUNK_SIZE)
@@ -576,6 +763,92 @@ TEST(scheduling_eval, spmmCPU) {
   ASSERT_TENSOR_EQ(expected, C);
 }
 
+TEST(scheduling_eval, spmmISPC) {
+  taco::util::TimeResults timevalue;
+  bool time                = true;  // presumably read by TOOL_BENCHMARK_TIMER -- confirm against the macro
+  // Both alternative code generators are switched off while the inputs are built.
+  set_ISPC_codegen_enabled(false);
+  set_CUDA_codegen_enabled(false);
+  
+  int NUM_I = 1021/10;
+  int NUM_J = 1039/10;
+  int NUM_K = 128;
+  float SPARSITY = .1;
+  Tensor<double> A("A", {NUM_I, NUM_J}, CSR);
+  Tensor<double> B("B", {NUM_J, NUM_K}, {Dense, Dense});
+  Tensor<double> C("C", {NUM_I, NUM_K}, {Dense, Dense});
+  // Fixed seed; A receives an entry only when the random draw falls below SPARSITY.
+  srand(75883);
+  for (int i = 0; i < NUM_I; i++) {
+    for (int j = 0; j < NUM_J; j++) {
+      float rand_float = (float)rand()/(float)(RAND_MAX);
+      if (rand_float < SPARSITY) {
+        A.insert({i, j}, (double) ((int) (rand_float*3/SPARSITY)));
+      }
+    }
+  }
+  // B receives a value at every (j, k) position.
+  for (int j = 0; j < NUM_J; j++) {
+    for (int k = 0; k < NUM_K; k++) {
+      float rand_float = (float)rand()/(float)(RAND_MAX);
+      B.insert({j, k}, (double) ((int) (rand_float*3/SPARSITY)));
+    }
+  }
+
+  A.pack();
+  B.pack();
+  // Build and run C with ISPC code generation enabled, using one of the candidate schedules.
+  set_ISPC_codegen_enabled(true);
+  C(i, k) = A(i, j) * B(j, k);
+
+  IndexStmt stmt = C.getAssignment().concretize();
+  // stmt = scheduleSpMMISPC1(stmt, A);
+  // stmt = scheduleSpMMISPC1_2(stmt, A);
+  stmt = scheduleSpMMISPC1_3(stmt, A);
+  
+  // stmt = scheduleSpMMISPC2(stmt, A);
+  // stmt = scheduleSpMMISPC2_2(stmt, A);
+  
+  // stmt = scheduleSpMMISPC3(stmt, A);
+  // stmt = scheduleSpMMISPC3_2(stmt, A);
+
+  //printToFile("spmm_cpu", stmt);
+
+  C.compile(stmt);
+  C.assemble();
+  C.compute();
+  // Reference result: the same expression with ISPC code generation off and scheduleSpMMCPU.
+  set_ISPC_codegen_enabled(false);
+  Tensor<double> expected("expected", {NUM_I, NUM_K}, {Dense, Dense});
+  expected(i, k) = A(i, j) * B(j, k);
+  IndexStmt stmt_taco = expected.getAssignment().concretize();
+  stmt_taco = scheduleSpMMCPU(stmt_taco, A);
+
+  expected.compile(stmt_taco);
+  expected.assemble();
+  expected.compute();
+  ASSERT_TENSOR_EQ(expected, C);
+
+  // NOTE(review): the local `float ERROR_MARGIN = 0.01;` is commented out; confirm ERROR_MARGIN comes from util.h.
+  // Element-wise comparison within ERROR_MARGIN, in addition to ASSERT_TENSOR_EQ above.
+  for (int i = 0; i < NUM_I; i++) {
+    for (int k = 0; k < NUM_K; k++) {
+      if (expected(i,k) <= C(i,k) + ERROR_MARGIN && expected(i,k) >= C(i,k) - ERROR_MARGIN) {
+        // std::cout << "matched values: expected -> " << expected(j) << " == " << y(j) << " <- actual\n";
+      }
+      else {
+        std::cout << "unmatched values: expected -> " << expected(i,k) << " != " << C(i,k) << " <- actual\n";
+        ASSERT_TRUE(false);
+      };
+    }
+  }
+  // Ten timed runs of each compute kernel through TOOL_BENCHMARK_TIMER.
+  for (int i=0; i<10; i++) {
+    TOOL_BENCHMARK_TIMER(C.compute(), "Compute ISPC: ", timevalue);
+    TOOL_BENCHMARK_TIMER(expected.compute(), "Compute TACO: ", timevalue);
+  }
+}
+
 struct spgemm : public TestWithParam<std::tuple<Format,Format,bool>> {};
 
 TEST_P(spgemm, scheduling_eval) {
@@ -805,7 +1078,7 @@ TEST(scheduling_eval, sddmmCPU) {
   IndexStmt stmt = A.getAssignment().concretize();
   stmt = scheduleSDDMMCPU(stmt, B);
 
-  //printToFile("sddmm_cpu", stmt);
+  printToFile("sddmm_cpu_ryan2", stmt);
 
   A.compile(stmt);
   A.assemble();
@@ -819,55 +1092,69 @@ TEST(scheduling_eval, sddmmCPU) {
   ASSERT_TENSOR_EQ(expected, A);
 }
 
-TEST(scheduling_eval, spmvCPU) {
-  if (should_use_CUDA_codegen()) {
+TEST(scheduling_eval, sddmmSPMMFusedCPU) {
+  if (should_use_CUDA_codegen() || should_use_ISPC_codegen()) {
     return;
   }
+
   int NUM_I = 1021/10;
   int NUM_J = 1039/10;
+  int NUM_K = 1057/10;
   float SPARSITY = .3;
-  Tensor<double> A("A", {NUM_I, NUM_J}, CSR);
-  Tensor<double> x("x", {NUM_J}, Format({Dense}));
-  Tensor<double> y("y", {NUM_I}, Format({Dense}));
+  Tensor<double> A("A", {NUM_I, NUM_K}, {Dense, Dense});
+  Tensor<double> B("B", {NUM_I, NUM_K}, CSR);
+  Tensor<double> C("C", {NUM_I, NUM_J}, {Dense, Dense});
+  Tensor<double> D("D", {NUM_J, NUM_K}, {Dense, Dense});
 
-  srand(120);
+  srand(268238);
   for (int i = 0; i < NUM_I; i++) {
     for (int j = 0; j < NUM_J; j++) {
+      float rand_float = (float)rand()/(float)(RAND_MAX);
+      C.insert({i, j}, (double) ((int) (rand_float*3/SPARSITY)));
+    }
+  }
+
+  for (int i = 0; i < NUM_I; i++) {
+    for (int k = 0; k < NUM_K; k++) {
       float rand_float = (float)rand()/(float)(RAND_MAX);
       if (rand_float < SPARSITY) {
-        A.insert({i, j}, (double) ((int) (rand_float * 3 / SPARSITY)));
+        B.insert({i, k}, (double) ((int) (rand_float*3/SPARSITY)));
       }
     }
   }
 
   for (int j = 0; j < NUM_J; j++) {
-    float rand_float = (float)rand()/(float)(RAND_MAX);
-    x.insert({j}, (double) ((int) (rand_float*3/SPARSITY)));
+    for (int k = 0; k < NUM_K; k++) {
+      float rand_float = (float)rand()/(float)(RAND_MAX);
+      D.insert({j, k}, (double) ((int) (rand_float*3/SPARSITY)));
+    }
   }
 
-  x.pack();
-  A.pack();
+  B.pack();
+  C.pack();
+  D.pack();
 
-  y(i) = A(i, j) * x(j);
+  A(i,k) = B(i,k) * C(i,j) * D(j,k);
 
-  IndexStmt stmt = y.getAssignment().concretize();
-  stmt = scheduleSpMVCPU(stmt);
+  IndexStmt stmt = A.getAssignment().concretize();
+  stmt = scheduleSDDMMCPU(stmt, B);
 
-  //printToFile("spmv_cpu", stmt);
+  printToFile("sddmm_cpu_ryan2", stmt);
 
-  y.compile(stmt);
-  y.assemble();
-  y.compute();
+  A.compile(stmt);
+  A.assemble();
+  A.compute();
 
-  Tensor<double> expected("expected", {NUM_I}, Format({Dense}));
-  expected(i) = A(i, j) * x(j);
+  Tensor<double> expected("expected", {NUM_I, NUM_K}, {Dense, Dense});
+  expected(i,k) = B(i,k) * C(i,j) * D(j,k);
   expected.compile();
   expected.assemble();
   expected.compute();
-  ASSERT_TENSOR_EQ(expected, y);
+  ASSERT_TENSOR_EQ(expected, A);
 }
 
-TEST(scheduling_eval, ttvCPU) {
+
+TEST(scheduling_eval, sddmmcsrCPU) {
   if (should_use_CUDA_codegen()) {
     return;
   }
@@ -875,7 +1162,495 @@ TEST(scheduling_eval, ttvCPU) {
   int NUM_J = 1039/10;
   int NUM_K = 1057/10;
   float SPARSITY = .3;
-  Tensor<double> A("A", {NUM_I, NUM_J}, {Dense, Dense}); // TODO: change to sparse outputs
+  Tensor<double> A("A", {NUM_I, NUM_K}, CSR);
+  Tensor<double> B("B", {NUM_I, NUM_K}, CSR);
+  Tensor<double> C("C", {NUM_I, NUM_J}, {Dense, Dense});
+  Tensor<double> D("D", {NUM_J, NUM_K}, {Dense, Dense});
+
+  srand(268238);
+  for (int i = 0; i < NUM_I; i++) {
+    for (int j = 0; j < NUM_J; j++) {
+      float rand_float = (float)rand()/(float)(RAND_MAX);
+      C.insert({i, j}, (double) ((int) (rand_float*3/SPARSITY)));
+    }
+  }
+
+  for (int i = 0; i < NUM_I; i++) {
+    for (int k = 0; k < NUM_K; k++) {
+      float rand_float = (float)rand()/(float)(RAND_MAX);
+      if (rand_float < SPARSITY) {
+        B.insert({i, k}, (double) ((int) (rand_float*3/SPARSITY)));
+      }
+    }
+  }
+
+  for (int j = 0; j < NUM_J; j++) {
+    for (int k = 0; k < NUM_K; k++) {
+      float rand_float = (float)rand()/(float)(RAND_MAX);
+      D.insert({j, k}, (double) ((int) (rand_float*3/SPARSITY)));
+    }
+  }
+
+  B.pack();
+  C.pack();
+  D.pack();
+
+  A(i,k) = B(i,k) * C(i,j) * D(j,k);
+
+  IndexStmt stmt = A.getAssignment().concretize();
+  stmt = scheduleSDDMMCSRCPU(stmt, B);
+
+  printToFile("sddmm_cpu", stmt);
+
+  A.compile(stmt);
+  A.assemble();
+  A.compute();
+
+  Tensor<double> expected("expected", {NUM_I, NUM_K}, CSR);
+  expected(i,k) = B(i,k) * C(i,j) * D(j,k);
+  
+  IndexStmt stmt_ref = expected.getAssignment().concretize();
+  printToFile("sddmm_cpu_ref", stmt_ref);
+
+  expected.compile(stmt_ref);
+  expected.assemble();
+  expected.compute();
+  ASSERT_TENSOR_EQ(expected, A);
+}
+
+
+TEST(scheduling_eval, sddmm2CPU) {
+  if (should_use_CUDA_codegen()) {
+    return;
+  }
+  int NUM_I = 1021/10;
+  int NUM_J = 1021/10;
+  int NUM_K = 18;
+  float SPARSITY = .3;
+  Tensor<double> Y("Y", {NUM_I, NUM_J}, {Dense, Compressed(ModeFormat::UNIQUE)});
+  Tensor<double> A("A", {NUM_I, NUM_J}, {Dense, Compressed(ModeFormat::UNIQUE)});
+  Tensor<double> X("X", {NUM_I, NUM_K}, {Dense, Dense});
+
+  srand(268238);
+  // A receives an entry only when the random draw falls below SPARSITY.
+  for (int i = 0; i < NUM_I; i++) {
+    for (int j = 0; j < NUM_J; j++) {
+      float rand_float = (float)rand()/(float)(RAND_MAX);
+      if (rand_float < SPARSITY) {
+        A.insert({i, j}, (double) ((int) (rand_float*3/SPARSITY)));
+      }
+    }
+  }
+  // X is (NUM_I x NUM_K) and is filled at every position.
+  for (int i = 0; i < NUM_I; i++) {
+    for (int k = 0; k < NUM_K; k++) {
+      float rand_float = (float)rand()/(float)(RAND_MAX);
+      X.insert({i, k}, (double) ((int) (rand_float*3/SPARSITY)));
+    }
+  }
+
+  A.pack();
+  X.pack();
+  // X is indexed as (j,k) so both accesses agree with X's shape (NUM_I == NUM_J here).
+  Y(i,j) = A(i,j) * X(i,k) * X(j,k);
+  // NOTE(review): nothing is compiled, run or asserted yet; the draft below is disabled.
+  // IndexStmt stmt = A.getAssignment().concretize();
+  // // stmt = scheduleSDDMMCPU(stmt, A);
+
+  // printToFile("sddmm2_cpu", stmt);
+
+  // A.compile(stmt);
+  // A.assemble();
+  // A.compute();
+
+  // Tensor<double> expected("expected", {NUM_I, NUM_J}, {Dense, Dense});
+  // expected(i,j) = A(i,j) * X(i,k) * X(j,k);
+  // expected.compile();
+  // expected.assemble();
+  // expected.compute();
+  // ASSERT_TENSOR_EQ(expected, A);
+}
+
+
+
+// bin/taco-test --gtest_filter=scheduling_eval.sddmmISPC
+TEST(scheduling_eval, sddmmISPC) {
+  // Compares A(i,k) = B(i,k) * C(i,j) * D(j,k) built with ISPC code generation against a reference.
+  taco::util::TimeResults timevalue;
+  bool time                = true;  // presumably read by TOOL_BENCHMARK_TIMER -- confirm against the macro
+
+  set_CUDA_codegen_enabled(false);
+  set_ISPC_codegen_enabled(false);
+
+  int NUM_I = 1021/10;
+  int NUM_J = 1039/10;
+  int NUM_K = 1057/10;
+  float SPARSITY = .3;
+  Tensor<double> A("A", {NUM_I, NUM_K}, {Dense, Dense});
+  Tensor<double> B("B", {NUM_I, NUM_K}, CSR);
+  Tensor<double> C("C", {NUM_I, NUM_J}, {Dense, Dense});
+  Tensor<double> D("D", {NUM_J, NUM_K}, {Dense, Dense});
+
+  srand(268238);
+  for (int i = 0; i < NUM_I; i++) {
+    for (int j = 0; j < NUM_J; j++) {
+      float rand_float = (float)rand()/(float)(RAND_MAX);
+      C.insert({i, j}, (double) ((int) (rand_float*3/SPARSITY)));
+    }
+  }
+  // B receives an entry only when the random draw falls below SPARSITY.
+  for (int i = 0; i < NUM_I; i++) {
+    for (int k = 0; k < NUM_K; k++) {
+      float rand_float = (float)rand()/(float)(RAND_MAX);
+      if (rand_float < SPARSITY) {
+        B.insert({i, k}, (double) ((int) (rand_float*3/SPARSITY)));
+      }
+    }
+  }
+
+  for (int j = 0; j < NUM_J; j++) {
+    for (int k = 0; k < NUM_K; k++) {
+      float rand_float = (float)rand()/(float)(RAND_MAX);
+      D.insert({j, k}, (double) ((int) (rand_float*3/SPARSITY)));
+    }
+  }
+
+  B.pack();
+  C.pack();
+  D.pack();
+  // ISPC version: compiled and assembled here, computed later under the timer.
+  set_ISPC_codegen_enabled(true);
+  A(i,k) = B(i,k) * C(i,j) * D(j,k);
+
+  IndexStmt stmt = A.getAssignment().concretize();
+  stmt = scheduleSDDMMISPC(stmt, B);
+
+  //printToFile("sddmm_cpu", stmt);
+
+  A.compile(stmt);
+  A.assemble();
+  // A.compute();
+  // Reference version: built from expected's own assignment with ISPC code generation off.
+  set_ISPC_codegen_enabled(false);
+  Tensor<double> expected("expected", {NUM_I, NUM_K}, {Dense, Dense});
+  expected(i,k) = B(i,k) * C(i,j) * D(j,k);
+  IndexStmt stmt_taco = expected.getAssignment().concretize();
+  stmt_taco = scheduleSDDMMCPU(stmt_taco, B);
+  expected.compile(stmt_taco);
+  expected.assemble();
+  // expected.compute();
+
+  TOOL_BENCHMARK_TIMER(A.compute(), "Compute ISPC: ", timevalue);
+  TOOL_BENCHMARK_TIMER(expected.compute(), "Compute TACO: ", timevalue);
+
+  ASSERT_TENSOR_EQ(expected, A);
+
+  // NOTE(review): the local `float ERROR_MARGIN = 0.01;` is commented out; confirm that
+  // ERROR_MARGIN is provided by util.h.
+  // Element-wise comparison within ERROR_MARGIN, in addition to ASSERT_TENSOR_EQ above.
+  for (int i = 0; i < NUM_I; i++) {
+    for (int k = 0; k < NUM_K; k++) {
+      if (expected(i,k) <= A(i,k) + ERROR_MARGIN && expected(i,k) >= A(i,k) - ERROR_MARGIN) {
+        // std::cout << "matched values: expected -> " << expected(j) << " == " << y(j) << " <- actual\n";
+      }
+      else {
+        std::cout << "unmatched values: expected -> " << expected(i,k) << " != " << A(i,k) << " <- actual\n";
+        ASSERT_TRUE(false);
+      };
+    }
+  }
+  std::cout << "test scheduling_eval.sddmmISPC passed\n";
+
+}
+
+
+// bin/taco-test --gtest_filter=scheduling_eval.sddmm2ISPC
+TEST(scheduling_eval, sddmm2ISPC) {
+  // SDDMM variant A(i,j) = B(i,j) * C(i,k) * C(j,k): ISPC-scheduled kernel vs. CPU-scheduled reference.
+  taco::util::TimeResults timevalue;
+  bool time                = true;
+
+  set_CUDA_codegen_enabled(false);
+  set_ISPC_codegen_enabled(false);
+  // C is indexed by both i and j in the expression below, so NUM_J is kept equal to NUM_I.
+  int NUM_I = 1021/10;
+  int NUM_K = 1039/10;
+  int NUM_J = 1021/10;
+  float SPARSITY = .3;
+  Tensor<double> A("A", {NUM_I, NUM_J}, {Dense, Dense});
+  Tensor<double> B("B", {NUM_I, NUM_J}, CSR);
+  Tensor<double> C("C", {NUM_I, NUM_K}, {Dense, Dense});
+
+  srand(268238);
+  for (int i = 0; i < NUM_I; i++) {
+    for (int k = 0; k < NUM_K; k++) {
+      float rand_float = (float)rand()/(float)(RAND_MAX);
+      C.insert({i, k}, (double) ((int) (rand_float*3/SPARSITY)));
+    }
+  }
+
+  for (int i = 0; i < NUM_I; i++) {
+    for (int j = 0; j < NUM_J; j++) {
+      float rand_float = (float)rand()/(float)(RAND_MAX);
+      if (rand_float < SPARSITY) {
+        B.insert({i, j}, (double) ((int) (rand_float*3/SPARSITY)));
+      }
+    }
+  }
+
+  B.pack();
+  C.pack();
+  // Build the kernel under test with ISPC code generation switched on.
+  set_ISPC_codegen_enabled(true);
+  A(i,j) = B(i,j) * C(i,k) * C(j,k);
+
+  IndexStmt stmt = A.getAssignment().concretize();
+  stmt = scheduleSDDMM2ISPC(stmt, B);
+
+  //printToFile("sddmm_cpu", stmt);
+
+  A.compile(stmt);
+  A.assemble();
+  // A.compute();
+  // Build the reference with ISPC off, from expected's own assignment (not A's).
+  set_ISPC_codegen_enabled(false);
+  Tensor<double> expected("expected", {NUM_I, NUM_J}, {Dense, Dense});
+  expected(i,j) = B(i,j) * C(i,k) * C(j,k);
+  IndexStmt stmt_taco = expected.getAssignment().concretize();
+  stmt_taco = scheduleSDDMM2CPU(stmt_taco, B);
+  expected.compile(stmt_taco);
+  expected.assemble();
+  // expected.compute();
+
+  TOOL_BENCHMARK_TIMER(A.compute(), "Compute ISPC: ", timevalue);
+  TOOL_BENCHMARK_TIMER(expected.compute(), "Compute TACO: ", timevalue);
+
+  ASSERT_TENSOR_EQ(expected, A);
+  // Element-wise re-check within ERROR_MARGIN; prints the first mismatching pair before failing.
+  // NOTE(review): ERROR_MARGIN is not declared in this test (its local declaration is commented out); assumed file-scope.
+  // float ERROR_MARGIN = 0.01;
+  // ASSERT_TENSOR_VAL(expected, y);
+  for (int i = 0; i < NUM_I; i++) {
+    for (int j = 0; j < NUM_J; j++) {
+      if (expected(i,j) <= A(i,j) + ERROR_MARGIN && expected(i,j) >= A(i,j) - ERROR_MARGIN) {
+        // std::cout << "matched values: expected -> " << expected(j) << " == " << y(j) << " <- actual\n";
+      }
+      else {
+        std::cout << "unmatched values: expected -> " << expected(i,j) << " != " << A(i,j) << " <- actual\n";
+        ASSERT_TRUE(false);
+      };
+    }
+  }
+  std::cout << "test scheduling_eval.sddmm2ISPC passed\n";
+
+}
+
+
+TEST(scheduling_eval, spmvCPU) {
+  if (should_use_CUDA_codegen()) {
+    return;  // nothing to do when CUDA codegen is active
+  }
+  const int numRows = 1021/10;
+  const int numCols = 1039/10;
+  const float density = .3;
+  Tensor<double> A("A", {numRows, numCols}, CSR);
+  Tensor<double> x("x", {numCols}, Format({Dense}));
+  Tensor<double> y("y", {numRows}, Format({Dense}));
+  // Fixed seed: the sequence of rand() draws below fully determines A and x.
+  srand(120);
+  for (int row = 0; row < numRows; ++row) {
+    for (int col = 0; col < numCols; ++col) {
+      const float draw = static_cast<float>(rand()) / static_cast<float>(RAND_MAX);
+      if (draw < density) {
+        A.insert({row, col}, static_cast<double>(static_cast<int>(draw * 3 / density)));
+      }
+    }
+  }
+  // Every entry of x receives a value.
+  for (int col = 0; col < numCols; ++col) {
+    const float draw = static_cast<float>(rand()) / static_cast<float>(RAND_MAX);
+    x.insert({col}, static_cast<double>(static_cast<int>(draw * 3 / density)));
+  }
+
+  x.pack();
+  A.pack();
+  // Sparse matrix-vector product, run through the CPU schedule.
+  y(i) = A(i, j) * x(j);
+
+  IndexStmt concrete = y.getAssignment().concretize();
+  IndexStmt scheduled = scheduleSpMVCPU(concrete);
+
+  //printToFile("spmv_cpu", scheduled);
+
+  y.compile(scheduled);
+  y.assemble();
+  y.compute();
+  // Reference result from the default (unscheduled) compilation of the same expression.
+  Tensor<double> expected("expected", {numRows}, Format({Dense}));
+  expected(i) = A(i, j) * x(j);
+  expected.compile();
+  expected.assemble();
+  expected.compute();
+  ASSERT_TENSOR_EQ(expected, y);
+}
+
+
+TEST(scheduling_eval, spmvISPC) {
+  // SpMV y(i) = A(i,j) * x(j): ISPC-generated kernel checked and timed against the CPU-scheduled one.
+  taco::util::TimeResults timevalue;
+  bool time                = true;
+
+  set_ISPC_codegen_enabled(false);
+  set_CUDA_codegen_enabled(false);
+
+  int NUM_I = 200021/10;
+  int NUM_J = 200039/10;
+  float SPARSITY = .2;
+  Tensor<double> A("A", {NUM_I, NUM_J}, CSR);
+  Tensor<double> x("x", {NUM_J}, Format({Dense}));
+  Tensor<double> y("y", {NUM_I}, Format({Dense}));
+
+  srand(120);
+  for (int i = 0; i < NUM_I; i++) {
+    for (int j = 0; j < NUM_J; j++) {
+      float rand_float = (float)rand()/(float)(RAND_MAX);
+      if (rand_float < SPARSITY) {
+        A.insert({i, j}, (double) ((int) (rand_float * 3 / SPARSITY)));
+      }
+    }
+  }
+
+  for (int j = 0; j < NUM_J; j++) {
+    float rand_float = (float)rand()/(float)(RAND_MAX);
+    x.insert({j}, (double) ((int) (rand_float*3/SPARSITY)));
+  }
+
+  x.pack();
+  A.pack();
+  // Kernel under test: compiled with ISPC on; the statement is left unscheduled (schedule call is commented out).
+  set_ISPC_codegen_enabled(true);
+
+  y(i) = A(i, j) * x(j);
+
+  IndexStmt stmt = y.getAssignment().concretize();
+  // stmt = scheduleSpMVISPC(stmt);
+
+  printToFile("spmv_cpu", stmt);
+
+  y.compile(stmt);
+  y.assemble();
+  // y.compile();
+
+  set_ISPC_codegen_enabled(false);
+  // Reference: same expression compiled with ISPC off and the CPU SpMV schedule.
+  // Tensor<double> expected("expected", {NUM_I}, Format({Dense}));
+  // expected(i) = A(i, j) * x(j);
+  // expected.compile();
+  // expected.assemble();
+  // expected.compute();
+
+
+  Tensor<double> expected("expected", {NUM_I}, Format({Dense}));
+  expected(i) = A(i, j) * x(j);
+  IndexStmt stmt_taco = expected.getAssignment().concretize();
+  stmt_taco = scheduleSpMVCPU(stmt_taco);
+
+  expected.compile(stmt_taco);
+  expected.assemble();
+  // expected.compile();
+
+
+  TOOL_BENCHMARK_TIMER(y.compute(), "Compute ISPC: ", timevalue);
+  TOOL_BENCHMARK_TIMER(expected.compute(), "Compute TACO: ", timevalue);
+
+
+  ASSERT_TENSOR_EQ(expected, y);
+  // y and expected have NUM_I entries, so the element-wise check stops at NUM_I (NUM_J would read one past the end).
+  // float ERROR_MARGIN = 0.01;
+  // ASSERT_TENSOR_VAL(expected, y);
+  for (int j = 0; j < NUM_I; j++) {
+    if (expected(j) <= y(j) + ERROR_MARGIN && expected(j) >= y(j) - ERROR_MARGIN) {
+      // std::cout << "matched values: expected -> " << expected(j) << " == " << y(j) << " <- actual\n";
+    }
+    else {
+      std::cout << "unmatched values: expected -> " << expected(j) << " != " << y(j) << " <- actual\n";
+      ASSERT_TRUE(false);
+    };
+  }
+
+  std::cout << "test scheduling_eval.spmvISPC passed\n";
+  // Extra timing rounds; results are not re-checked.
+  for (int i=0; i<10; i++) {
+    TOOL_BENCHMARK_TIMER(y.compute(), "Compute ISPC: ", timevalue);
+    TOOL_BENCHMARK_TIMER(expected.compute(), "Compute TACO: ", timevalue);
+  }
+
+
+}
+
+TEST(scheduling_eval, ttvCPU) {
+  if (should_use_CUDA_codegen()) {
+    return;  // nothing to do when CUDA codegen is active
+  }
+  const int dimI = 1021/10;
+  const int dimJ = 1039/10;
+  const int dimK = 1057/10;
+  const float density = .3;
+  Tensor<double> A("A", {dimI, dimJ}, {Dense, Dense}); // TODO: change to sparse outputs
+  Tensor<double> B("B", {dimI, dimJ, dimK}, {Sparse, Sparse, Sparse});
+  Tensor<double> c("c", {dimK}, Format({Dense}));
+  // Fixed seed: the sequence of rand() draws below fully determines B and c.
+  srand(9536);
+  for (int iPos = 0; iPos < dimI; ++iPos) {
+    for (int jPos = 0; jPos < dimJ; ++jPos) {
+      for (int kPos = 0; kPos < dimK; ++kPos) {
+        const float draw = static_cast<float>(rand()) / static_cast<float>(RAND_MAX);
+        if (draw < density) {
+          B.insert({iPos, jPos, kPos}, static_cast<double>(static_cast<int>(draw * 3 / density)));
+        }
+      }
+    }
+  }
+  // Every entry of c receives a value.
+  for (int kPos = 0; kPos < dimK; ++kPos) {
+    const float draw = static_cast<float>(rand()) / static_cast<float>(RAND_MAX);
+    c.insert({kPos}, static_cast<double>(static_cast<int>(draw * 3)));
+  }
+
+  B.pack();
+  c.pack();
+  // Tensor-times-vector, run through the CPU schedule.
+  A(i,j) = B(i,j,k) * c(k);
+
+  IndexStmt concrete = A.getAssignment().concretize();
+  IndexStmt scheduled = scheduleTTVCPU(concrete, B);
+
+  printToFile("ttv_cpu", scheduled);
+
+  A.compile(scheduled);
+  A.assemble();
+  A.compute();
+  // Reference result from the default (unscheduled) compilation of the same expression.
+  Tensor<double> expected("expected", {dimI, dimJ}, {Dense, Dense});
+  expected(i,j) = B(i,j,k) * c(k);
+  expected.compile();
+  expected.assemble();
+  expected.compute();
+  ASSERT_TENSOR_EQ(expected, A);
+}
+
+
+TEST(scheduling_eval, ttvISPC) {
+  if (should_use_CUDA_codegen()) {
+    return;
+  }
+  set_CUDA_codegen_enabled(false);
+  set_ISPC_codegen_enabled(false);
+  int NUM_I = 1021/10;
+  int NUM_J = 1039/10;
+  int NUM_K = 1057/10;
+  float SPARSITY = .3;
+  Tensor<double> A("A", {NUM_I, NUM_J}, {Dense, Dense}); // TODO: change to sparse outputs
   Tensor<double> B("B", {NUM_I, NUM_J, NUM_K}, {Sparse, Sparse, Sparse});
   Tensor<double> c("c", {NUM_K}, Format({Dense}));
 
@@ -899,25 +1674,30 @@ TEST(scheduling_eval, ttvCPU) {
   B.pack();
   c.pack();
 
+  set_ISPC_codegen_enabled(true);
   A(i,j) = B(i,j,k) * c(k);
 
   IndexStmt stmt = A.getAssignment().concretize();
-  stmt = scheduleTTVCPU(stmt, B);
+  stmt = scheduleTTVISPC(stmt, B);
 
-  //printToFile("ttv_cpu", stmt);
+  printToFile("ttv_ispc", "__ttv_ispc", stmt);
 
   A.compile(stmt);
   A.assemble();
   A.compute();
 
+  set_ISPC_codegen_enabled(false);
   Tensor<double> expected("expected", {NUM_I, NUM_J}, {Dense, Dense});
   expected(i,j) = B(i,j,k) * c(k);
+  IndexStmt stmt_taco = expected.getAssignment().concretize();
+  stmt_taco = scheduleTTVCPU(stmt_taco, B);
   expected.compile();
   expected.assemble();
   expected.compute();
   ASSERT_TENSOR_EQ(expected, A);
 }
 
+
 TEST(scheduling_eval, ttvCPU_CSR) {
   if (should_use_CUDA_codegen()) {
     return;
@@ -928,7 +1708,7 @@ TEST(scheduling_eval, ttvCPU_CSR) {
   int NUM_K = 1057/10;
   float SPARSITY = .3;
   Tensor<double> A("A", {NUM_I, NUM_J}, {Dense, Sparse});
-  Tensor<double> B("B", {NUM_I, NUM_J, NUM_K}, {Sparse, Sparse, Sparse});
+  Tensor<double> B("B", {NUM_I, NUM_J, NUM_K}, {Dense, Sparse, Sparse});
   Tensor<double> c("c", {NUM_K}, Format({Dense}));
 
   srand(9536);
@@ -956,11 +1736,13 @@ TEST(scheduling_eval, ttvCPU_CSR) {
   IndexStmt stmt = A.getAssignment().concretize();
   stmt = scheduleTTVCPUCSR(stmt);
 
+  printToFile("ttv_cpu_csr", stmt);
+
   A.compile(stmt);
   A.assemble();
   A.compute();
 
-  Tensor<double> expected("expected", {NUM_I, NUM_J}, {Dense, Dense});
+  Tensor<double> expected("expected", {NUM_I, NUM_J}, {Dense, Sparse});
   expected(i,j) = B(i,j,k) * c(k);
   expected.compile();
   expected.assemble();
@@ -968,6 +1750,82 @@ TEST(scheduling_eval, ttvCPU_CSR) {
   ASSERT_TENSOR_EQ(expected, A);
 }
 
+TEST(scheduling_eval, ttvISPC_CSR) {
+  if (should_use_CUDA_codegen()) {
+    return;
+  }
+  // TTV A(i,j) = B(i,j,k) * c(k) with a {Dense, Sparse} output: ISPC kernels vs. a CPU-scheduled reference.
+  int NUM_I = 10000;
+  int NUM_J = 1039/10;
+  int NUM_K = 128;
+  float SPARSITY = .3;
+  Tensor<double> A("A", {NUM_I, NUM_J}, {Dense, Sparse});
+  Tensor<double> B("B", {NUM_I, NUM_J, NUM_K}, {Dense, Sparse, Sparse});
+  Tensor<double> c("c", {NUM_K}, Format({Dense}));
+
+  srand(9536);
+  for (int i = 0; i < NUM_I; i++) {
+    for (int j = 0; j < NUM_J; j++) {
+      for (int k = 0; k < NUM_K; k++) {
+        float rand_float = (float) rand() / (float) (RAND_MAX);
+        if (rand_float < SPARSITY) {
+          B.insert({i, j, k}, (double) ((int) (rand_float * 3 / SPARSITY)));
+        }
+      }
+    }
+  }
+
+  for (int k = 0; k < NUM_K; k++) {
+    float rand_float = (float)rand()/(float)(RAND_MAX);
+    c.insert({k}, (double) ((int) (rand_float*3)));
+  }
+
+  B.pack();
+  c.pack();
+  // Kernel under test: ISPC codegen with the ISPC CSR schedule.
+  set_ISPC_codegen_enabled(true);
+  A(i,j) = B(i,j,k) * c(k);
+
+  IndexStmt stmt = A.getAssignment().concretize();
+  stmt = scheduleTTVISPCCSR(stmt);
+  printToFile("ttv_ispc_csr", "__ttv_ispc_csr", stmt);
+
+  A.compile(stmt);
+  A.assemble();
+  A.compute();
+  // Reference: same expression compiled with ISPC off and the CPU CSR schedule.
+  set_ISPC_codegen_enabled(false);
+  Tensor<double> expected("expected", {NUM_I, NUM_J}, {Dense, Sparse});
+  expected(i,j) = B(i,j,k) * c(k);
+  IndexStmt taco_stmt = expected.getAssignment().concretize();
+  taco_stmt = scheduleTTVCPUCSR_ST(taco_stmt);
+  expected.compile(taco_stmt);
+  expected.assemble();
+  expected.compute();
+  ASSERT_TENSOR_EQ(expected, A);
+  // Second ISPC kernel, compiled from the unscheduled statement.
+  Tensor<double> A2("A2", {NUM_I, NUM_J}, {Dense, Sparse});
+  set_ISPC_codegen_enabled(true);
+  A2(i,j) = B(i,j,k) * c(k);
+  // NOTE(review): A2 is only timed below; it is never compared against expected.
+  IndexStmt stmt2 = A2.getAssignment().concretize();
+
+  A2.compile(stmt2);
+  A2.assemble();
+  A2.compute();
+  set_ISPC_codegen_enabled(false);  // restore the global flag so later tests do not inherit ISPC codegen
+  taco::util::TimeResults timevalue;
+  bool time                = true;
+
+  for (int i=0; i<3; i++) {
+    TOOL_BENCHMARK_TIMER(expected.compute(), "Compute TACO1: ", timevalue);
+    TOOL_BENCHMARK_TIMER(A.compute(), "Compute ISPC1: ", timevalue);
+    TOOL_BENCHMARK_TIMER(A2.compute(), "Compute ISPC2: ", timevalue);
+  }
+
+
+}
+
 TEST(scheduling_eval, ttmCPU) {
   if (should_use_CUDA_codegen()) {
     return;
@@ -1010,39 +1868,318 @@ TEST(scheduling_eval, ttmCPU) {
 
   //printToFile("ttm_cpu", stmt);
 
-  A.compile(stmt);
-  A.assemble();
-  A.compute();
+  A.compile(stmt);
+  A.assemble();
+  A.compute();
+
+  Tensor<double> expected("expected", {NUM_I, NUM_J, NUM_L}, {Dense, Dense, Dense});
+  expected(i,j,l) = B(i,j,k) * C(k,l);
+  expected.compile();
+  expected.assemble();
+  expected.compute();
+  ASSERT_TENSOR_EQ(expected, A);
+}
+
+TEST(scheduling_eval, ttmISPC) {
+  if (should_use_CUDA_codegen()) {
+    return;
+  }
+  int NUM_I = 1021/40;
+  int NUM_J = 1039/40;
+  int NUM_K = 1057/40;
+  int NUM_L = 1232/40;
+  float SPARSITY = .1;
+  Tensor<double> A("A", {NUM_I, NUM_J, NUM_L}, {Dense, Dense, Dense}); // TODO: change to sparse outputs
+  Tensor<double> B("B", {NUM_I, NUM_J, NUM_K}, {Sparse, Sparse, Sparse});
+  Tensor<double> C("C", {NUM_K, NUM_L}, {Dense, Dense});
+  // Fixed seed: B is filled only where the draw falls below SPARSITY, C is filled everywhere.
+  srand(935);
+  for (int i = 0; i < NUM_I; i++) {
+    for (int j = 0; j < NUM_J; j++) {
+      for (int k = 0; k < NUM_K; k++) {
+        float rand_float = (float) rand() / (float) (RAND_MAX);
+        if (rand_float < SPARSITY) {
+          B.insert({i, j, k}, (double) ((int) (rand_float * 3 / SPARSITY)));
+        }
+      }
+    }
+  }
+
+  for (int k = 0; k < NUM_K; k++) {
+    for (int l = 0; l < NUM_L; l++) {
+      float rand_float = (float)rand()/(float)(RAND_MAX);
+      C.insert({k, l}, (double) ((int) (rand_float*3)));
+    }
+  }
+
+  B.pack();
+  C.pack();
+  // Tensor-times-matrix: the scheduled result A is later compared with a default-compiled `expected`.
+  A(i,j,l) = B(i,j,k) * C(k,l);
+  // NOTE(review): despite the test name, ISPC codegen is never enabled here and the CPU schedule is applied - confirm intent.
+  IndexStmt stmt = A.getAssignment().concretize();
+  stmt = scheduleTTMCPU(stmt, B);
+
+  //printToFile("ttm_cpu", stmt);
+
+  A.compile(stmt);
+  A.assemble();
+  A.compute();
+  // Reference result from the default (unscheduled) compilation of the same expression.
+  Tensor<double> expected("expected", {NUM_I, NUM_J, NUM_L}, {Dense, Dense, Dense});
+  expected(i,j,l) = B(i,j,k) * C(k,l);
+  expected.compile();
+  expected.assemble();
+  expected.compute();
+  ASSERT_TENSOR_EQ(expected, A);
+}
+
+TEST(scheduling_eval, mttkrpCPU) {
+  if (should_use_CUDA_codegen()) {
+    return;  // nothing to do when CUDA codegen is active
+  }
+  const int dimI = 1021/20;
+  const int dimJ = 1039/20;
+  const int dimK = 1057/20;
+  const int dimL = 1232/20;
+  const float density = .1;
+  Tensor<double> A("A", {dimI, dimJ}, {Dense, Dense});
+  Tensor<double> B("B", {dimI, dimK, dimL}, {Dense, Sparse, Sparse});
+  Tensor<double> C("C", {dimK, dimJ}, {Dense, Dense});
+  Tensor<double> D("D", {dimL, dimJ}, {Dense, Dense});
+  // Fixed seed: the sequence of rand() draws below fully determines B, C and D.
+  srand(549694);
+  for (int iPos = 0; iPos < dimI; ++iPos) {
+    for (int kPos = 0; kPos < dimK; ++kPos) {
+      for (int lPos = 0; lPos < dimL; ++lPos) {
+        const float draw = static_cast<float>(rand()) / static_cast<float>(RAND_MAX);
+        if (draw < density) {
+          B.insert({iPos, kPos, lPos}, static_cast<double>(static_cast<int>(draw * 3 / density)));
+        }
+      }
+    }
+  }
+  // C and D are filled in every position.
+  for (int kPos = 0; kPos < dimK; ++kPos) {
+    for (int jPos = 0; jPos < dimJ; ++jPos) {
+      const float draw = static_cast<float>(rand()) / static_cast<float>(RAND_MAX);
+      C.insert({kPos, jPos}, static_cast<double>(static_cast<int>(draw * 3)));
+    }
+  }
+
+  for (int lPos = 0; lPos < dimL; ++lPos) {
+    for (int jPos = 0; jPos < dimJ; ++jPos) {
+      const float draw = static_cast<float>(rand()) / static_cast<float>(RAND_MAX);
+      D.insert({lPos, jPos}, static_cast<double>(static_cast<int>(draw * 3)));
+    }
+  }
+
+  B.pack();
+  C.pack();
+  D.pack();
+  // MTTKRP expression, run through the CPU schedule.
+  A(i,j) = B(i,k,l) * C(k,j) * D(l,j);
+
+  IndexStmt concrete = A.getAssignment().concretize();
+  IndexStmt scheduled = scheduleMTTKRPCPU(concrete, B);
+  //printToFile("mttkrp_cpu", scheduled);
+
+  A.compile(scheduled);
+  A.assemble();
+  A.compute();
+  // Reference result from the default (unscheduled) compilation of the same expression.
+  Tensor<double> expected("expected", {dimI, dimJ}, {Dense, Dense});
+  expected(i,j) = B(i,k,l) * C(k,j) * D(l,j);
+  expected.compile();
+  expected.assemble();
+  expected.compute();
+  ASSERT_TENSOR_EQ(expected, A);
+}
+
+TEST(scheduling_eval, temp) {
+  if (should_use_CUDA_codegen() || should_use_ISPC_codegen()) {
+    return;
+  }
+  // SDDMM example on the webbase-1M matrix (Matrix Market format), downloadable from:
+  // https://www.cise.ufl.edu/research/sparse/MM/Williams/webbase-1M.tar.gz
+  // The path below is machine-specific, so return early when the file cannot be
+  // opened instead of handing a missing input to read().
+  const std::string matrixPath =
+      "/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx";
+  if (!std::ifstream(matrixPath).good()) {
+    return;
+  }
+  std::default_random_engine gen(0);
+  std::uniform_real_distribution<double> unif(0.0, 1.0);
+  // Storage formats used below: doubly compressed sparse row (dcsr), row-major
+  // dense (rm), and column-major dense (cm).
+  Format dcsr({Sparse,Sparse});
+  Format   rm({Dense,Dense});
+  Format   cm({Dense,Dense}, {1,0});
+
+  // Load the sparse input matrix and store it as a doubly compressed sparse row matrix.
+  Tensor<double> B = read(matrixPath, dcsr);
+  // Generate a random dense matrix and store it in row-major (dense) format.
+  Tensor<double> C({B.getDimension(0), 1000}, rm);
+  for (int i = 0; i < C.getDimension(0); ++i) {
+    for (int j = 0; j < C.getDimension(1); ++j) {
+      C.insert({i,j}, unif(gen));
+    }
+  }
+  C.pack();
+
+  // Generate another random dense matrix and store it in column-major format.
+  Tensor<double> D({1000, B.getDimension(1)}, cm);
+  for (int i = 0; i < D.getDimension(0); ++i) {
+    for (int j = 0; j < D.getDimension(1); ++j) {
+      D.insert({i,j}, unif(gen));
+    }
+  }
+  D.pack();
+
+  // Declare the output matrix to be a sparse matrix with the same dimensions as
+  // input matrix B, to be also stored as a doubly compressed sparse row matrix.
+  Tensor<double> A(B.getDimensions(), dcsr);
+
+  // Define the SDDMM computation using index notation.
+  IndexVar i, j, k;
+  A(i,j) = B(i,j) * C(i,k) * D(k,j);
+
+  // Nothing has been computed yet; first have taco generate code for the SDDMM.
+  A.compile();
+  // We can now call the functions taco generated to assemble the indices of the
+  // output matrix and then actually compute the SDDMM.
+  A.assemble();
+  A.compute();
+  // Write the output of the computation to file (stored in the Matrix Market format).
+  write("A.mtx", A);
+}
+
+TEST(scheduling_eval, mttkrpISPC) {  // MTTKRP: two ISPC kernels checked and timed against CPU references
+  if (should_use_CUDA_codegen()) {
+    return;
+  }
+  set_ISPC_codegen_enabled(false);
+  set_CUDA_codegen_enabled(false);
+  int NUM_I = 10000; // 1021/20;
+  int NUM_J = 256;
+  int NUM_K = 1057/20;
+  int NUM_L = 1232/20;
+  float SPARSITY = .1;
+  Tensor<double> B("B", {NUM_I, NUM_K, NUM_L}, {Dense, Sparse, Sparse});
+  Tensor<double> C("C", {NUM_K, NUM_J}, {Dense, Dense});
+  Tensor<double> D("D", {NUM_L, NUM_J}, {Dense, Dense});
+  // Fixed seed: B is filled only where the draw falls below SPARSITY; C and D are filled everywhere.
+  srand(549694);
+  for (int i = 0; i < NUM_I; i++) {
+    for (int k = 0; k < NUM_K; k++) {
+      for (int l = 0; l < NUM_L; l++) {
+        float rand_float = (float) rand() / (float) (RAND_MAX);
+        if (rand_float < SPARSITY) {
+          B.insert({i, k, l}, (double) ((int) (rand_float * 3 / SPARSITY)));
+        }
+      }
+    }
+  }
+
+  for (int k = 0; k < NUM_K; k++) {
+    for (int j = 0; j < NUM_J; j++) {
+      float rand_float = (float)rand()/(float)(RAND_MAX);
+      C.insert({k, j}, (double) ((int) (rand_float*3)));
+    }
+  }
+
+  for (int l = 0; l < NUM_L; l++) {
+    for (int j = 0; j < NUM_J; j++) {
+      float rand_float = (float)rand()/(float)(RAND_MAX);
+      D.insert({l, j}, (double) ((int) (rand_float*3)));
+    }
+  }
+
+  B.pack();
+  C.pack();
+  D.pack();
 
-  Tensor<double> expected("expected", {NUM_I, NUM_J, NUM_L}, {Dense, Dense, Dense});
-  expected(i,j,l) = B(i,j,k) * C(k,l);
-  expected.compile();
-  expected.assemble();
-  expected.compute();
-  ASSERT_TENSOR_EQ(expected, A);
+  set_ISPC_codegen_enabled(true);
+  // Variant 1: ISPC codegen with the ISPC MTTKRP schedule.
+  Tensor<double> A1("A1", {NUM_I, NUM_J}, {Dense, Dense});
+  A1(i,j) = B(i,k,l) * C(k,j) * D(l,j);
+  IndexStmt stmt1 = A1.getAssignment().concretize();
+  stmt1 = scheduleMTTKRPISPC(stmt1, B);
+  // printToFile("mttkrp1_cpu_ispc", stmt1);
+  A1.compile(stmt1);
+  A1.assemble();
+  A1.compute();
+  // Reference 1: ISPC off, CPU MTTKRP schedule.
+  set_ISPC_codegen_enabled(false);
+  Tensor<double> expected1("expected1", {NUM_I, NUM_J}, {Dense, Dense});
+  expected1(i,j) = B(i,k,l) * C(k,j) * D(l,j);
+  IndexStmt taco_stmt1 = expected1.getAssignment().concretize();
+  taco_stmt1 = scheduleMTTKRPCPU(taco_stmt1, B);
+  expected1.compile(taco_stmt1);
+  expected1.assemble();
+  expected1.compute();
+  ASSERT_TENSOR_EQ(expected1, A1);
+  // Variant 2: ISPC codegen with the precomputed schedule, built from A2's own assignment (not A1's).
+  set_ISPC_codegen_enabled(true);
+  Tensor<double> A2("A2", {NUM_I, NUM_J}, {Dense, Dense});
+  A2(i,j) = B(i,k,l) * C(k,j) * D(l,j);
+  IndexStmt stmt2 = A2.getAssignment().concretize();
+  stmt2 = scheduleMTTKRPPrecomputedISPC_ST(stmt2, B);
+  // printToFile("mttkrp_cpu_ispc", stmt);
+  A2.compile(stmt2);
+  A2.assemble();
+  A2.compute();
+  ASSERT_TENSOR_EQ(expected1, A2);
+  // Reference 2: ISPC off, precomputed CPU schedule; must agree with reference 1.
+  set_ISPC_codegen_enabled(false);
+  Tensor<double> expected2("expected2", {NUM_I, NUM_J}, {Dense, Dense});
+  expected2(i,j) = B(i,k,l) * C(k,j) * D(l,j);
+  IndexStmt taco_stmt2 = expected2.getAssignment().concretize();
+  taco_stmt2 = scheduleMTTKRPPrecomputedCPU_ST(taco_stmt2, B);
+  expected2.compile(taco_stmt2);
+  expected2.assemble();
+  expected2.compute();
+  ASSERT_TENSOR_EQ(expected1, expected2);
+
+  taco::util::TimeResults timevalue;
+  bool time                = true;
+  // Timing rounds over all four compiled kernels; results are not re-checked.
+  for (int i=0; i<3; i++) {
+    TOOL_BENCHMARK_TIMER(expected1.compute(), "Compute TACO1: ", timevalue);
+    TOOL_BENCHMARK_TIMER(A1.compute(), "Compute ISPC1: ", timevalue);
+    TOOL_BENCHMARK_TIMER(expected2.compute(), "Compute TACO2: ", timevalue);
+    TOOL_BENCHMARK_TIMER(A2.compute(), "Compute ISPC2: ", timevalue);
+  }
 }
 
-TEST(scheduling_eval, mttkrpCPU) {
+
+TEST(scheduling_eval, mttkrp4ISPC) {
   if (should_use_CUDA_codegen()) {
     return;
   }
-  int NUM_I = 1021/20;
-  int NUM_J = 1039/20;
+  set_ISPC_codegen_enabled(false);
+  set_CUDA_codegen_enabled(false);
+  int NUM_I = 1000; // 1021/20;
+  int NUM_J = 16;
   int NUM_K = 1057/20;
   int NUM_L = 1232/20;
+  int NUM_M = 1124/20;
   float SPARSITY = .1;
-  Tensor<double> A("A", {NUM_I, NUM_J}, {Dense, Dense});
-  Tensor<double> B("B", {NUM_I, NUM_K, NUM_L}, {Dense, Sparse, Sparse});
+  Tensor<double> B("B", {NUM_I, NUM_K, NUM_L, NUM_M}, {Dense, Sparse, Sparse, Sparse});
   Tensor<double> C("C", {NUM_K, NUM_J}, {Dense, Dense});
   Tensor<double> D("D", {NUM_L, NUM_J}, {Dense, Dense});
+  Tensor<double> E("E", {NUM_M, NUM_J}, {Dense, Dense});
 
   srand(549694);
   for (int i = 0; i < NUM_I; i++) {
     for (int k = 0; k < NUM_K; k++) {
       for (int l = 0; l < NUM_L; l++) {
-        float rand_float = (float) rand() / (float) (RAND_MAX);
-        if (rand_float < SPARSITY) {
-          B.insert({i, k, l}, (double) ((int) (rand_float * 3 / SPARSITY)));
+        for (int m = 0; m < NUM_M; m++) {
+          float rand_float = (float) rand() / (float) (RAND_MAX);
+          if (rand_float < SPARSITY) {
+            B.insert({i, k, l, m}, (double) ((int) (rand_float * 3 / SPARSITY)));
+          }
         }
       }
     }
@@ -1062,27 +2199,83 @@ TEST(scheduling_eval, mttkrpCPU) {
     }
   }
 
+  for (int m = 0; m < NUM_M; m++) {
+    for (int j = 0; j < NUM_J; j++) {
+      float rand_float = (float)rand()/(float)(RAND_MAX);
+      E.insert({m, j}, (double) ((int) (rand_float*3)));
+    }
+  }
+
   B.pack();
   C.pack();
   D.pack();
+  E.pack();
+
+  set_ISPC_codegen_enabled(true);
+  Tensor<double> A1("A1", {NUM_I, NUM_J}, {Dense, Dense});
+  A1(i,j) = B(i,k,l,m) * C(k,j) * D(l,j) * E(m,j);
+  IndexStmt stmt1 = A1.getAssignment().concretize();
+  stmt1 = scheduleMTTKRP4ISPC_ST(stmt1, B);
+  // printToFile("mttkrp1_cpu_ispc", stmt1);
+  A1.compile(stmt1);
+  A1.assemble();
+  A1.compute();
+
+  set_ISPC_codegen_enabled(false);
+  Tensor<double> expected1("expected1", {NUM_I, NUM_J}, {Dense, Dense});
+  expected1(i,j) = B(i,k,l,m) * C(k,j) * D(l,j) * E(m,j);
+  IndexStmt taco_stmt1 = expected1.getAssignment().concretize();
+  taco_stmt1 = scheduleMTTKRP4CPU_ST(taco_stmt1, B);
+  expected1.compile(taco_stmt1);
+  expected1.assemble();
+  expected1.compute();
+  ASSERT_TENSOR_EQ(expected1, A1);
+
+  // set_ISPC_codegen_enabled(true);
+  // Tensor<double> A2("A2", {NUM_I, NUM_J}, {Dense, Dense});
+  // A2(i,j) = B(i,k,l) * C(k,j) * D(l,j);
+  // IndexStmt stmt2 = A1.getAssignment().concretize();
+  // stmt2 = scheduleMTTKRPPrecomputedISPC_ST(stmt2, B);
+  // // printToFile("mttkrp_cpu_ispc", stmt);
+  // A2.compile(stmt2);
+  // A2.assemble();
+  // A2.compute();
+  // ASSERT_TENSOR_EQ(expected1, A2);
+  
+  set_ISPC_codegen_enabled(false);
+  Tensor<double> expected2("expected2", {NUM_I, NUM_J}, {Dense, Dense});
+  expected2(i,j) = B(i,k,l,m) * C(k,j) * D(l,j) * E(m,j);
+
+  IndexExpr BE = B(i,k,l,m) * E(m,j);
+  IndexExpr BDE = BE * D(l, j);
+  expected2(i,j) = BDE * C(k,j);
+  IndexStmt taco_stmt2 = expected2.getAssignment().concretize();
+  TensorVar BE_workspace("BE_workspace", Type(Float64, {Dimension(j)}), taco::dense);
+  TensorVar BDE_workspace("BDE_workspace", Type(Float64, {Dimension(j)}), taco::dense);
+
+  IndexStmt precomputed_stmt = forall(i, forall(k,
+          where(forall(j, expected2(i,j) += BDE_workspace(j) * C(k,j)),
+            forall(l, where(forall(j, BDE_workspace(j) += BE_workspace(j) * D(l,j)),
+                forall(m, forall(j, BE_workspace(j) += B(i,k,l,m) * E(m,j))))))));
+
+  // IndexStmt scheduled2 = scheduleMTTKRPPrecomputedCPU(precomputed_stmt, B, 64);
+  // expected2.compile(scheduled2);
+  // expected2.assemble();
+  // expected2.compute();
+  // ASSERT_TENSOR_EQ(expected1, expected2);
+
+  taco::util::TimeResults timevalue;
+  bool time                = true;
+
+  for (int i=0; i<3; i++) {
+    TOOL_BENCHMARK_TIMER(expected1.compute(), "Compute TACO1: ", timevalue);
+    TOOL_BENCHMARK_TIMER(A1.compute(), "Compute ISPC1: ", timevalue);
+    // TOOL_BENCHMARK_TIMER(expected2.compute(), "Compute TACO2: ", timevalue);
+    // TOOL_BENCHMARK_TIMER(A2.compute(), "Compute ISPC2: ", timevalue);
+  }
+}
 
-  A(i,j) = B(i,k,l) * C(k,j) * D(l,j);
-
-  IndexStmt stmt = A.getAssignment().concretize();
-  stmt = scheduleMTTKRPCPU(stmt, B);
-  //printToFile("mttkrp_cpu", stmt);
-
-  A.compile(stmt);
-  A.assemble();
-  A.compute();
 
-  Tensor<double> expected("expected", {NUM_I, NUM_J}, {Dense, Dense});
-  expected(i,j) = B(i,k,l) * C(k,j) * D(l,j);
-  expected.compile();
-  expected.assemble();
-  expected.compute();
-  ASSERT_TENSOR_EQ(expected, A);
-}
 
 TEST(scheduling_eval, spmvGPU) {
   if (!should_use_CUDA_codegen()) {
@@ -1463,7 +2656,336 @@ TEST(scheduling_eval, mttkrpGPU) {
   ASSERT_TENSOR_EQ(expected, A);
 }
 
-TEST(generate_evaluation_files, DISABLED_cpu) {
+TEST(generate_evaluation_files, ispc) {
+  // Generates the ISPC evaluation kernels. For every kernel the first codegen
+  // stream is written to "<name>.h" and the second to "__<name>.ispc", both
+  // under eval_prepared_ispc/.
+  set_CUDA_codegen_enabled(false);
+  set_ISPC_codegen_enabled(true);
+
+  // Each inner vector is one parameter set passed to the matching scheduler.
+  vector<vector<int>> spmv_parameters = {{32}};
+
+  // 4 to 512 and 4, 8, 16
+  vector<vector<int>> spmm_parameters = {{16,4}};
+
+  vector<vector<int>> mttkrp_parameters = {{64, 0}};
+
+  vector<vector<int>> ttv_parameters = {{32}};
+
+  int NUM_I = 100;
+  int NUM_J = 100;
+  int NUM_K = 100;
+  int NUM_L = 100;
+
+  // c_file_ending names the first stream's file, file_ending the second's.
+  string c_file_ending = ".h";
+  string file_ending = ".ispc";
+  string file_path = "eval_prepared_ispc/";
+  // The result is not checked, so an already-existing directory is not an error.
+  mkdir(file_path.c_str(), 0777);
+
+  // spmv
+  {
+    stringstream source1;
+    stringstream source2;
+    std::shared_ptr<ir::CodeGen> codegen = ir::CodeGen::init_default(source1, source2, ir::CodeGen::ImplementationGen);
+    Tensor<double> A("A", {NUM_I, NUM_J}, CSR);
+    Tensor<double> x("x", {NUM_J}, {Dense});
+    Tensor<double> y("y", {NUM_I}, {Dense});
+    y(i) = A(i, j) * x(j);
+    std::cout << "concretizing the assignment statement\n";
+    IndexStmt stmt = y.getAssignment().concretize();
+    std::cout << "Printing the original IndexStmt: " << stmt << std::endl;
+
+    for (const auto& paramSet : spmv_parameters) {
+      std::cout << "param set: " << paramSet[0] << std::endl;
+      IndexStmt scheduled = scheduleSpMVISPC(stmt, paramSet[0]);
+      std::cout << "scheduled IndexStmt: " << scheduled << std::endl;
+      ir::Stmt compute = lower(scheduled, string("compute_") + util::join(paramSet, "_"),  false, true);
+      std::cout << "computed statement: \n" << compute << std::endl;
+      // NOTE(review): unlike the kernels below, isFirst is always false here - confirm intended.
+      codegen->compile(compute, false);
+    }
+    ofstream source_file;
+    source_file.open(file_path + "spmv_csr_ispc_taco" + c_file_ending);
+    source_file << source1.str();
+    source_file.close();
+
+    ofstream ispc_source_file;
+    ispc_source_file.open(file_path + "__spmv_csr_ispc_taco" + file_ending);
+    ispc_source_file << source2.str();
+    ispc_source_file.close();
+
+  }
+
+  // spmm
+  {
+    stringstream source1;
+    stringstream source2;
+    std::shared_ptr<ir::CodeGen> codegen = ir::CodeGen::init_default(source1, source2, ir::CodeGen::ImplementationGen);
+    Tensor<double> A("A", {NUM_I, NUM_J}, CSR);
+    Tensor<double> X("X", {NUM_J, NUM_K}, {Dense, Dense});
+    Tensor<double> Y("Y", {NUM_I, NUM_K}, {Dense, Dense});
+    Y(i, k) = A(i, j) * X(j, k);
+    IndexStmt stmt = Y.getAssignment().concretize();
+    bool isFirst = true;
+    for (const auto& paramSet : spmm_parameters) {
+      IndexStmt scheduled = scheduleSpMMISPC1(stmt, A, paramSet[0], paramSet[1]);
+      ir::Stmt compute = lower(scheduled, string("compute1_") + util::join(paramSet, "_"),  false, true);
+      codegen->compile(compute, isFirst);
+      isFirst = false;
+    }
+    ofstream source_file;
+    source_file.open(file_path + "spmm_csr_ispc_taco1" + c_file_ending);
+    source_file << source1.str();
+    source_file.close();
+
+    ofstream ispc_source_file;
+    ispc_source_file.open(file_path + "__spmm_csr_ispc_taco1" + file_ending);
+    ispc_source_file << source2.str();
+    ispc_source_file.close();
+  }
+
+  // spmm omp
+  {
+    stringstream source1;
+    stringstream source2;
+    std::shared_ptr<ir::CodeGen> codegen = ir::CodeGen::init_default(source1, source2, ir::CodeGen::ImplementationGen);
+    Tensor<double> A("A", {NUM_I, NUM_J}, CSR);
+    Tensor<double> X("X", {NUM_J, NUM_K}, {Dense, Dense});
+    Tensor<double> Y("Y", {NUM_I, NUM_K}, {Dense, Dense});
+    Y(i, k) = A(i, j) * X(j, k);
+    IndexStmt stmt = Y.getAssignment().concretize();
+    bool isFirst = true;
+    for (const auto& paramSet : spmm_parameters) {
+      IndexStmt scheduled = scheduleSpMMISPCOMP1(stmt, A, paramSet[0], paramSet[1]);
+      ir::Stmt compute = lower(scheduled, string("compute1_") + util::join(paramSet, "_"),  false, true);
+      codegen->compile(compute, isFirst);
+      isFirst = false;
+    }
+    ofstream source_file;
+    source_file.open(file_path + "spmm_omp_ispc_taco1" + c_file_ending);
+    source_file << source1.str();
+    source_file.close();
+
+    ofstream ispc_source_file;
+    ispc_source_file.open(file_path + "__spmm_omp_ispc_taco1" + file_ending);
+    ispc_source_file << source2.str();
+    ispc_source_file.close();
+  }
+
+  // spmm2
+  {
+    stringstream source1;
+    stringstream source2;
+    std::shared_ptr<ir::CodeGen> codegen = ir::CodeGen::init_default(source1, source2, ir::CodeGen::ImplementationGen);
+    Tensor<double> A("A", {NUM_I, NUM_J}, CSR);
+    Tensor<double> X("X", {NUM_J, NUM_K}, {Dense, Dense});
+    Tensor<double> Y("Y", {NUM_I, NUM_K}, {Dense, Dense});
+    Y(i, k) = A(i, j) * X(j, k);
+    IndexStmt stmt = Y.getAssignment().concretize();
+    bool isFirst = true;
+    for (const auto& paramSet : spmm_parameters) {
+      IndexStmt scheduled = scheduleSpMMISPC2(stmt, A, paramSet[0], paramSet[1]);
+      ir::Stmt compute = lower(scheduled, string("compute2_") + util::join(paramSet, "_"),  false, true);
+      codegen->compile(compute, isFirst);
+      isFirst = false;
+    }
+    ofstream source_file;
+    source_file.open(file_path + "spmm_csr_ispc_taco2" + c_file_ending);
+    source_file << source1.str();
+    source_file.close();
+
+    ofstream ispc_source_file;
+    ispc_source_file.open(file_path + "__spmm_csr_ispc_taco2" + file_ending);
+    ispc_source_file << source2.str();
+    ispc_source_file.close();
+  }
+
+  // spmm3
+  {
+    stringstream source1;
+    stringstream source2;
+    std::shared_ptr<ir::CodeGen> codegen = ir::CodeGen::init_default(source1, source2, ir::CodeGen::ImplementationGen);
+    Tensor<double> A("A", {NUM_I, NUM_J}, CSR);
+    Tensor<double> X("X", {NUM_J, NUM_K}, {Dense, Dense});
+    Tensor<double> Y("Y", {NUM_I, NUM_K}, {Dense, Dense});
+    Y(i, k) = A(i, j) * X(j, k);
+    IndexStmt stmt = Y.getAssignment().concretize();
+    bool isFirst = true;
+    for (const auto& paramSet : spmm_parameters) {
+      IndexStmt scheduled = scheduleSpMMISPC3(stmt, A, paramSet[0], paramSet[1]);
+      ir::Stmt compute = lower(scheduled, string("compute3_") + util::join(paramSet, "_"),  false, true);
+      codegen->compile(compute, isFirst);
+      isFirst = false;
+    }
+    ofstream source_file;
+    source_file.open(file_path + "spmm_csr_ispc_taco3" + c_file_ending);
+    source_file << source1.str();
+    source_file.close();
+
+    ofstream ispc_source_file;
+    ispc_source_file.open(file_path + "__spmm_csr_ispc_taco3" + file_ending);
+    ispc_source_file << source2.str();
+    ispc_source_file.close();
+  }
+
+  // ttv
+  {
+    stringstream source;
+    stringstream source2;
+    std::shared_ptr<ir::CodeGen> codegen = ir::CodeGen::init_default(source, source2, ir::CodeGen::ImplementationGen);
+    Tensor<double> A("A", {NUM_I, NUM_J}, {Dense, Dense}); // TODO: change to sparse outputs
+    Tensor<double> B("B", {NUM_I, NUM_J, NUM_K}, {Sparse, Sparse, Sparse});
+    Tensor<double> c("c", {NUM_K}, Format({Dense}));
+    A(i,j) = B(i,j,k) * c(k);
+    IndexStmt stmt = A.getAssignment().concretize();
+    bool isFirst = true;
+    // NOTE(review): scheduled with scheduleTTVCPU rather than an ISPC-specific scheduler - confirm intended.
+    for (const auto& paramSet : ttv_parameters) {
+      IndexStmt scheduled = scheduleTTVCPU(stmt, B, paramSet[0]);
+      ir::Stmt compute = lower(scheduled, string("compute_") + util::join(paramSet, "_"),  false, true);
+      codegen->compile(compute, isFirst);
+      isFirst = false;
+    }
+    ofstream source_file;
+    source_file.open(file_path + "ttv_cpu" + c_file_ending);
+    source_file << source.str();
+    source_file.close();
+
+    ofstream ispc_source_file;
+    ispc_source_file.open(file_path + "__ttv_cpu" + file_ending);
+    ispc_source_file << source2.str();
+    ispc_source_file.close();
+  }
+
+  // mttkrp3
+  {
+    stringstream source;
+    stringstream source2;
+    std::shared_ptr<ir::CodeGen> codegen = ir::CodeGen::init_default(source, source2, ir::CodeGen::ImplementationGen);
+    Tensor<double> A("A", {NUM_I, NUM_J}, {Dense, Dense});
+    Tensor<double> B("B", {NUM_I, NUM_K, NUM_L}, {Dense, Sparse, Sparse});
+    Tensor<double> C("C", {NUM_K, NUM_J}, {Dense, Dense});
+    Tensor<double> D("D", {NUM_L, NUM_J}, {Dense, Dense});
+    A(i,j) = B(i,k,l) * C(k,j) * D(l,j);
+    IndexStmt stmt = A.getAssignment().concretize();
+    bool isFirst = true;
+    // NOTE(review): scheduled with scheduleMTTKRPCPU rather than an ISPC-specific scheduler - confirm intended.
+    for (const auto& paramSet : mttkrp_parameters) {
+      IndexStmt scheduled = scheduleMTTKRPCPU(stmt, B, paramSet[0], paramSet[1]);
+      ir::Stmt compute = lower(scheduled, string("compute_") + util::join(paramSet, "_"),  false, true);
+      codegen->compile(compute, isFirst);
+      isFirst = false;
+    }
+    ofstream source_file;
+    source_file.open(file_path + "mttkrp3_cpu" + c_file_ending);
+    source_file << source.str();
+    source_file.close();
+
+    ofstream ispc_source_file;
+    ispc_source_file.open(file_path + "__mttkrp3_cpu" + file_ending);
+    ispc_source_file << source2.str();
+    ispc_source_file.close();
+  }
+}
+
+
+
+TEST(generate_ispc_sddmm_evaluation_files, ispc) {
+  // Generates two SDDMM kernel variants with ISPC code generation enabled.
+  // For each variant the first codegen stream is written to "<name>.h" and
+  // the second to "__<name>.ispc" under eval_prepared_ispc/sddmm/.
+  set_CUDA_codegen_enabled(false);
+  set_ISPC_codegen_enabled(true);
+
+  // Each inner vector is one parameter set: its two values are forwarded to
+  // the scheduler and joined with '_' to form the kernel name suffix.
+  vector<vector<int>> sddmm_parameters = {{8, 8}};
+
+  int NUM_I = 100;
+  int NUM_J = 100;
+  int NUM_K = 100;
+
+  string c_file_ending = ".h";
+  string file_ending = ".ispc";
+  string parent_path = "eval_prepared_ispc/";
+  string file_path = parent_path + "sddmm/";
+  // mkdir() is not recursive, so the parent has to be created first; otherwise
+  // running this test on its own leaves no directory to write into. Results
+  // are not checked, so already-existing directories are not an error.
+  mkdir(parent_path.c_str(), 0777);
+  mkdir(file_path.c_str(), 0777);
+
+  // sddmm
+  {
+    stringstream source1;
+    stringstream source2;
+    std::shared_ptr<ir::CodeGen> codegen = ir::CodeGen::init_default(source1, source2, ir::CodeGen::ImplementationGen);
+    Tensor<double> A("A", {NUM_I, NUM_K}, {Dense, Dense});
+    Tensor<double> B("B", {NUM_I, NUM_K}, CSR);
+    Tensor<double> C("C", {NUM_I, NUM_J}, {Dense, Dense});
+    Tensor<double> D("D", {NUM_J, NUM_K}, {Dense, Dense});
+    A(i,k) = B(i,k) * C(i,j) * D(j,k);
+    IndexStmt stmt = A.getAssignment().concretize();
+    // Every parameter set is lowered into the same codegen; only the first
+    // compile() call is made with isFirst == true.
+    bool isFirst = true;
+    for (const auto& paramSet : sddmm_parameters) {
+      IndexStmt scheduled = scheduleSDDMMISPC1(stmt, B, paramSet[0], paramSet[1]);
+      ir::Stmt compute = lower(scheduled, string("compute1_") + util::join(paramSet, "_"),  false, true);
+      codegen->compile(compute, isFirst);
+      isFirst = false;
+    }
+    // First stream gets the C ending, second stream the ISPC ending.
+    ofstream source_file;
+    source_file.open(file_path + "sddmm_cpu_ispc_taco1" + c_file_ending);
+    source_file << source1.str();
+    source_file.close();
+
+    ofstream ispc_source_file;
+    ispc_source_file.open(file_path + "__sddmm_cpu_ispc_taco1" + file_ending);
+    ispc_source_file << source2.str();
+    ispc_source_file.close();
+  }
+
+  // sddmm (second variant)
+  {
+    stringstream source1;
+    stringstream source2;
+    std::shared_ptr<ir::CodeGen> codegen = ir::CodeGen::init_default(source1, source2, ir::CodeGen::ImplementationGen);
+    Tensor<double> Y("Y", {NUM_I, NUM_K}, {Dense, Dense});
+    Tensor<double> A("A", {NUM_I, NUM_K}, CSR);
+    Tensor<double> X("X", {NUM_I, NUM_J}, {Dense, Dense});
+    Y(i,j) = A(i,j) * X(i,k) * X(j,k);
+    IndexStmt stmt = Y.getAssignment().concretize();
+    // Every parameter set is lowered into the same codegen; only the first
+    // compile() call is made with isFirst == true.
+    bool isFirst = true;
+    for (const auto& paramSet : sddmm_parameters) {
+      IndexStmt scheduled = scheduleSDDMMISPC2(stmt, A, paramSet[0], paramSet[1]);
+      ir::Stmt compute = lower(scheduled, string("compute2_") + util::join(paramSet, "_"),  false, true);
+      codegen->compile(compute, isFirst);
+      isFirst = false;
+    }
+    // First stream gets the C ending, second stream the ISPC ending.
+    ofstream source_file;
+    source_file.open(file_path + "sddmm_cpu_ispc_taco2" + c_file_ending);
+    source_file << source1.str();
+    source_file.close();
+
+    ofstream ispc_source_file;
+    ispc_source_file.open(file_path + "__sddmm_cpu_ispc_taco2" + file_ending);
+    ispc_source_file << source2.str();
+    ispc_source_file.close();
+  }
+}
+
+
+
+
+TEST(generate_evaluation_files, cpu) {
   if (should_use_CUDA_codegen()) {
     return;
   }
@@ -1779,10 +3301,63 @@ TEST(generate_evaluation_files, DISABLED_cpu) {
   }
 }
 
-TEST(generate_evaluation_files, DISABLED_gpu) {
-  if (!should_use_CUDA_codegen()) {
-    return;
+TEST(generate_evaluation_files, spmv_ispc) {
+  // Writes SpMV kernels (first codegen stream -> .c, second -> .ispc) to eval_prepared_ispc/spmv/.
+  set_CUDA_codegen_enabled(false);
+  set_ISPC_codegen_enabled(true);
+  std::cout << "executing generate_evaluation_file.ispc\n";
+
+  int NUM_I = 100;
+  int NUM_J = 100;
+
+  vector<vector<int>> spmv_parameters = {}; // {NNZ_PER_THREAD, BLOCK_SIZE}
+  for (int i = 3; i <= 20; i++) {
+    spmv_parameters.push_back({i, 512});
+  }
+
+  string file_ending_c = ".c";
+  string file_ending_ispc = ".ispc";
+  string file_path = "eval_prepared_ispc/spmv/";
+  // mkdir() is not recursive: create the parent first so the test also works when run alone.
+  mkdir("eval_prepared_ispc/", 0777);
+  mkdir(file_path.c_str(), 0777);
+
+  // spmv
+  {
+    stringstream source1;
+    stringstream source2;
+    std::shared_ptr<ir::CodeGen> codegen = ir::CodeGen::init_default(source1, source2, ir::CodeGen::ImplementationGen);
+    Tensor<double> A("A", {NUM_I, NUM_J}, CSR);
+    Tensor<double> x("x", {NUM_J}, Format({Dense}));
+    Tensor<double> y("y", {NUM_I}, Format({Dense}));
+    y(i) = A(i, j) * x(j);
+    IndexStmt stmt = y.getAssignment().concretize();
+    bool isFirst = true;
+    // NOTE(review): paramSet only affects the kernel name; scheduleSpMVCPU gets no parameters.
+    for (const auto& paramSet : spmv_parameters) {
+      IndexStmt scheduled = scheduleSpMVCPU(stmt);
+      ir::Stmt compute = lower(scheduled, string("compute_") + util::join(paramSet, "_"),  false, true);
+      codegen->compile(compute, isFirst);
+      isFirst = false;
+    }
+    ofstream source_file1(file_path + "spmv_ispc" + file_ending_c);
+    source_file1 << source1.str();
+    source_file1.close();
+
+    ofstream source_file2(file_path + "__spmv_ispc" + file_ending_ispc);
+    source_file2 << source2.str();
+    source_file2.close();
   }
+}
+
+TEST(generate_evaluation_files, gpu) {
+  // if (!should_use_CUDA_codegen()) {
+  //   return;
+  // }
+  set_CUDA_codegen_enabled(true);
+  set_ISPC_codegen_enabled(false);
+
+  std::cout << "executing generate_evaluation_file.gpu\n";
 
   vector<vector<int>> spmv_parameters = {}; // {NNZ_PER_THREAD, BLOCK_SIZE}
   for (int i = 3; i <= 20; i++) {
diff --git a/test/tests-transformation.cpp b/test/tests-transformation.cpp
index abfec3d45..9a472906f 100644
--- a/test/tests-transformation.cpp
+++ b/test/tests-transformation.cpp
@@ -255,6 +255,8 @@ INSTANTIATE_TEST_CASE_P(parallelize, apply,
 
 struct reorderLoopsTopologically : public TestWithParam<NotationTest> {};
 
+
+//
 TEST_P(reorderLoopsTopologically, test) {
   IndexStmt actual = taco::reorderLoopsTopologically(GetParam().actual);
   ASSERT_NOTATION_EQ(GetParam().expected, actual);
diff --git a/tools/taco.cpp b/tools/taco.cpp
index cd351a203..7384874ec 100644
--- a/tools/taco.cpp
+++ b/tools/taco.cpp
@@ -9,6 +9,7 @@
 #include "taco.h"
 
 #include "taco/error.h"
+#include "taco/index_notation/index_notation.h"
 #include "taco/parser/lexer.h"
 #include "taco/parser/parser.h"
 #include "taco/parser/schedule_parser.h"
@@ -20,6 +21,7 @@
 #include "taco/lower/lower.h"
 #include "taco/codegen/module.h"
 #include "codegen/codegen_c.h"
+#include "codegen/codegen_ispc.h"
 #include "codegen/codegen_cuda.h"
 #include "codegen/codegen.h"
 #include "taco/util/strings.h"
@@ -188,6 +190,8 @@ static void printUsageInfo() {
   cout << endl;
   printFlag("print-nocolor", "Print without colors.");
   cout << endl;
+  printFlag("ispc", "Generate ISPC code for Intel CPUs");
+  cout << endl;
   printFlag("cuda", "Generate CUDA code for NVIDIA GPUs");
   cout << endl;
   printFlag("schedule", "Specify parallel execution schedule");
@@ -262,7 +266,7 @@ static void printSchedulingHelp() {
               "an output race strategy `strat`. Since the other transformations "
               "expect serial code, parallelize must come last in a series of "
               "transformations.  Possible parallel hardware units are: "
-              "NotParallel, GPUBlock, GPUWarp, GPUThread, CPUThread, CPUVector. "
+              "NotParallel, GPUBlock, GPUWarp, GPUThread, CPUThread, CPUVector, CPUSimd, CPUSpmd. "
               "Possible output race strategies are: "
               "IgnoreRaces, NoRaces, Atomics, Temporary, ParallelReduction.");
 }
@@ -279,6 +283,8 @@ static void printVersionInfo() {
     cout << "Built with Python support." << endl;
   if(TACO_FEATURE_CUDA)
     cout << "Built with CUDA support." << endl;
+  if(TACO_FEATURE_ISPC)
+    cout << "Built with ISPC support." << endl;
   cout << endl;
   cout << "Built on: " << TACO_BUILD_DATE << endl;
   cout << "CMake build type: " << TACO_BUILD_TYPE << endl;
@@ -308,7 +314,10 @@ static void printCommandLine(ostream& os, int argc, char* argv[]) {
   }
 }
 
-static bool setSchedulingCommands(vector<vector<string>> scheduleCommands, parser::Parser& parser, IndexStmt& stmt) {
+static int setSchedulingCommands(vector<vector<string>> scheduleCommands, 
+  parser::Parser& parser, IndexStmt& stmt, Assignment assignment) {
+
+  std::cout << "setting scheduling commands\n";
   auto findVar = [&stmt](string name) {
     ProvenanceGraph graph(stmt);
     for (auto v : graph.getAllIndexVars()) {
@@ -321,9 +330,15 @@ static bool setSchedulingCommands(vector<vector<string>> scheduleCommands, parse
     abort(); // to silence a warning: control reaches end of non-void function
   };
 
-  bool isGPU = false;
+  int isGPU = 0;
+  int isISPC = 0;
 
   for(vector<string> scheduleCommand : scheduleCommands) {
+    std::cout << "running scheduling command: ";
+    for (auto &command : scheduleCommand) {
+      std::cout << command << " ";
+    }
+    std::cout << std::endl;
     string command = scheduleCommand[0];
     scheduleCommand.erase(scheduleCommand.begin());
 
@@ -352,6 +367,16 @@ static bool setSchedulingCommands(vector<vector<string>> scheduleCommands, parse
       IndexVar fused(f);
       stmt = stmt.fuse(findVar(i), findVar(j), fused);
 
+    } else if (command == "loopfuse") {
+      taco_uassert(scheduleCommand.size() == 2) 
+        << "'loopfuse' scheduling directive takes 2 parameters: loopfuse(b, 2)";
+      std::string side = scheduleCommand[0];
+      taco_uassert(side == "b" || side == "f") 
+        << "first parameter must be either 'f' or 'b'";
+
+      int iters = std::stoi(scheduleCommand[1]);
+
+      stmt = loopFusionOverFission(stmt, assignment, side, iters);
     } else if (command == "split") {
       taco_uassert(scheduleCommand.size() == 4)
           << "'split' scheduling directive takes 4 parameters: split(i, i1, i2, splitFactor)";
@@ -536,7 +561,15 @@ static bool setSchedulingCommands(vector<vector<string>> scheduleCommands, parse
         parallel_unit = ParallelUnit::CPUThread;
       } else if (unit == "CPUVector") {
         parallel_unit = ParallelUnit::CPUVector;
-      } else {
+      } else if (unit == "CPUSimd") {
+        isISPC = true;
+        parallel_unit = ParallelUnit::CPUSimd;
+      } 
+      else if (unit == "CPUSpmd") {
+        parallel_unit = ParallelUnit::CPUSpmd;
+        isISPC = true;
+      }
+      else {
         taco_uerror << "Parallel hardware not defined.";
         goto end;
       }
@@ -557,6 +590,8 @@ static bool setSchedulingCommands(vector<vector<string>> scheduleCommands, parse
         goto end;
       }
 
+      std::cout << "stmt before parallelizing the statement: " << stmt << endl;
+      std::cout << "ParallelUnit: " << ParallelUnit_NAMES[(int) parallel_unit] << ", outputRaceStrategy: " << OutputRaceStrategy_NAMES[(int) output_race_strategy] << std::endl;
       stmt = stmt.parallelize(findVar(i), parallel_unit, output_race_strategy);
 
     } else if (command == "assemble") {
@@ -612,7 +647,13 @@ static bool setSchedulingCommands(vector<vector<string>> scheduleCommands, parse
     end:;
   }
 
-  return isGPU;
+  if (isGPU) {
+    return 1;
+  }
+  else if (isISPC) {
+    return 2;
+  }
+  return 0;
 }
 
 int main(int argc, char* argv[]) {
@@ -641,6 +682,7 @@ int main(int argc, char* argv[]) {
   bool color               = true;
   bool readKernels         = false;
   bool cuda                = false;
+  bool ispc                = false;
 
   bool setSchedule         = false;
 
@@ -949,6 +991,10 @@ int main(int argc, char* argv[]) {
     else if ("-cuda" == argName) {
       cuda = true;
     }
+    else if ("-ispc" == argName) {
+      std::cout << "ispc true\n";
+      ispc = true;
+    }
     else if ("-schedule" == argName) {
       vector<string> descriptor = util::split(argValue, ",");
       if (descriptor.size() > 2 || descriptor.empty()) {
@@ -1001,6 +1047,8 @@ int main(int argc, char* argv[]) {
     }
   }
 
+  std::cout << "cuda: " << cuda << ", ispc: " << ispc << std::endl;
+
   // Print compute is the default if nothing else was asked for
   if (!printAssemble && !printEvaluate && !printIterationGraph &&
       !writeCompute && !writeAssemble && !writeKernels && !readKernels &&
@@ -1009,9 +1057,11 @@ int main(int argc, char* argv[]) {
   }
 
   // pre-parse expression, to determine existence and order of loaded tensors
+  std::cout << "pre-parse expression, to determine existence and order of loaded tensors\n";
   map<string,TensorBase> loadedTensors;
   TensorBase temp_tensor;
   parser::Parser temp_parser(exprStr, formats, dataTypes, tensorsDimensions, loadedTensors, 42);
+  std::cout << exprStr << std::endl;
   try {
     temp_parser.parse();
     temp_tensor = temp_parser.getResultTensor();
@@ -1112,33 +1162,61 @@ int main(int argc, char* argv[]) {
   taco_set_parallel_schedule(sched, chunkSize);
   taco_set_num_threads(nthreads);
 
-  IndexStmt stmt =
-      makeConcreteNotation(makeReductionNotation(tensor.getAssignment()));
+  Assignment assignment = tensor.getAssignment();
+  std::cout << "tensor.getAssignment(): " << assignment << std::endl;
+
+  IndexStmt stmt2 = makeReductionNotation(tensor.getAssignment());
+  std::cout << "reducedNotation: " << stmt2 << std::endl;
+  // IndexStmt stmt = 
+  //     makeConcreteNotation(makeReductionNotation(tensor.getAssignment()));
+  IndexStmt stmt = makeConcreteNotation(stmt2);
+  std::cout << "concrete index statement: " << stmt << std::endl;
   stmt = reorderLoopsTopologically(stmt);
 
+  std::cout << "topologically reordered loops statement: " << stmt << std::endl;
+
   if (setSchedule) {
-    cuda |= setSchedulingCommands(scheduleCommands, parser, stmt);
+    int val = setSchedulingCommands(scheduleCommands, parser, stmt, tensor.getAssignment());
+    // stmt = loopFusionOverFission(stmt, tensor.getAssignment());
+    cuda |= (val==1);
+    ispc |= (val==2);
   }
   else {
+    // stmt = loopFusionOverFission(stmt, tensor.getAssignment());
     stmt = insertTemporaries(stmt);
     stmt = parallelizeOuterLoop(stmt);
   }
+  std::cout << "after setting the scheduling commands\n";
+  std::cout << stmt << std::endl;
 
   if (cuda) {
     if (!CUDA_BUILT && benchmark) {
       return reportError("TACO must be built for CUDA (cmake -DCUDA=ON ..) to benchmark", 2);
     }
     set_CUDA_codegen_enabled(true);
+    set_ISPC_codegen_enabled(false);
+  }
+  else if (ispc) {
+    if (!ISPC_BUILT && benchmark) {
+      return reportError("TACO must be built for ISPC (cmake -DISPC=ON ..) to benchmark", 2);
+    }
+    set_CUDA_codegen_enabled(false);
+    set_ISPC_codegen_enabled(true);
   }
   else {
     set_CUDA_codegen_enabled(false);
+    set_ISPC_codegen_enabled(false);
   }
 
+  std::cout << "running scalar promote\n" << std::endl; //
   stmt = scalarPromote(stmt);
+  std::cout << "\nafter scalar promote: \n" << stmt << std::endl << std::endl;
+
   if (printConcrete) {
     cout << stmt << endl;
   }
 
+  // lower index statement to ir statement
   Kernel kernel;
   if (benchmark) {
     if (time) cout << endl;
@@ -1221,9 +1299,15 @@ int main(int argc, char* argv[]) {
     }
   }
   else {
+    std::cout << "lowering stmt: " << stmt << std::endl;
     compute = lower(stmt, prefix+"compute",  computeWithAssemble, true);
     assemble = lower(stmt, prefix+"assemble", true, false);
     evaluate = lower(stmt, prefix+"evaluate", true, true);
+
+    std::cout << "\n\ncompute kernel\n------------\n" << compute << std::endl << std::endl;
+    // compute kernel is the most basic kernel after lowering phase
+
+    std::cout << "\n\nevaluate kernel\n------------\n" << evaluate << std::endl << std::endl;
   }
 
   string packComment =
@@ -1278,6 +1362,7 @@ int main(int argc, char* argv[]) {
   }
 
   bool hasPrinted = false;
+
   std::shared_ptr<ir::CodeGen> codegen = ir::CodeGen::init_default(cout, ir::CodeGen::ImplementationGen);
   codegen->setColor(color);
   if (printAssemble) {
@@ -1298,6 +1383,7 @@ int main(int argc, char* argv[]) {
     }
 
     if (compute.defined()) {
+      std::cout << "Code generation\n";
       codegen->compile(compute, false);
     }
     else {
@@ -1355,7 +1441,7 @@ int main(int argc, char* argv[]) {
   }
 
   IterationGraph iterationGraph;
-  if (printIterationGraph) {
+  if (printIterationGraph) { // print iteration graph
     iterationGraph = IterationGraph::make(tensor.getAssignment());
   }