diff --git a/.gitignore b/.gitignore index 16389f34e..215b56e9a 100644 --- a/.gitignore +++ b/.gitignore @@ -12,3 +12,7 @@ CMakeCache.txt doc apps/tensor_times_vector/tensor_times_vector + +.cache +.vscode +compile_commands.json diff --git a/CMakeLists.txt b/CMakeLists.txt index a6a80d9d1..5c405fb27 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -10,10 +10,12 @@ project(taco LANGUAGES C CXX ) option(CUDA "Build for NVIDIA GPU (CUDA must be preinstalled)" OFF) +option(ISPC "Build for Intel ISPC Compiler (ISPC Compiler must be preinstalled)" OFF) option(PYTHON "Build TACO for python environment" OFF) -option(OPENMP "Build with OpenMP execution support" OFF) +option(OPENMP "Build with OpenMP execution support" ON) option(COVERAGE "Build with code coverage analysis" OFF) set(TACO_FEATURE_CUDA 0) +set(TACO_FEATURE_ISPC 0) set(TACO_FEATURE_OPENMP 0) set(TACO_FEATURE_PYTHON 0) if(CUDA) @@ -22,6 +24,11 @@ if(CUDA) add_definitions(-DCUDA_BUILT) set(TACO_FEATURE_CUDA 1) endif(CUDA) +if(ISPC) + message("-- Searching for ISPC Installation") + add_definitions(-DISPC_BUILT) + set(TACO_FEATURE_ISPC 1) +endif(ISPC) if(OPENMP) message("-- Will use OpenMP for parallel execution") add_definitions(-DUSE_OPENMP) diff --git a/include/taco/codegen/module.h b/include/taco/codegen/module.h index 36eb34f1a..3df7c8e0f 100644 --- a/include/taco/codegen/module.h +++ b/include/taco/codegen/module.h @@ -68,6 +68,7 @@ class Module { private: std::stringstream source; + std::stringstream additional_source; std::stringstream header; std::string libname; std::string tmpdir; diff --git a/include/taco/cuda.h b/include/taco/cuda.h index aad6b5229..9c4a7aae9 100644 --- a/include/taco/cuda.h +++ b/include/taco/cuda.h @@ -9,7 +9,19 @@ #define CUDA_BUILT false #endif +#ifndef ISPC_BUILT + #define ISPC_BUILT false +#endif + namespace taco { + +/// Functions used by taco to interface with ISPC +bool should_use_ISPC_codegen(); +void set_ISPC_codegen_enabled(bool enabled); +bool is_ISPC_code_stream_enabled(); +void set_ISPC_code_stream_enabled(bool enabled); + + /// Functions used by taco to interface with CUDA (especially unified memory) /// Check if should use CUDA codegen bool should_use_CUDA_codegen(); diff --git a/include/taco/index_notation/transformations.h b/include/taco/index_notation/transformations.h index 7aa2579ad..4d6ec6830 100644 --- a/include/taco/index_notation/transformations.h +++ b/include/taco/index_notation/transformations.h @@ -223,6 +223,9 @@ IndexStmt parallelizeOuterLoop(IndexStmt stmt); */ IndexStmt reorderLoopsTopologically(IndexStmt stmt); +IndexStmt loopFusionOverFission(IndexStmt stmt, Assignment assignment, + std::string side, int iters); + /** * Performs scalar promotion so that reductions are done by accumulating into * scalar temporaries whenever possible. diff --git a/include/taco/ir/ir.h b/include/taco/ir/ir.h index f852f26b1..96dc7d034 100644 --- a/include/taco/ir/ir.h +++ b/include/taco/ir/ir.h @@ -591,7 +591,7 @@ struct Switch : public StmtNode { static const IRNodeType _type_info = IRNodeType::Switch; }; -enum class LoopKind {Serial, Static, Dynamic, Runtime, Vectorized, Static_Chunked}; +enum class LoopKind {Serial, Static, Dynamic, Runtime, Vectorized, Static_Chunked, Foreach, Mul_Thread, Init}; /** A for loop from start to end by increment. * A vectorized loop will require the increment to be 1 and the diff --git a/include/taco/ir/ir_printer.h b/include/taco/ir/ir_printer.h index 4e50764e9..c2c505bf5 100644 --- a/include/taco/ir/ir_printer.h +++ b/include/taco/ir/ir_printer.h @@ -16,6 +16,7 @@ class IRPrinter : public IRVisitorStrict { public: IRPrinter(std::ostream& stream); IRPrinter(std::ostream& stream, bool color, bool simplify); + IRPrinter(std::ostream& stream, std::ostream& stream2, bool color, bool simplify); virtual ~IRPrinter(); void setColor(bool color); @@ -72,6 +73,7 @@ class IRPrinter : public IRVisitorStrict { virtual void visit(const Break*); std::ostream &stream; + std::ostream &stream2; int indent; bool color; bool simplify; @@ -109,6 +111,7 @@ class IRPrinter : public IRVisitorStrict { void doIndent(); void printBinOp(Expr a, Expr b, std::string op, Precedence precedence); bool needsParentheses(Precedence precedence); + void sendToStream(std::stringstream &stream); std::string keywordString(std::string); std::string commentString(std::string); diff --git a/include/taco/ir_tags.h b/include/taco/ir_tags.h index 5858a13e3..6a74be173 100644 --- a/include/taco/ir_tags.h +++ b/include/taco/ir_tags.h @@ -9,7 +9,7 @@ namespace taco { /// ParallelUnit::GPUWarp can be optionally used to allow for GPU warp-level primitives /// ParallelUnit::GPUThread causes for every iteration to be executed on a separate GPU thread enum class ParallelUnit { - NotParallel, DefaultUnit, GPUBlock, GPUWarp, GPUThread, CPUThread, CPUVector, CPUThreadGroupReduction, GPUBlockReduction, GPUWarpReduction + NotParallel, DefaultUnit, GPUBlock, GPUWarp, GPUThread, CPUThread, CPUVector, CPUThreadGroupReduction, GPUBlockReduction, GPUWarpReduction, CPUSimd, CPUSpmd }; extern const char *ParallelUnit_NAMES[]; diff --git a/include/taco/lower/lowerer_impl_imperative.h b/include/taco/lower/lowerer_impl_imperative.h index 65f069fda..d743f5875 100644 --- a/include/taco/lower/lowerer_impl_imperative.h +++ b/include/taco/lower/lowerer_impl_imperative.h @@ -499,10 +499,13 @@ class LowererImplImperative : public LowererImpl { bool emitUnderivedGuards = true; + int loopDepth = 0; int inParallelLoopDepth = 0; std::map parallelUnitSizes; std::map parallelUnitIndexVars; + std::map forUnits; // + std::map whereTempsWithLoopDepth; /// Keep track of what IndexVars have already been defined std::set definedIndexVars; diff --git a/include/taco/util/strings.h b/include/taco/util/strings.h index 5dfb2f174..a3c3d863f 100644 --- a/include/taco/util/strings.h +++ b/include/taco/util/strings.h @@ -1,6 +1,7 @@ #ifndef TACO_UTIL_STRINGS_H #define TACO_UTIL_STRINGS_H +#include "taco/cuda.h" #include #include #include @@ -8,6 +9,8 @@ #include #include +#include "taco/type.h" + // To get the value of a compiler macro variable #define STRINGIFY(x) #x #define TO_STRING(x) STRINGIFY(x) @@ -15,6 +18,25 @@ namespace taco { namespace util { +// /// Turn anything except floating points that can be written to a stream +// /// into a string. +// template +// typename std::enable_if::value, std::string>::type +// toStringISPC(const T &val) { + +// std::stringstream sstream; +// if (val == Int32) { +// sstream << "int32"; +// } +// else if (val == Int64) { +// sstream << "int64"; +// } +// else { +// sstream << val; +// } +// return sstream.str(); +// } + /// Turn anything except floating points that can be written to a stream /// into a string. template diff --git a/include/taco/version.h.in b/include/taco/version.h.in index bc5559d7d..8ef507598 100644 --- a/include/taco/version.h.in +++ b/include/taco/version.h.in @@ -20,5 +20,6 @@ #define TACO_FEATURE_OPENMP @TACO_FEATURE_OPENMP@ #define TACO_FEATURE_PYTHON @TACO_FEATURE_PYTHON@ #define TACO_FEATURE_CUDA @TACO_FEATURE_CUDA@ +#define TACO_FEATURE_ISPC @TACO_FEATURE_ISPC@ #endif /* TACO_VERSION_H */ diff --git a/out/taco-uml/._taco.svg b/out/taco-uml/._taco.svg new file mode 100755 index 000000000..e88dbd51b Binary files /dev/null and b/out/taco-uml/._taco.svg differ diff --git a/out/taco-uml/taco.svg b/out/taco-uml/taco.svg new file mode 100644 index 000000000..57f7a18d1 --- /dev/null +++ b/out/taco-uml/taco.svg @@ -0,0 +1,878 @@ +IntrusivePtrT *ptrUncopyableIRNodevirtual void accept(IRVisitorStrict *v) const = 0virtual IRNodeType type_info() const = 0;BaseStmtNodeBaseExprNodeDatatype typeStmtNodevoid accept(IRVisitorStrict *v) constExprNodevoid accept(IRVisitorStrict *v) constIRHandlevoid accept(IRVisitorStrict *v) constExprStmtIRVisitorStrictvirtual void visit(const IRNode*) const = 0IRVisitorvirtual void visit(const IRNode*)IRRewriterExpr exprStmt stmtvirtual void visit(const ExprNode* op)virtual void visit(const StmtNode* op)Expr rewrite(Expr)Stmt rewrite(Stmt)IRPrinterstd::ostream &streamstd::ostream &stream2int indentbool colorbool simplifyenum PrecedencePrecedence parentPrecedence = BOTTOMNameGenerator varNameGeneratorscopedMap<Expr, std::String> varNamesvoid doIndent()void printBinOp(Expr a, Expr b, std::string op, Precedence precedence)void fewMoreMethods()virtual void visit(const ExprNode*)virtual void visit(const StmtNode*)setColor(bool color)print(Stmt)IRVerifierExpressionSimplifierRemoveRedundantStatementsRemoveRedundantLoopsRemoveDuplicateBodyCodeGenCodeGen_CCodeGen_CUDACodeGen_ISPCManageableIndexStmtNodevirtual void accept(IndexStmtVisitorStrict*) const = 0IndexExprNodevirtual void accept(IndexStmtVisitorStrict*) const = 0IndexStmtIndexExprIndexExprVisitorStrictvoid visit(const IndexStmt&)virtual void visit(const AccessNode*) = 0virtual void visit(const LiteralNode*) = 0virtual void visit(const NegNode*) = 0virtual void visit(const AddNode*) = 0virtual void visit(const SubNode*) = 0virtual void visit(const MulNode*) = 0virtual void visit(const DivNode*) = 0virtual void visit(const SqrtNode*) = 0virtual void visit(const CastNode*) = 0virtual void visit(const CallIntrinsicNode*) = 0virtual void visit(const ReductionNode*) = 0IndexStmtVisitorStrictvoid visit(const IndexStmt&)virtual void visit(const AssignmentNode*) = 0virtual void visit(const YieldNode*) = 0virtual void visit(const ForallNode*) = 0virtual void visit(const WhereNode*) = 0virtual void visit(const SequenceNode*) = 0virtual void visit(const AssembleNode*) = 0virtual void visit(const MultiNode*) = 0virtual void visit(const SuchThatNode*) = 0IndexNotationVisitorStrictIndexNotationPrintervoid print(const IndexExpr& expr)void print(const IndexStmt& expr)void visit(const AccessNode* node)void visit(const LiteralNode* node)void visit(const NegNode* node)void visit(const AddNode* node)void visit(const SubNode* node)void visit(const MulNode* node)void visit(const DivNode* node)void visit(const SqrtNode* node)void visit(const CastNode* node)void visit(const CallIntrinsicNode* node)void visit(const UnaryExprNode* node)void visit(const BinaryExprNode* node)void visit(const ReductionNode* node)void visit(const AssignmentNode* node)void visit(const YieldNode* node)void visit(const ForallNode* node)void visit(const WhereNode* node)void visit(const SequenceNode* node)void visit(const AssembleNode* node)void visit(const MultiNode* node)void visit(const SuchThatNode* node)IndexNotationVisitorvirtual void visit(const AccessNode* node)virtual void visit(const LiteralNode* node)virtual void visit(const NegNode* node)virtual void visit(const AddNode* node)virtual void visit(const SubNode* node)virtual void visit(const MulNode* node)virtual void visit(const DivNode* node)virtual void visit(const SqrtNode* node)virtual void visit(const CastNode* node)virtual void visit(const CallIntrinsicNode* node)virtual void visit(const UnaryExprNode* node)virtual void visit(const BinaryExprNode* node)virtual void visit(const ReductionNode* node)virtual void visit(const AssignmentNode* node)virtual void visit(const YieldNode* node)virtual void visit(const ForallNode* node)virtual void visit(const WhereNode* node)virtual void visit(const SequenceNode* node)virtual void visit(const AssembleNode* node)virtual void visit(const MultiNode* node)virtual void visit(const SuchThatNode* node)MatcherIndexExprRewriterStrictIndexExpr exprIndexExpr rewrite(IndexExpr)virtual void visit(const AccessNode* op) = 0virtual void visit(const LiteralNode* op) = 0virtual void visit(const NegNode* op) = 0virtual void visit(const SqrtNode* op) = 0virtual void visit(const AddNode* op) = 0virtual void visit(const SubNode* op) = 0virtual void visit(const MulNode* op) = 0virtual void visit(const DivNode* op) = 0virtual void visit(const CastNode* op) = 0virtual void visit(const CallIntrinsicNode* op) = 0virtual void visit(const ReductionNode* op) = 0IndexStmtRewriterStrictIndexStmt stmtIndexStmt rewrite(IndexStmt)virtual void visit(const AssignmentNode* op) = 0virtual void visit(const YieldNode* op) = 0virtual void visit(const ForallNode* op) = 0virtual void visit(const WhereNode* op) = 0virtual void visit(const SequenceNode* op) = 0virtual void visit(const AssembleNode* op) = 0virtual void visit(const MultiNode* op) = 0virtual void visit(const SuchThatNode* op) = 0IndexNotationRewriterStrictIndexNotationRewritervirtual void visit(const AccessNode* node)virtual void visit(const LiteralNode* node)virtual void visit(const NegNode* node)virtual void visit(const AddNode* node)virtual void visit(const SubNode* node)virtual void visit(const MulNode* node)virtual void visit(const DivNode* node)virtual void visit(const SqrtNode* node)virtual void visit(const CastNode* node)virtual void visit(const CallIntrinsicNode* node)virtual void visit(const UnaryExprNode* node)virtual void visit(const BinaryExprNode* node)virtual void visit(const ReductionNode* node)virtual void visit(const AssignmentNode* node)virtual void visit(const YieldNode* node)virtual void visit(const ForallNode* node)virtual void visit(const WhereNode* node)virtual void visit(const SequenceNode* node)virtual void visit(const AssembleNode* node)virtual void visit(const MultiNode* node)virtual void visit(const SuchThatNode* node)Lowererstd::shared_ptr<LowererImpl> impl;LowererImplclass Visitor;friend class Visitor;std::shared_ptr<Visitor> visitor;virtual ir::Stmt lower(IndexStmt stmt);virtual ir::Expr lower(IndexExpr expr);virtual ir::Expr lowerExpr(IndexExpr expr) = 0;virtual ir::Stmt lowerStmt(IndexStmt stmt) = 0;virtual ir::Stmt lower(IndexStmt stmt, std::string name,bool assemble, bool compute, bool pack, bool unpack) = 0;LowererImplImperativeclass Visitorfiend class Visitorstd::shared_ptr<Visitor> visitorbool assemblebool computevars a_bunch_of_other_fieldsvirtual ir::Stmt lowerExpr(IndexExpr expr);virtual ir::Stmt lowerStmt(IndexStmt stmt);ir::Stmt lower(IndexStmt stmt, std::string name,bool assemble, bool compute, bool pack, bool unpack)Stmt LowererImplImperative::lower(IndexStmt stmt) {return visitor->lower(stmt);}VisitorLowererImpl* implExpr exprStmt stmtvoid visit(const AssignmentNode* node)void visit(const YieldNode* node)void visit(const ForallNode* node)void visit(const WhereNode* node)void visit(const MultiNode* node)void visit(const SuchThatNode* node)void visit(const SequenceNode* node)void visit(const AssembleNode* node)void visit(const AccessNode* node)void visit(const LiteralNode* node)void visit(const NegNode* node)void visit(const AddNode* node)void visit(const SubNode* node)void visit(const MulNode* node)void visit(const DivNode* node)void visit(const SqrtNode* node)void visit(const CastNode* node)void visit(const CallIntrinsicNode* node)void visit(const ReductionNode* node)Visitor(LowererImplImperative* impl)Stmt lower(IndexStmt stmt)Expr lower(IndexExpr expr)Stmt lower(IndexStmt stmt) {this->stmt = Stmt();impl->accessibleIterators.scope();IndexStmtVisitorStrict::visit(stmt);impl->accessibleIterators.unscope();return this->stmt;}contains111111contains11contains11contains11contains11contains11 \ No newline at end of file diff --git a/src/codegen/codegen.cpp b/src/codegen/codegen.cpp index f0c09d98a..6ec54a2f8 100644 --- a/src/codegen/codegen.cpp +++ b/src/codegen/codegen.cpp @@ -2,6 +2,7 @@ #include "taco/cuda.h" #include "codegen_cuda.h" #include "codegen_c.h" +#include "codegen_ispc.h" #include #include @@ -26,6 +27,21 @@ shared_ptr CodeGen::init_default(std::ostream &dest, OutputKind outputK if (should_use_CUDA_codegen()) { return make_shared(dest, outputKind); } + else if (should_use_ISPC_codegen()) { + return make_shared(dest, outputKind); + } + else { + return make_shared(dest, outputKind); + } +} + +shared_ptr CodeGen::init_default(std::ostream &dest, std::ostream &dest2, OutputKind outputKind) { + if (should_use_CUDA_codegen()) { + return make_shared(dest, outputKind); + } + else if (should_use_ISPC_codegen()) { + return make_shared(dest, dest2, outputKind); + } else { return make_shared(dest, outputKind); } @@ -229,6 +245,49 @@ string CodeGen::printTensorProperty(string varname, const GetProperty* op, bool return ret.str(); } +string CodeGen::getUnpackedTensorArgument(string varname, const GetProperty* op, + bool is_output_prop) { + stringstream ret; + ret << ""; + + auto tensor = op->tensor.as(); + if (op->property == TensorProperty::Values) { + // for the values, it's in the last slot + ret << "uniform " << printType(tensor->type, false) << " " << varname << "[]"; + return ret.str(); + } else if (op->property == TensorProperty::ValuesSize) { + ret << "int32 " << varname; + return ret.str(); + } + + // for a Dense level, nnz is an int + // for a Fixed level, ptr is an int + // all others are int* + if (op->property == TensorProperty::Dimension) { + if (op->type == Int32) { + ret << "uniform int32 "; + } else if (op->type == Int64) { + ret << "uniform int64 "; + } else { + ret << "int "; + } + ret << varname; + + } else { + taco_iassert(op->property == TensorProperty::Indices); + if (op->type == Int32) { + ret << "uniform int32 "; + } else if (op->type == Int64) { + ret << "uniform int64 "; + } else { + ret << "uniform int "; + } + ret << varname << "[]"; + } + + return ret.str(); +} + string CodeGen::unpackTensorProperty(string varname, const GetProperty* op, bool is_output_prop) { stringstream ret; @@ -310,13 +369,9 @@ string CodeGen::pointTensorProperty(std::string varname) { return ret.str(); } -// helper to print declarations -string CodeGen::printDecls(map varMap, - vector inputs, vector outputs) { - stringstream ret; - unordered_set propsAlreadyGenerated; - - vector sortedProps; +void CodeGen::getSortedProps(map &varMap, + vector &sortedProps, vector &inputs, + vector &outputs) { for (auto const& p: varMap) { if (p.first.as()) @@ -355,6 +410,17 @@ string CodeGen::printDecls(map varMap, return a->index < b->index; }); +} + +// helper to print declarations +string CodeGen::printDecls(map varMap, + vector inputs, vector outputs) { + stringstream ret; + unordered_set propsAlreadyGenerated; + + vector sortedProps; + getSortedProps(varMap, sortedProps, inputs, outputs); + for (auto prop: sortedProps) { bool isOutputProp = (find(outputs.begin(), outputs.end(), prop->tensor) != outputs.end()); @@ -375,7 +441,6 @@ string CodeGen::printDecls(map varMap, return ret.str(); } - string CodeGen::printPack(map, string> outputProperties, vector outputs) { stringstream ret; diff --git a/src/codegen/codegen.h b/src/codegen/codegen.h index cc25c80d6..db891f995 100644 --- a/src/codegen/codegen.h +++ b/src/codegen/codegen.h @@ -16,9 +16,13 @@ class CodeGen : public IRPrinter { enum CodeGenType { C, CUDA }; CodeGen(std::ostream& stream, CodeGenType type) : IRPrinter(stream), codeGenType(type) {}; - CodeGen(std::ostream& stream, bool color, bool simplify, CodeGenType type) : IRPrinter(stream, color, simplify), codeGenType(type) {}; + CodeGen(std::ostream& stream, bool color, bool simplify, CodeGenType type) + : IRPrinter(stream, color, simplify), codeGenType(type) {}; + CodeGen(std::ostream& stream, std::ostream& stream2, bool color, bool simplify, CodeGenType type) + : IRPrinter(stream, stream2, color, simplify), codeGenType(type) {}; /// Initialize the default code generator static std::shared_ptr init_default(std::ostream &dest, OutputKind outputKind); + static std::shared_ptr init_default(std::ostream &dest, std::ostream &dest2, OutputKind outputKind); /// Compile a lowered function virtual void compile(Stmt stmt, bool isFirst=false) =0; @@ -26,6 +30,9 @@ class CodeGen : public IRPrinter { protected: static bool checkForAlloc(const Function *func); static int countYields(const Function *func); + void getSortedProps(std::map &varMap, + std::vector &sortedProps, std::vector &inputs, + std::vector &outputs); static std::string printCType(Datatype type, bool is_ptr); static std::string printCUDAType(Datatype type, bool is_ptr); @@ -52,6 +59,10 @@ class CodeGen : public IRPrinter { std::string printFuncName(const Function *func, std::map inputMap={}, std::map outputMap={}); + + std::string printTensorProperty(std::string varname, const GetProperty* op, bool is_ptr); + std::string getUnpackedTensorArgument(std::string varname, const GetProperty* op, + bool is_output_prop); void resetUniqueNameCounters(); std::string genUniqueName(std::string name); @@ -61,9 +72,8 @@ class CodeGen : public IRPrinter { private: virtual std::string restrictKeyword() const { return ""; } - std::string printTensorProperty(std::string varname, const GetProperty* op, bool is_ptr); std::string unpackTensorProperty(std::string varname, const GetProperty* op, - bool is_output_prop); + bool is_output_prop); std::string packTensorProperty(std::string varname, Expr tnsr, TensorProperty property, int mode, int index); std::string pointTensorProperty(std::string varname); diff --git a/src/codegen/codegen_c.cpp b/src/codegen/codegen_c.cpp index 2ade9d7f6..89a98a20e 100644 --- a/src/codegen/codegen_c.cpp +++ b/src/codegen/codegen_c.cpp @@ -240,7 +240,10 @@ class CodeGen_C::FindVars : public IRVisitor { }; CodeGen_C::CodeGen_C(std::ostream &dest, OutputKind outputKind, bool simplify) - : CodeGen(dest, false, simplify, C), out(dest), outputKind(outputKind) {} + : CodeGen(dest, false, simplify, C), out(dest), out2(dest), outputKind(outputKind) {} + +CodeGen_C::CodeGen_C(std::ostream &dest, std::ostream &dest2, OutputKind outputKind, bool simplify) + : CodeGen(dest, dest2, false, simplify, C), out(dest), out2(dest2), outputKind(outputKind) {} CodeGen_C::~CodeGen_C() {} diff --git a/src/codegen/codegen_c.h b/src/codegen/codegen_c.h index 55c9d01a8..471f3658a 100644 --- a/src/codegen/codegen_c.h +++ b/src/codegen/codegen_c.h @@ -16,6 +16,7 @@ class CodeGen_C : public CodeGen { /// Initialize a code generator that generates code to an /// output stream. CodeGen_C(std::ostream &dest, OutputKind outputKind, bool simplify=true); + CodeGen_C(std::ostream &dest, std::ostream &dest2, OutputKind outputKind, bool simplify=true); ~CodeGen_C(); /// Compile a lowered function @@ -28,23 +29,24 @@ class CodeGen_C : public CodeGen { protected: using IRPrinter::visit; - void visit(const Function*); - void visit(const VarDecl*); - void visit(const Yield*); - void visit(const Var*); - void visit(const For*); - void visit(const While*); - void visit(const GetProperty*); - void visit(const Min*); - void visit(const Max*); - void visit(const Allocate*); - void visit(const Sqrt*); - void visit(const Store*); - void visit(const Assign*); + virtual void visit(const Function*); + virtual void visit(const VarDecl*); + virtual void visit(const Yield*); + virtual void visit(const Var*); + virtual void visit(const For*); + virtual void visit(const While*); + virtual void visit(const GetProperty*); + virtual void visit(const Min*); + virtual void visit(const Max*); + virtual void visit(const Allocate*); + virtual void visit(const Sqrt*); + virtual void visit(const Store*); + virtual void visit(const Assign*); std::map varMap; std::vector localVars; std::ostream &out; + std::ostream &out2; OutputKind outputKind; diff --git a/src/codegen/codegen_cuda.cpp b/src/codegen/codegen_cuda.cpp index 77cf0cd88..14505f740 100644 --- a/src/codegen/codegen_cuda.cpp +++ b/src/codegen/codegen_cuda.cpp @@ -646,6 +646,7 @@ void CodeGen_CUDA::printDeviceFunctions(const Function* func) { // Collect device functions resetUniqueNameCounters(); deviceFunctionLoopDepth = 0; + // here they calculate the device FunctionCollecor DeviceFunctionCollector deviceFunctionCollector(func->inputs, func->outputs, this); func->body.accept(&deviceFunctionCollector); deviceFunctions = deviceFunctionCollector.blockFors; diff --git a/src/codegen/codegen_ispc.cpp b/src/codegen/codegen_ispc.cpp new file mode 100644 index 000000000..d4f428ccf --- /dev/null +++ b/src/codegen/codegen_ispc.cpp @@ -0,0 +1,1097 @@ +#include +#include +#include +#include +#include +#include + +#include "taco/cuda.h" +#include "taco/ir/ir_printer.h" +#include "taco/ir/ir_visitor.h" +#include "taco/ir/ir_rewriter.h" +#include "taco/ir/simplify.h" + +#include "codegen_c.h" +#include "codegen_ispc.h" +#include "taco/error.h" +#include "taco/util/strings.h" +#include "taco/util/collections.h" + +using namespace std; + +namespace taco { +namespace ir { + +// Some helper functions +namespace { + +// Include stdio.h for printf +// stdlib.h for malloc/realloc +// math.h for sqrt +// MIN preprocessor macro +// This *must* be kept in sync with taco_tensor_t.h +const string cHeaders = + "#ifndef TACO_C_HEADERS\n" + "#define TACO_C_HEADERS\n" + "#include \n" + "#include \n" + "#include \n" + "#include \n" + "#include \n" + "#include \n" + "#include \n" + "#if _OPENMP\n" + "#include \n" + "#endif\n" + "#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b))\n" + "#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b))\n" + "#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a)\n" + "#ifndef TACO_TENSOR_T_DEFINED\n" + "#define TACO_TENSOR_T_DEFINED\n" + "typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t;\n" + "typedef struct {\n" + " int32_t order; // tensor order (number of modes)\n" + " int32_t* dimensions; // tensor dimensions\n" + " int32_t csize; // component size\n" + " int32_t* mode_ordering; // mode storage ordering\n" + " taco_mode_t* mode_types; // mode storage types\n" + " uint8_t*** indices; // tensor index data (per mode)\n" + " uint8_t* vals; // tensor values\n" + " int32_t vals_size; // values array size\n" + "} taco_tensor_t;\n" + "#endif\n" + "#if !_OPENMP\n" + "int omp_get_thread_num() { return 0; }\n" + "int omp_get_max_threads() { return 1; }\n" + "#endif\n" + "int cmp(const void *a, const void *b) {\n" + " return *((const int*)a) - *((const int*)b);\n" + "}\n" + "int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) {\n" + " if (array[arrayStart] >= target) {\n" + " return arrayStart;\n" + " }\n" + " int lowerBound = arrayStart; // always < target\n" + " int upperBound = arrayEnd; // always >= target\n" + " while (upperBound - lowerBound > 1) {\n" + " int mid = (upperBound + lowerBound) / 2;\n" + " int midValue = array[mid];\n" + " if (midValue < target) {\n" + " lowerBound = mid;\n" + " }\n" + " else if (midValue > target) {\n" + " upperBound = mid;\n" + " }\n" + " else {\n" + " return mid;\n" + " }\n" + " }\n" + " return upperBound;\n" + "}\n" + "int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) {\n" + " if (array[arrayEnd] <= target) {\n" + " return arrayEnd;\n" + " }\n" + " int lowerBound = arrayStart; // always <= target\n" + " int upperBound = arrayEnd; // always > target\n" + " while (upperBound - lowerBound > 1) {\n" + " int mid = (upperBound + lowerBound) / 2;\n" + " int midValue = array[mid];\n" + " if (midValue < target) {\n" + " lowerBound = mid;\n" + " }\n" + " else if (midValue > target) {\n" + " upperBound = mid;\n" + " }\n" + " else {\n" + " return mid;\n" + " }\n" + " }\n" + " return lowerBound;\n" + "}\n" + "taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize,\n" + " int32_t* dimensions, int32_t* mode_ordering,\n" + " taco_mode_t* mode_types) {\n" + " taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t));\n" + " t->order = order;\n" + " t->dimensions = (int32_t *) malloc(order * sizeof(int32_t));\n" + " t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t));\n" + " t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t));\n" + " t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***));\n" + " t->csize = csize;\n" + " for (int32_t i = 0; i < order; i++) {\n" + " t->dimensions[i] = dimensions[i];\n" + " t->mode_ordering[i] = mode_ordering[i];\n" + " t->mode_types[i] = mode_types[i];\n" + " switch (t->mode_types[i]) {\n" + " case taco_mode_dense:\n" + " t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **));\n" + " break;\n" + " case taco_mode_sparse:\n" + " t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **));\n" + " break;\n" + " }\n" + " }\n" + " return t;\n" + "}\n" + "void deinit_taco_tensor_t(taco_tensor_t* t) {\n" + " for (int i = 0; i < t->order; i++) {\n" + " free(t->indices[i]);\n" + " }\n" + " free(t->indices);\n" + " free(t->dimensions);\n" + " free(t->mode_ordering);\n" + " free(t->mode_types);\n" + " free(t);\n" + "}\n" + "#endif\n"; + +const string ispcHeaders = + "#define __TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b))\n" + "#define __TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b))\n" + "#define __TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a)\n" + "int __cmp(const void *a, const void *b) {\n" + " return *((const int*)a) - *((const int*)b);\n" + "}\n" + "int __taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) {\n" + " if (array[arrayStart] >= target) {\n" + " return arrayStart;\n" + " }\n" + " int lowerBound = arrayStart; // always < target\n" + " int upperBound = arrayEnd; // always >= target\n" + " while (upperBound - lowerBound > 1) {\n" + " int mid = (upperBound + lowerBound) / 2;\n" + " int midValue = array[mid];\n" + " if (midValue < target) {\n" + " lowerBound = mid;\n" + " }\n" + " else if (midValue > target) {\n" + " upperBound = mid;\n" + " }\n" + " else {\n" + " return mid;\n" + " }\n" + " }\n" + " return upperBound;\n" + "}\n" + "int __taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) {\n" + " if (array[arrayEnd] <= target) {\n" + " return arrayEnd;\n" + " }\n" + " int lowerBound = arrayStart; // always <= target\n" + " int upperBound = arrayEnd; // always > target\n" + " while (upperBound - lowerBound > 1) {\n" + " int mid = (upperBound + lowerBound) / 2;\n" + " int midValue = array[mid];\n" + " if (midValue < target) {\n" + " lowerBound = mid;\n" + " }\n" + " else if (midValue > target) {\n" + " upperBound = mid;\n" + " }\n" + " else {\n" + " return mid;\n" + " }\n" + " }\n" + " return lowerBound;\n" + "}\n\n\n"; + +} // anonymous namespace + + + +// find variables for generating declarations +// generates a single var for each GetProperty +class CodeGen_ISPC::FindVars : public IRVisitor { +public: + map varMap; + + // the variables for which we need to add declarations + map varDecls; + + vector localVars; + + // this maps from tensor, property, mode, index to the unique var + map, string> canonicalPropertyVar; + + // this is for convenience, recording just the properties unpacked + // from the output tensor so we can re-save them at the end + map, string> outputProperties; + + // TODO: should replace this with an unordered set + vector outputTensors; + vector inputTensors; + + CodeGen_ISPC *codeGen; + + // copy inputs and outputs into the map + FindVars(vector inputs, vector outputs, CodeGen_ISPC *codeGen) + : codeGen(codeGen) { + for (auto v: inputs) { + auto var = v.as(); + taco_iassert(var) << "Inputs must be vars in codegen"; + taco_iassert(varMap.count(var)==0) << "Duplicate input found in codegen"; + inputTensors.push_back(v); + varMap[var] = var->name; + } + for (auto v: outputs) { + auto var = v.as(); + taco_iassert(var) << "Outputs must be vars in codegen"; + taco_iassert(varMap.count(var)==0) << "Duplicate output found in codegen"; + outputTensors.push_back(v); + varMap[var] = var->name; + } + } + +protected: + using IRVisitor::visit; + + virtual void visit(const Var *op) { + if (varMap.count(op) == 0) { + varMap[op] = op->is_ptr? op->name : codeGen->genUniqueName(op->name); + } + } + + virtual void visit(const VarDecl *op) { + if (!util::contains(localVars, op->var)) { + localVars.push_back(op->var); + } + op->var.accept(this); + op->rhs.accept(this); + } + + virtual void visit(const For *op) { + if (!util::contains(localVars, op->var)) { + localVars.push_back(op->var); + } + op->var.accept(this); + op->start.accept(this); + op->end.accept(this); + op->increment.accept(this); + op->contents.accept(this); + } + + virtual void visit(const GetProperty *op) { + if (!util::contains(inputTensors, op->tensor) && + !util::contains(outputTensors, op->tensor)) { + // Don't create header unpacking code for temporaries + return; + } + + if (varMap.count(op) == 0) { + auto key = + tuple(op->tensor,op->property, + (size_t)op->mode, + (size_t)op->index); + if (canonicalPropertyVar.count(key) > 0) { + varMap[op] = canonicalPropertyVar[key]; + } else { + auto unique_name = codeGen->genUniqueName(op->name); + canonicalPropertyVar[key] = unique_name; + varMap[op] = unique_name; + varDecls[op] = unique_name; + if (util::contains(outputTensors, op->tensor)) { + outputProperties[key] = unique_name; + } + } + } + } +}; + + +// Finds all for loops tagged with accelerator and adds statements to deviceFunctions +// Also tracks scope of when device function is called and +// tracks which variables must be passed to function. +class CodeGen_ISPC::FunctionCollector : public IRVisitor { +public: + vector threadFors; // contents is device function + vector initFors; // for loops to initialize statements + map scopeMap; + + // the variables to pass to each device function + vector>> functionParameters; + vector> currentParameters; // keep as vector so code generation is deterministic + set currentParameterSet; + + set variablesDeclaredInKernel; + + vector> threadIDVars; + vector> blockIDVars; + vector> warpIDVars; + vector numThreads; + vector numWarps; + + CodeGen_ISPC *codeGen; + // copy inputs and outputs into the map + FunctionCollector(vector inputs, vector outputs, CodeGen_ISPC *codeGen) : codeGen(codeGen) { + inDeviceFunction = false; + for (auto v: inputs) { + auto var = v.as(); + taco_iassert(var) << "Inputs must be vars in codegen"; + taco_iassert(scopeMap.count(var) == 0) << + "Duplicate input found in codegen"; + scopeMap[var] = var->name; + } + for (auto v: outputs) { + auto var = v.as(); + taco_iassert(var) << "Outputs must be vars in codegen"; + taco_iassert(scopeMap.count(var) == 0) << + "Duplicate output found in codegen"; + + scopeMap[var] = var->name; + } + } + +protected: + bool inDeviceFunction; + using IRVisitor::visit; + + virtual void visit(const For *op) { + if (op->parallel_unit == ParallelUnit::CPUSpmd) { + std::cout << "ParallelUnit::CPUSpmd directive found\n"; + + inDeviceFunction = false; + op->var.accept(this); + inDeviceFunction = true; + + threadFors.push_back(op); + std::cout << "scopeMap: [" << scopeMap[op->var] << "], varExpr: [" << op->var << "]\n"; + threadIDVars.push_back(pair(scopeMap[op->var], op->var)); + Expr blockSize = ir::simplify(ir::Div::make(ir::Sub::make(op->end, op->start), op->increment)); + numThreads.push_back(blockSize); + + } + else if (op->parallel_unit == ParallelUnit::CPUSimd) { + std::cout << "************************************************************************** CPUSimd For node\n"; + } + else if (op->kind == LoopKind::Init) { + std::cout << "************************************************************************* Init loop kind found\n"; + initFors.push_back(op); + } + else{ + op->var.accept(this); + } + op->start.accept(this); + op->end.accept(this); + op->increment.accept(this); + op->contents.accept(this); + } + + virtual void visit(const Var *op) { + if (scopeMap.count(op) == 0) { + string name = codeGen->genUniqueName(op->name); + if (!inDeviceFunction) { + scopeMap[op] = name; + } + } + else if (scopeMap.count(op) == 1 && inDeviceFunction && currentParameterSet.count(op) == 0 + && (threadIDVars.empty() || op != threadIDVars.back().second) + && !variablesDeclaredInKernel.count(op)) { + currentParameters.push_back(pair(scopeMap[op], op)); + currentParameterSet.insert(op); + } + } + + virtual void visit(const VarDecl *op) { + if (inDeviceFunction) { + variablesDeclaredInKernel.insert(op->var); + } + op->var.accept(this); + op->rhs.accept(this); + } + + virtual void visit(const GetProperty *op) { + if (scopeMap.count(op->tensor) == 0 && !inDeviceFunction) { + auto key = + tuple(op->tensor,op->property, + (size_t)op->mode, + (size_t)op->index); + auto unique_name = codeGen->genUniqueName(op->name); + scopeMap[op->tensor] = unique_name; + } + else if (scopeMap.count(op->tensor) == 1 && inDeviceFunction && currentParameterSet.count(op->tensor) == 0) { + currentParameters.push_back(pair(op->tensor.as()->name, op->tensor)); + currentParameterSet.insert(op->tensor); + } + } +}; + + +CodeGen_ISPC::CodeGen_ISPC(std::ostream &dest, OutputKind outputKind, bool simplify) + : CodeGen_C(dest, dest, outputKind, simplify) {} + +CodeGen_ISPC::CodeGen_ISPC(std::ostream &dest, std::ostream &dest2, OutputKind outputKind, bool simplify) + : CodeGen_C(dest, dest2, outputKind, simplify) {} + +CodeGen_ISPC::~CodeGen_ISPC() {} + +void CodeGen_ISPC::compile(Stmt stmt, bool isFirst) { + varMap = {}; + localVars = {}; + + if (isFirst) { + // output the headers + out << cHeaders; + + if (&out != &out2) { + out2 << ispcHeaders; + } + } + out << endl; + // generate code for the Stmt + std::cout << "Compiling the code\n"; + stmt.accept(this); +} + + + +string CodeGen_ISPC::printCallISPCFunc(const std::string& funcName, map varMap, + vector &sortedProps) { + std::stringstream ret; + ret << " "; + unordered_set propsAlreadyGenerated; + + ret << "__" << funcName << "("; + + + for (unsigned long i=0; i < sortedProps.size(); i++) { + ret << varMap[sortedProps[i]]; + if (i != sortedProps.size()-1) { + ret << ", "; + } + propsAlreadyGenerated.insert(varMap[sortedProps[i]]); + } + + ret << ");\n"; + return ret.str(); +} + +// varMap is already sorted <- make sure to pass the sorted varMap +void CodeGen_ISPC::printISPCFunc(const Function *func, map varMap, + vector &sortedProps) { + + FunctionCollector functionCollector(func->inputs, func->outputs, this); + func->body.accept(&functionCollector); + + vector inputs = func->inputs; + vector outputs = func->outputs; + unordered_set propsAlreadyGenerated; + + for (unsigned long i=0; i < sortedProps.size(); i++) { + auto prop = sortedProps[i]; + bool isOutputProp = (find(outputs.begin(), outputs.end(), + prop->tensor) != outputs.end()); + + auto var = prop->tensor.as(); + if (var->is_parameter) { + if (isOutputProp) { + funcVariables << " " << printTensorProperty(varMap[prop], prop, false) << ";" << endl; + } else { + break; + } + } else { + funcVariables << getUnpackedTensorArgument(varMap[prop], prop, isOutputProp); + } + propsAlreadyGenerated.insert(varMap[prop]); + + if (i!=sortedProps.size()-1) { + funcVariables << ", "; + } + if (i%2==0) { + funcVariables << "\n\t"; + } + } + + resetUniqueNameCounters(); + + // threadFors code generation + for (size_t i = 0; i < functionCollector.threadFors.size(); i++) { + + const For *threadloop = to(functionCollector.threadFors[i]); + taco_iassert(threadloop->parallel_unit == ParallelUnit::CPUSpmd); + Stmt function = threadloop->contents; + std::cout << "threadloop function: " << function << std::endl; + + out2 << "\nstatic task void __" << func->name << "__ ("; + out2 << funcVariables.str(); + out2 << "\n) {\n\n"; + + indent++; + // output body of the threadloop + taskCode = true; + print(threadloop); + indent--; + out2 << "}\n\n"; + + } + + taskCode = false; + out2 << "export void __" << func->name << " ("; + out2 << funcVariables.str(); + out2 << "\n) {\n\n"; + + indent++; + // output body + print(func->body); + indent--; + out2 << "}\n"; + +} + +void CodeGen_ISPC::sendToStream(std::stringstream &stream) { + if (is_ISPC_code_stream_enabled()) { + this->out2 << stream.str(); + } + else { + CodeGen_C::sendToStream(stream); + } +} + +void CodeGen_ISPC::visit(const Function* func) { + set_ISPC_code_stream_enabled(false); + + // if generating a header, protect the function declaration with a guard + if (func->name == "assemble") { + if (outputKind == HeaderGen) { + out << "#ifndef TACO_GENERATED_" << func->name << "\n"; + out << "#define TACO_GENERATED_" << func->name << "\n"; + } + + int numYields = countYields(func); + emittingCoroutine = (numYields > 0); + funcName = func->name; + labelCount = 0; + + resetUniqueNameCounters(); + FindVars inputVarFinder(func->inputs, {}, this); + func->body.accept(&inputVarFinder); + FindVars outputVarFinder({}, func->outputs, this); + func->body.accept(&outputVarFinder); + + // output function declaration + doIndent(); + out << printFuncName(func, inputVarFinder.varDecls, outputVarFinder.varDecls); + + // if we're just generating a header, this is all we need to do + if (outputKind == HeaderGen) { + out << ";\n"; + out << "#endif\n"; + return; + } + + out << " {\n"; + + indent++; + + // find all the vars that are not inputs or outputs and declare them + resetUniqueNameCounters(); + FindVars varFinder(func->inputs, func->outputs, this); + func->body.accept(&varFinder); + varMap = varFinder.varMap; + localVars = varFinder.localVars; + + // Print variable declarations + out << printDecls(varFinder.varDecls, func->inputs, func->outputs) << endl; + + if (emittingCoroutine) { + out << printContextDeclAndInit(varMap, localVars, numYields, func->name) + << endl; + } + + // output body + print(func->body); + + // output repack only if we allocated memory + if (checkForAlloc(func)) + out << endl << printPack(varFinder.outputProperties, func->outputs); + + if (emittingCoroutine) { + out << printCoroutineFinish(numYields, funcName); + } + + doIndent(); + out << "return 0;\n"; + indent--; + + doIndent(); + out << "}\n"; + return; + + } + + + if (outputKind == HeaderGen) { + out << "#ifndef TACO_GENERATED_" << func->name << "\n"; + out << "#define TACO_GENERATED_" << func->name << "\n"; + } + + int numYields = countYields(func); + emittingCoroutine = (numYields > 0); + funcName = func->name; + labelCount = 0; + + resetUniqueNameCounters(); + FindVars inputVarFinder(func->inputs, {}, this); + func->body.accept(&inputVarFinder); + FindVars outputVarFinder({}, func->outputs, this); + func->body.accept(&outputVarFinder); + + // output function declaration + doIndent(); + out << printFuncName(func, inputVarFinder.varDecls, outputVarFinder.varDecls); + + // if we're just generating a header, this is all we need to do + if (outputKind == HeaderGen) { + out << ";\n"; + out << "#endif\n"; + return; + } + + out << " {\n"; + + indent++; + + // find all the vars that are not inputs or outputs and declare them + resetUniqueNameCounters(); + FindVars varFinder(func->inputs, func->outputs, this); + func->body.accept(&varFinder); + varMap = varFinder.varMap; + localVars = varFinder.localVars; + + // Print variable declarations + out << printDecls(varFinder.varDecls, func->inputs, func->outputs) << endl; + + sortedProps = {}; + vector inputs = func->inputs; + vector outputs = func->outputs; + getSortedProps(varFinder.varDecls, sortedProps, inputs, outputs); + out << printCallISPCFunc(func->name, varFinder.varDecls, sortedProps); + + if (emittingCoroutine) { + out << printContextDeclAndInit(varMap, localVars, numYields, func->name) + << endl; + } + + // output repack only if we allocated memory + if (checkForAlloc(func)) + out << endl << printPack(varFinder.outputProperties, func->outputs); + + if (emittingCoroutine) { + out << printCoroutineFinish(numYields, funcName); + } + + doIndent(); + out << "return 0;\n"; + indent--; + + doIndent(); + out << "}\n\n"; + + set_ISPC_code_stream_enabled(true); + printISPCFunc(func, varFinder.varDecls, sortedProps); + set_ISPC_code_stream_enabled(false); + +} + +void CodeGen_ISPC::visit(const VarDecl* op) { + // std::stringstream stream; + if (is_ISPC_code_stream_enabled()) { + if (emittingCoroutine) { + doIndent(); + op->var.accept(this); + parentPrecedence = Precedence::TOP; + stream2 << " = "; + op->rhs.accept(this); + stream2 << ";"; + stream2 << endl; + } else { + IRPrinter::visit(op); + } + } + else { + CodeGen_C::visit(op); + } + + // sendToStream(stream); +} + +void CodeGen_ISPC::visit(const Yield* op) { + printYield(op, localVars, varMap, labelCount, funcName); +} + +// For Vars, we replace their names with the generated name, +// since we match by reference (not name) +void CodeGen_ISPC::visit(const Var* op) { + if (is_ISPC_code_stream_enabled()) { + taco_iassert(varMap.count(op) > 0) << + "Var " << op->name << " not found in varMap"; + if (emittingCoroutine) { + // out << "TACO_DEREF("; + } + out2 << varMap[op]; + if (emittingCoroutine) { + // out << ")"; + } + } + else { + CodeGen_C::visit(op); + } +} + +static string genVectorizePragma(int width) { + stringstream ret; + ret << "#pragma clang loop interleave(enable) "; + if (!width) + ret << "vectorize(enable)"; + else + ret << "vectorize_width(" << width << ")"; + + return ret.str(); +} + +// static string getParallelizePragma(LoopKind kind) { +// stringstream ret; +// ret << "#pragma omp parallel for schedule"; +// switch (kind) { +// case LoopKind::Static: +// ret << "(static, 1)"; +// break; +// case LoopKind::Dynamic: +// ret << "(dynamic, 1)"; +// break; +// case LoopKind::Runtime: +// ret << "(runtime)"; +// break; +// case LoopKind::Static_Chunked: +// ret << "(static)"; +// break; +// default: +// break; +// } +// return ret.str(); +// } + +// static string getUnrollPragma(size_t unrollFactor) { +// return "#pragma unroll " + std::to_string(unrollFactor); +// } + +static string getAtomicPragma() { + return "#pragma omp atomic"; +} + +// The next two need to output the correct pragmas depending +// on the loop kind (Serial, Static, Dynamic, Vectorized) +// +// Docs for vectorization pragmas: +// http://clang.llvm.org/docs/LanguageExtensions.html#extensions-for-loop-hint-optimizations +void CodeGen_ISPC::visit(const For* op) { + if (!is_ISPC_code_stream_enabled()) { + CodeGen_C::visit(op); + return; + } + doIndent(); + + if (op->kind == LoopKind::Mul_Thread) { + if (!taskCode) { + out2 << "launch[4] " << printCallISPCFunc(funcName+"__", varMap, sortedProps) << "\n"; + return; + } + stream2 << "uniform unsigned int chunk_size = ("; + op->end.accept(this); + stream2 << " - "; + op->start.accept(this); + stream2 << ") / taskCount;\n"; + stream2 << " uniform unsigned int modulo = ("; + op->end.accept(this); + stream2 << " - "; + op->start.accept(this); + stream2 << ") % taskCount;\n"; + + stream2 << " uniform unsigned int start = "; + op->start.accept(this); + stream2 << " + chunk_size * taskIndex;\n"; + + stream2 << " if (taskIndex != 0) {\n"; + stream2 << " start += modulo;\n"; + stream2 << " }\n"; + + stream2 << " uniform unsigned int end = start + chunk_size;\n"; + stream2 << " if (taskIndex == 0) {\n"; + stream2 << " end += modulo;\n"; + stream2 << " }\n\n"; + + stream2 << keywordString(" for") << " ("; + if (!emittingCoroutine) { + if (op->var.type() == Int32) { + stream2 << "int32 "; + } + else if (op->var.type() == Int64) { + stream2 << "int64 "; + } + + } + op->var.accept(this); + stream2 << " = "; + stream2 << "start"; + // op->start.accept(this); + stream2 << keywordString("; "); + op->var.accept(this); + stream2 << " < "; + parentPrecedence = BOTTOM; + stream2 << "end"; + // op->end.accept(this); + stream2 << keywordString("; "); + op->var.accept(this); + + auto lit = op->increment.as(); + if (lit != nullptr && ((lit->type.isInt() && lit->equalsScalar(1)) || + (lit->type.isUInt() && lit->equalsScalar(1)))) { + stream2 << "++"; + } + else { + stream2 << " += "; + op->increment.accept(this); + } + + } + + else if (op->kind == LoopKind::Foreach) { + stream2 << keywordString("foreach") << " ("; + + op->var.accept(this); + stream2 << " = "; + op->start.accept(this); + stream2 << keywordString(" ... "); + op->end.accept(this); + + } else { + stream2 << keywordString("for") << " ("; + if (!emittingCoroutine) { + if (op->var.type() == Int32) { + stream2 << "int32 "; + } + else if (op->var.type() == Int64) { + stream2 << "int64 "; + } + + } + op->var.accept(this); + stream2 << " = "; + op->start.accept(this); + stream2 << keywordString("; "); + op->var.accept(this); + stream2 << " < "; + parentPrecedence = BOTTOM; + op->end.accept(this); + stream2 << keywordString("; "); + op->var.accept(this); + + auto lit = op->increment.as(); + if (lit != nullptr && ((lit->type.isInt() && lit->equalsScalar(1)) || + (lit->type.isUInt() && lit->equalsScalar(1)))) { + stream2 << "++"; + } + else { + stream2 << " += "; + op->increment.accept(this); + } + + } + + stream2 << ") {\n"; + op->contents.accept(this); + doIndent(); + stream2 << "}"; + stream2 << endl; + +} + +void CodeGen_ISPC::visit(const While* op) { + // it's not clear from documentation that clang will vectorize + // while loops + // however, we'll output the pragmas anyway + if (op->kind == LoopKind::Vectorized) { + doIndent(); + out << genVectorizePragma(op->vec_width); + out << "\n"; + } + + CodeGen_C::visit(op); +} + +void CodeGen_ISPC::visit(const GetProperty* op) { + taco_iassert(varMap.count(op) > 0) << + "Property " << Expr(op) << " of " << op->tensor << " not found in varMap"; + if (is_ISPC_code_stream_enabled()) { + out2 << varMap[op]; + } + else { + out << varMap[op]; + } + +} + +void CodeGen_ISPC::visit(const Min* op) { + if (op->operands.size() == 1) { + op->operands[0].accept(this); + return; + } + for (size_t i=0; ioperands.size()-1; i++) { + stream << "TACO_MIN("; + op->operands[i].accept(this); + stream << ","; + } + op->operands.back().accept(this); + for (size_t i=0; ioperands.size()-1; i++) { + stream << ")"; + } +} + +void CodeGen_ISPC::visit(const Max* op) { + if (op->operands.size() == 1) { + op->operands[0].accept(this); + return; + } + for (size_t i=0; ioperands.size()-1; i++) { + stream << "TACO_MAX("; + op->operands[i].accept(this); + stream << ","; + } + op->operands.back().accept(this); + for (size_t i=0; ioperands.size()-1; i++) { + stream << ")"; + } +} + +void CodeGen_ISPC::visit(const Allocate* op) { + + + if (is_ISPC_code_stream_enabled()) { + string elementType = printCType(op->var.type(), false); + doIndent(); + + op->var.accept(this); + stream2 << " = "; + // stream2 << " = ("; + // stream2 << elementType << "*"; + // stream2 << ")"; + if (op->is_realloc) { + stream2 << "realloc("; + op->var.accept(this); + stream2 << ", "; + } + else { + // If the allocation was requested to clear the allocated memory, + // use calloc instead of malloc. + if (op->clear) { + stream2 << "calloc(1, "; + } else { + stream2 << "new "; + } + } + stream2 << elementType << "["; + parentPrecedence = MUL; + op->num_elements.accept(this); + parentPrecedence = TOP; + stream2 << "];"; + stream2 << endl; + + + } else { + CodeGen_C::visit(op); + + } + + +} + +void CodeGen_ISPC::visit(const Sqrt* op) { + taco_tassert(op->type.isFloat() && op->type.getNumBits() == 64) << + "Codegen doesn't currently support non-double sqrt"; + stream << "sqrt("; + op->a.accept(this); + stream << ")"; +} + +void CodeGen_ISPC::visit(const Assign* op) { + if (is_ISPC_code_stream_enabled()) { + doIndent(); + op->lhs.accept(this); + parentPrecedence = Precedence::TOP; + bool printed = false; + if (simplify) { + if (isa(op->rhs)) { + auto add = to(op->rhs); + if (add->a == op->lhs) { + const Literal* lit = add->b.as(); + if (lit != nullptr && ((lit->type.isInt() && lit->equalsScalar(1)) || + (lit->type.isUInt() && lit->equalsScalar(1)))) { + stream2 << "++"; + } + else { + if (op->use_atomics) { + stream2 << " += reduce_add("; + add->b.accept(this); + stream2 << ")"; + } + else { + stream2 << " += "; + add->b.accept(this); + } + } + printed = true; + } + } + else if (isa(op->rhs)) { + auto mul = to(op->rhs); + if (mul->a == op->lhs) { + stream2 << " *= "; + mul->b.accept(this); + printed = true; + } + } + else if (isa(op->rhs)) { + auto bitOr = to(op->rhs); + if (bitOr->a == op->lhs) { + stream2 << " |= "; + bitOr->b.accept(this); + printed = true; + } + } + } + if (!printed) { + stream2 << " = "; + op->rhs.accept(this); + } + + stream2 << ";"; + stream2 << endl; + + IRPrinter::visit(op); + } + else { + CodeGen_C::visit(op); + + } + + +} + +void CodeGen_ISPC::visit(const Store* op) { + if (is_ISPC_code_stream_enabled()) { + if (op->use_atomics) { + doIndent(); + stream2 << getAtomicPragma() << endl; + } + } + else { + if (op->use_atomics) { + doIndent(); + stream << getAtomicPragma() << endl; + } + } + IRPrinter::visit(op); +} + +} +} diff --git a/src/codegen/codegen_ispc.h b/src/codegen/codegen_ispc.h new file mode 100644 index 000000000..62d2897ca --- /dev/null +++ b/src/codegen/codegen_ispc.h @@ -0,0 +1,68 @@ +#ifndef TACO_BACKEND_ISPC_H +#define TACO_BACKEND_ISPC_H +#include +#include +#include + +#include "taco/ir/ir.h" +#include "taco/ir/ir_printer.h" +#include "codegen_c.h" + +namespace taco { +namespace ir { + + +class CodeGen_ISPC : public CodeGen_C { +public: + /// Initialize a code generator that generates code to an + /// output stream. + CodeGen_ISPC(std::ostream &dest, OutputKind outputKind, bool simplify=true); + CodeGen_ISPC(std::ostream &dest, std::ostream &dest2, OutputKind outputKind, bool simplify=true); + ~CodeGen_ISPC(); + + /// Compile a lowered function + void compile(Stmt stmt, bool isFirst=false); + + /// Generate shims that unpack an array of pointers representing + /// a mix of taco_tensor_t* and scalars into a function call + static void generateShim(const Stmt& func, std::stringstream &stream); + +protected: + using CodeGen_C::visit; + + void visit(const Function*); + void visit(const VarDecl*); + void visit(const Yield*); + void visit(const Var*); + void visit(const For*); + void visit(const While*); + void visit(const GetProperty*); + void visit(const Min*); + void visit(const Max*); + void visit(const Allocate*); + void visit(const Sqrt*); + void visit(const Store*); + void visit(const Assign*); + + Stmt simplifyFunctionBodies(Stmt stmt); + std::string printCallISPCFunc(const std::string& funcName, std::map varMap, + std::vector &sortedProps); + void printISPCFunc(const Function *func, std::map varMap, + std::vector &sortedProps); + + bool taskCode = false; + + std::stringstream funcVariables; + std::vector sortedProps; + + class FindVars; + class FunctionCollector; + +private: + virtual std::string restrictKeyword() const { return "restrict"; } + void sendToStream(std::stringstream &stream); +}; + +} // namespace ir +} // namespace taco +#endif diff --git a/src/codegen/module.cpp b/src/codegen/module.cpp index bd0f487b1..c95999365 100644 --- a/src/codegen/module.cpp +++ b/src/codegen/module.cpp @@ -13,6 +13,7 @@ #include "taco/util/strings.h" #include "taco/util/env.h" #include "codegen/codegen_c.h" +#include "codegen/codegen_ispc.h" #include "codegen/codegen_cuda.h" #include "taco/cuda.h" @@ -42,6 +43,7 @@ void Module::addFunction(Stmt func) { void Module::compileToSource(string path, string prefix) { if (!moduleFromUserSource) { + std::cout << "module not from user source\n"; // create a codegen instance and add all the funcs bool didGenRuntime = false; @@ -50,11 +52,13 @@ void Module::compileToSource(string path, string prefix) { header.clear(); source.str(""); source.clear(); + additional_source.str(""); + additional_source.clear(); taco_tassert(target.arch == Target::C99) << "Only C99 codegen supported currently"; std::shared_ptr sourcegen = - CodeGen::init_default(source, CodeGen::ImplementationGen); + CodeGen::init_default(source, additional_source, CodeGen::ImplementationGen); std::shared_ptr headergen = CodeGen::init_default(header, CodeGen::HeaderGen); @@ -68,8 +72,17 @@ void Module::compileToSource(string path, string prefix) { ofstream source_file; string file_ending = should_use_CUDA_codegen() ? ".cu" : ".c"; source_file.open(path+prefix+file_ending); + if (should_use_ISPC_codegen()) { + source_file << "#include \"" << path+prefix+"_ispc.h\"\n"; + } source_file << source.str(); source_file.close(); + + ofstream additional_source_file; + string file_ending2 = ".ispc"; + additional_source_file.open(path+prefix+file_ending2); + additional_source_file << additional_source.str(); + additional_source_file.close(); ofstream header_file; header_file.open(path+prefix+".h"); @@ -89,6 +102,9 @@ void writeShims(vector funcs, string path, string prefix) { if (should_use_CUDA_codegen()) { CodeGen_CUDA::generateShim(func, shims); } + // else if (should_use_ISPC_codegen()) { + // CodeGen_ISPC::generateShim(func, shims); + // } else { CodeGen_C::generateShim(func, shims); } @@ -98,6 +114,9 @@ void writeShims(vector funcs, string path, string prefix) { if (should_use_CUDA_codegen()) { shims_file.open(path+prefix+"_shims.cpp"); } + // else if (should_use_ISPC_codegen()) { + // shims_file.open(path+prefix+".c", ios::app); + // } else { shims_file.open(path+prefix+".c", ios::app); } @@ -109,6 +128,7 @@ void writeShims(vector funcs, string path, string prefix) { } // anonymous namespace string Module::compile() { + std::cout << "Module::compile\n"; string prefix = tmpdir+libname; string fullpath = prefix + ".so"; @@ -123,6 +143,13 @@ string Module::compile() { file_ending = ".cu"; shims_file = prefix + "_shims.cpp"; } + // else if (should_use_ISPC_codegen()) { + // cc = util::getFromEnv("TACO_ISPC", "ispc"); + // cflags = util::getFromEnv("TACO_ISPC_FLAGS", + // " --target=sse2-i32x4,sse4-i32x8,avx1-i32x8,avx2-i32x8,avx512knl-i32x16,avx512skx-i32x16 --pic -O3 --addressing=64 --arch=x86-64" + // ) + " "; + + // } else { cc = util::getFromEnv(target.compiler_env, target.compiler); cflags = util::getFromEnv("TACO_CFLAGS", @@ -137,17 +164,55 @@ string Module::compile() { string cmd = cc + " " + cflags + " " + prefix + file_ending + " " + shims_file + " " + "-o " + fullpath + " -lm"; + std::cout << "--------------------------------------------------------------------------------tmpdir: " << tmpdir << std::endl; + std::cout << "--------------------------------------------------------------------------------libname: " << libname << std::endl; + std::cout << "--------------------------------------------------------------------------------prefix: " << prefix << std::endl; + std::cout << "--------------------------------------------------------------------------------fullpath: " << fullpath << std::endl; + std::cout << "--------------------------------------------------------------------------------cmd: " << cmd << std::endl; // open the output file & write out the source compileToSource(tmpdir, libname); + // write out the shims writeShims(funcs, tmpdir, libname); + for (auto &statement : funcs) { + std::cout << "----- statement --------" << std::endl; + // std::cout << statement; + std::cout << std::endl; + } + std::cout << tmpdir << std::endl << libname << std::endl; - // now compile it - int err = system(cmd.data()); - taco_uassert(err == 0) << "Compilation command failed:\n" << cmd - << "\nreturned " << err; + if (should_use_ISPC_codegen()) { + string ispc = util::getFromEnv("TACO_ISPC", "ispc"); + string ispcflags = util::getFromEnv("TACO_ISPC_FLAGS", + " --target=sse2-i32x4,sse4-i32x8,avx1-i32x8,avx2-i32x8,avx512knl-i32x16,avx512skx-i32x16 --pic -O3 --addressing=64 --arch=x86-64" + ) + " "; + string cmd = ispc + " " + ispcflags + " -o " + prefix + ".ispc.o " + " --emit-obj " + prefix + ".ispc " + "-h " + prefix + "_ispc.h"; + + // now compile the ispc file to generate the object file and the ispc header file + std::cout << "--------------------------------------------------------------------------------cmd: " << cmd << std::endl; + int err = system(cmd.data()); + taco_uassert(err == 0) << "Compilation command failed:\n" << cmd + << "\nreturned " << err; + + string ispc_object_file = " " + prefix + ".ispc.o "; + string ispc_object_files_for_diff_targets = " " + prefix + ".ispc_* "; + cmd = cc + " " + cflags + " " + + prefix + file_ending + " " + ispc_object_file + ispc_object_files_for_diff_targets + shims_file + " " + + "-o " + fullpath + " -lm -lrt "; + + // now compile the c file linking the ispc object file. ispc header is added to the top of the c file + std::cout << "--------------------------------------------------------------------------------cmd: " << cmd << std::endl; + err = system(cmd.data()); + taco_uassert(err == 0) << "Compilation command failed:\n" << cmd + << "\nreturned " << err; + } else { + // now compile it + int err = system(cmd.data()); + taco_uassert(err == 0) << "Compilation command failed:\n" << cmd + << "\nreturned " << err; + } // use dlsym() to open the compiled library if (lib_handle) { diff --git a/src/cuda.cpp b/src/cuda.cpp index 059c60105..68e49fe98 100644 --- a/src/cuda.cpp +++ b/src/cuda.cpp @@ -7,6 +7,25 @@ using namespace std; namespace taco { + +static bool ISPC_codegen_enabled = ISPC_BUILT; +static bool ISPC_code_stream_enabled = false; +bool should_use_ISPC_codegen() { + return ISPC_codegen_enabled; +} + +bool is_ISPC_code_stream_enabled() { + return ISPC_code_stream_enabled; +} + +void set_ISPC_codegen_enabled(bool enabled) { + ISPC_codegen_enabled = enabled; +} + +void set_ISPC_code_stream_enabled(bool enabled) { + ISPC_code_stream_enabled = enabled; +} + /// Functions used by taco to interface with CUDA (especially unified memory) static bool CUDA_codegen_enabled = CUDA_BUILT; static bool CUDA_unified_memory_enabled = CUDA_BUILT; diff --git a/src/index_notation/index_notation.cpp b/src/index_notation/index_notation.cpp index 51fb8770c..d3483c2d6 100644 --- a/src/index_notation/index_notation.cpp +++ b/src/index_notation/index_notation.cpp @@ -2438,6 +2438,7 @@ bool isConcreteNotation(IndexStmt stmt, std::string* reason) { return isConcrete; } +// make reduction notation Assignment makeReductionNotation(Assignment assignment) { IndexExpr expr = assignment.getRhs(); std::vector free = assignment.getLhs().getIndexVars(); @@ -2513,7 +2514,10 @@ IndexStmt makeReductionNotation(IndexStmt stmt) { return makeReductionNotation(to(stmt)); } +// make concrete notation IndexStmt makeConcreteNotation(IndexStmt stmt) { + std::cout << "concrete notation original assignment: " << stmt << std::endl; + std::string reason; taco_iassert(isReductionNotation(stmt, &reason)) << "Not reduction notation: " << stmt << std::endl << reason; @@ -2521,6 +2525,7 @@ IndexStmt makeConcreteNotation(IndexStmt stmt) { // Free variables and reductions covering the whole rhs become top level loops vector freeVars = to(stmt).getFreeVars(); + std::cout << "free vars: " << freeVars << std::endl; struct RemoveTopLevelReductions : IndexNotationRewriter { using IndexNotationRewriter::visit; @@ -2535,12 +2540,17 @@ IndexStmt makeConcreteNotation(IndexStmt stmt) { topLevelReductions.push_back(reduction.getVar()); rhs = reduction.getExpr(); } + std::cout << "top level reductions: " << topLevelReductions << std::endl; if (rhs != node->rhs) { - stmt = Assignment(node->lhs, rhs, Add()); + stmt = Assignment(node->lhs, rhs, Add()); // write with add + int idx = 0; for (auto& i : util::reverse(topLevelReductions)) { + std::cout << idx << ": " << stmt << std::endl; + idx++; stmt = forall(i, stmt); } + std::cout << idx << ": " << stmt << std::endl; } else { stmt = node; @@ -2548,11 +2558,18 @@ IndexStmt makeConcreteNotation(IndexStmt stmt) { } }; stmt = RemoveTopLevelReductions().rewrite(stmt); + std::cout << "after remove top level reductions: " << stmt << std::endl; + // now we form the stmt in reverse order of freeVars + int idx = 0; for (auto& i : util::reverse(freeVars)) { + std::cout << idx << ": " << stmt << std::endl; stmt = forall(i, stmt); + idx++; } + std::cout << idx << ": " << stmt << std::endl; + std::cout << "replacing reductions with whereas statements\n"; // Replace other reductions with where and forall statements struct ReplaceReductionsWithWheres : IndexNotationRewriter { using IndexNotationRewriter::visit; diff --git a/src/index_notation/index_notation_printer.cpp b/src/index_notation/index_notation_printer.cpp index 0b41615ad..d7ee998ae 100644 --- a/src/index_notation/index_notation_printer.cpp +++ b/src/index_notation/index_notation_printer.cpp @@ -224,9 +224,9 @@ void IndexNotationPrinter::visit(const YieldNode* op) { void IndexNotationPrinter::visit(const ForallNode* op) { os << "forall(" << op->indexVar << ", "; op->stmt.accept(this); - if (op->parallel_unit != ParallelUnit::NotParallel) { + // if (op->parallel_unit != ParallelUnit::NotParallel) { os << ", " << ParallelUnit_NAMES[(int) op->parallel_unit] << ", " << OutputRaceStrategy_NAMES[(int) op->output_race_strategy]; - } + // } os << ")"; } diff --git a/src/index_notation/transformations.cpp b/src/index_notation/transformations.cpp index 47fc1dd55..37a89e617 100644 --- a/src/index_notation/transformations.cpp +++ b/src/index_notation/transformations.cpp @@ -1,9 +1,16 @@ #include "taco/index_notation/transformations.h" +#include "lower/iteration_graph.h" +#include "lower/tensor_path.h" +#include "taco/cuda.h" #include "taco/index_notation/index_notation.h" +#include "taco/index_notation/index_notation_nodes_abstract.h" #include "taco/index_notation/index_notation_rewriter.h" #include "taco/index_notation/index_notation_nodes.h" +#include "taco/index_notation/index_notation_printer.h" #include "taco/error/error_messages.h" +#include "taco/index_notation/intrinsic.h" +#include "taco/type.h" #include "taco/util/collections.h" #include "taco/lower/iterator.h" #include "taco/lower/merge_lattice.h" @@ -305,6 +312,7 @@ IndexStmt Precompute::apply(IndexStmt stmt, std::string* reason) const { IndexExpr e = precompute.getExpr(); IndexVar iw = precompute.getiw(); + // these lines of code looks interesting when creating the producer consumer relationship IndexStmt consumer = forall(i, replace(s, {{e, ws(i)}})); IndexStmt producer = forall(iw, Assignment(ws(iw), replace(e, {{i,iw}}), assign.getOperator())); @@ -592,7 +600,10 @@ IndexStmt Parallelize::apply(IndexStmt stmt, std::string* reason) const { std::string reason = ""; IndexStmt rewriteParallel(IndexStmt stmt) { + std::cout << "1 rewriting IndexStmt to support parallelize schedule directive\n--------------------------------------------\n"; + // std::cout << stmt << std::endl; provGraph = ProvenanceGraph(stmt); + std::cout << "2 rewriting IndexStmt to support parallelize schedule directive\n--------------------------------------------\n"; const auto reductionVars = getReductionVars(stmt); reductionIndexVars.clear(); @@ -607,15 +618,22 @@ IndexStmt Parallelize::apply(IndexStmt stmt, std::string* reason) const { tensorVars = createIRTensorVars(stmt); assembledByUngroupedInsert.clear(); + std::cout << "3 rewriting IndexStmt to support parallelize schedule directive\n--------------------------------------------\n"; for (const auto& result : getAssembledByUngroupedInsertion(stmt)) { assembledByUngroupedInsert.push_back(tensorVars[result]); } + std::cout << "4 rewriting IndexStmt to support parallelize schedule directive\n--------------------------------------------\n"; + // std::cout << stmt << std::endl; return rewrite(stmt); } void visit(const ForallNode* node) { + std::cout << "transformations.cpp void visit(const ForallNode* node)\n"; + std::cout << "node: \n" << node << std::endl; Forall foralli(node); + std::cout << "foralli: \n" << foralli << std::endl; + std::cout << "before stmt update stmt: \n" << stmt << std::endl; IndexVar i = parallelize.geti(); definedIndexVars.insert(foralli.getIndexVar()); @@ -632,6 +650,7 @@ IndexStmt Parallelize::apply(IndexStmt stmt, std::string* reason) const { Iterators iterators(foralli, tensorVars); MergeLattice lattice = MergeLattice::make(foralli, iterators, provGraph, definedIndexVars); + std::cout << "iter: " << i << ", lattice: \n" << lattice << std::endl; // Precondition 2: No coiteration of modes (i.e., merge lattice has // only one iterator) @@ -660,6 +679,7 @@ IndexStmt Parallelize::apply(IndexStmt stmt, std::string* reason) const { MergeLattice underivedLattice = MergeLattice::make(underivedForall, iterators, provGraph, definedIndexVars); + std::cout << "iter: " << i << ", underivedLattice: \n" << lattice << std::endl; // Precondition 3: Every result iterator must have insert capability for (Iterator iterator : underivedLattice.results()) { @@ -721,6 +741,7 @@ IndexStmt Parallelize::apply(IndexStmt stmt, std::string* reason) const { // build consumer that writes from temporary to output, mark consumer as parallel reduction ParallelUnit reductionUnit = ParallelUnit::CPUThreadGroupReduction; if (should_use_CUDA_codegen()) { + std::cout << "should_use_CUDA_codegen() true\n"; if (parentParallelUnits.count(ParallelUnit::GPUWarp)) { reductionUnit = ParallelUnit::GPUWarpReduction; } @@ -728,6 +749,9 @@ IndexStmt Parallelize::apply(IndexStmt stmt, std::string* reason) const { reductionUnit = ParallelUnit::GPUBlockReduction; } } + else { + std::cout << "should_use_CUDA_codegen() false\n"; + } IndexStmt consumer = forall(i, Assignment(assignment->lhs, w(i), assignment->op), reductionUnit, OutputRaceStrategy::ParallelReduction); precomputed_stmt = where(consumer, producer); } @@ -746,8 +770,9 @@ IndexStmt Parallelize::apply(IndexStmt stmt, std::string* reason) const { return; } - + std::cout << "updated stmt: \n"; stmt = forall(i, foralli.getStmt(), parallelize.getParallelUnit(), parallelize.getOutputRaceStrategy(), foralli.getUnrollFactor()); + std::cout << stmt << std::endl; return; } @@ -1181,6 +1206,7 @@ std::ostream& operator<<(std::ostream& os, IndexStmt parallelizeOuterLoop(IndexStmt stmt) { // get outer ForAll + std::cout << "get outer ForAll ----------------- \n"; Forall forall; bool matched = false; match(stmt, @@ -1215,7 +1241,19 @@ IndexStmt parallelizeOuterLoop(IndexStmt stmt) { } return parallelized256; } + else if (should_use_ISPC_codegen()) { + std::cout << "outer loop parallelization for ISPC codegen\n"; + // IndexStmt parallelized = Parallelize(forall.getIndexVar(), ParallelUnit::CPUSpmd, OutputRaceStrategy::NoRaces).apply(stmt, &reason); + // if (parallelized == IndexStmt()) { + // // can't parallelize + // return stmt; + // } + // return parallelized; + + return stmt; + } else { + std::cout << "outer loop parallelization for CPU codgen index statement\n"; IndexStmt parallelized = Parallelize(forall.getIndexVar(), ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces).apply(stmt, &reason); if (parallelized == IndexStmt()) { // can't parallelize @@ -1320,8 +1358,669 @@ topologicallySort(map> hardDeps, return sortedVars; } +bool checkFromBack(const TensorPath& resultTensorPath, + const vector& tensorPaths, + string& removedAccessNode, + vector& producerVars, + vector& consumerVars, + vector& modifiedResultIndexesAccessed, + vector& sortedAllIndexes) { + + std::cout << "check from back function execution\n"; + + const std::vector& resultIndexesVisited = resultTensorPath.getVariables(); + IndexVar lastVisitedIndexVar = resultIndexesVisited.back(); + + std::cout << "last visited index variable: " << lastVisitedIndexVar << std::endl; + + bool onlyLastTensorContainLastIndexOfOutput = true; + bool fissionFromBack = false; + + // check from the back + for (unsigned long i=0; i& indexesVisited = otherIndexPaths.getVariables(); + cout << "index paths: " << otherIndexPaths << endl; + + // if (i < tensorPaths.size()-1) { + // check if other tensors also contain last index of output tensor + for (auto index : indexesVisited) { + cout << "checking " << index << " " << lastVisitedIndexVar << endl; + if (index == lastVisitedIndexVar) { + onlyLastTensorContainLastIndexOfOutput = false; + } + } + // } + } + + if (onlyLastTensorContainLastIndexOfOutput) { // last accessed tensorVariable + const TensorPath& otherIndexPaths = tensorPaths.back(); + const vector& indexesVisited = otherIndexPaths.getVariables(); + cout << "index paths: " << otherIndexPaths << endl; + + cout << "index variable maybe removed from the back\n"; + auto lastTensorLastVisited = indexesVisited.back(); + cout << "last index last visited " << lastTensorLastVisited << endl; + + if (lastTensorLastVisited == lastVisitedIndexVar) { + cout << "we can diffuse from the back\n"; + fissionFromBack = true; + removedAccessNode = otherIndexPaths.getAccess().getTensorVar().getName(); + cout << "removed access node " << removedAccessNode << endl; + + // mark producer accessed index variables + for (auto indexVar : sortedAllIndexes) { + if (indexVar != lastVisitedIndexVar) { // add everything except the last accessed index + std::cout << "producer vars: " << indexVar << std::endl; + producerVars.push_back(indexVar); + } + } + + for (auto indexVar : sortedAllIndexes) { + if (indexVar != lastVisitedIndexVar) { + if ( + find(resultIndexesVisited.begin(), resultIndexesVisited.end(), indexVar) + != resultIndexesVisited.end() || + find(indexesVisited.begin(), indexesVisited.end(), indexVar) + != indexesVisited.end() + ) { + modifiedResultIndexesAccessed.push_back(indexVar); + } + } + } + + // // get modified index for the intermediate calculated tensor expression + // for (unsigned long j=0; j& tensorPaths, + string& removedAccessNode, + vector& producerVars, + vector& consumerVars, + vector& modifiedResultIndexesAccessed, + vector& sortedAllIndexes) { + + std::cout << "check from front function execution\n"; + + const std::vector& resultIndexesVisited = resultTensorPath.getVariables(); + IndexVar firstVisitedIndexVar = resultIndexesVisited.front(); + + std::cout << "first fisited index variable: " << firstVisitedIndexVar << std::endl; + std::cout << "tensor path size: " << tensorPaths.size() << std::endl; + + bool onlyFirstTensorContainFirstIndexOfOutput = true; + bool fissionFromFront = false; + + // check from the front + for (long i=tensorPaths.size()-1; i>0; i--) { // change tensor paths to recursively use the functionality + std::cout << "i: " << i << std::endl; + const TensorPath& otherIndexPaths = tensorPaths.at(i); + const vector& indexesVisited = otherIndexPaths.getVariables(); + cout << "index paths: " << otherIndexPaths << endl; + + if (i != 0) { // check if other tensors also contain last index of output tensor + for (auto index : indexesVisited) { + cout << "checking " << index << " " << firstVisitedIndexVar << endl; + if (index == firstVisitedIndexVar) { + onlyFirstTensorContainFirstIndexOfOutput = false; + } + } + } + } + + + if (onlyFirstTensorContainFirstIndexOfOutput) { // last accessed tensorVariable + const TensorPath& otherIndexPaths = tensorPaths.front(); + const vector& indexesVisited = otherIndexPaths.getVariables(); + cout << "index paths: " << otherIndexPaths << endl; + + cout << "index variable maybe removed from the front\n"; + auto firstTensorFirstVisited = indexesVisited.front(); + cout << "first index first visited " << firstTensorFirstVisited << endl; + + if (firstTensorFirstVisited == firstVisitedIndexVar) { + cout << "we can diffuse from the front\n"; + fissionFromFront = true; + removedAccessNode = otherIndexPaths.getAccess().getTensorVar().getName(); + cout << "removed access node " << removedAccessNode << endl; + + // mark producer accessed index variables + for (auto indexVar : sortedAllIndexes) { + if (indexVar != firstVisitedIndexVar) { // add everything except the first accessed index + producerVars.emplace_back(indexVar); + } + } + + for (auto indexVar : sortedAllIndexes) { + if (indexVar != firstVisitedIndexVar) { + if ( + find(resultIndexesVisited.begin(), resultIndexesVisited.end(), indexVar) + != resultIndexesVisited.end() || + find(indexesVisited.begin(), indexesVisited.end(), indexVar) + != indexesVisited.end() + ) { + modifiedResultIndexesAccessed.push_back(indexVar); + } + } + } + + for (auto& idx : modifiedResultIndexesAccessed) { + std::cout << "modifiedResultIndexesAccessed: " << idx << std::endl; + } + + // get modified index for the intermediate calculated tensor expression + // for (unsigned long j=0; j forallParallelUnit; + map forallOutputRaceStrategy; + vector sortedIndexes; + Assignment innerBody; + + SortedIndexVars() {}; + + void visit(const ForallNode* node) { + Forall forallNode(node); + IndexVar i = forallNode.getIndexVar(); + std::cout << forallNode << std::endl; + + sortedIndexes.push_back(i); + forallParallelUnit[i] = forallNode.getParallelUnit(); + forallOutputRaceStrategy[i] = forallNode.getOutputRaceStrategy(); + + if (isa(forallNode.getStmt())) { + cout << "assignment node found: " << forallNode.getStmt() << endl;; + innerBody = to(forallNode.getStmt()); + return; // Only reorder first contiguous section of ForAlls + } + + IndexNotationVisitor::visit(node); + } + }; + + std::cout << "traversing through the index statement\n"; + SortedIndexVars sortedIndexVars; + stmt.accept(&sortedIndexVars); + std::cout << std::endl; + + struct IndexExprBuilder : public IndexNotationVisitor { + + using IndexNotationVisitor::visit; + vector accessLeftToRight; + map>> indexDimensionsMap; + + void visit(const AccessNode* node) { + Access accessNode(node); + std::cout << "access node: " << accessNode << std::endl; + accessLeftToRight.push_back(accessNode); + + TensorVar tensorVar = accessNode.getTensorVar(); + + for (unsigned long i=0; i < accessNode.getIndexVars().size(); i++) { + auto var = accessNode.getIndexVars()[i]; + + if (indexDimensionsMap.find(var) != indexDimensionsMap.end()) { + indexDimensionsMap[var].emplace_back( + pair(tensorVar.getType().getShape().getDimension(i), + tensorVar.getType())); + } + else { + indexDimensionsMap[var] = { + pair( + tensorVar.getType().getShape().getDimension(i), + tensorVar.getType()) + }; + } + } + + } + + }; + + IndexExpr rhsExpr = assignment.getRhs(); + Access lhsAccess = to(assignment.getLhs()); + std::cout << "right hand side expression: " << rhsExpr << std::endl; + IndexExprBuilder indexExprBuilder; + rhsExpr.accept(&indexExprBuilder); + TensorVar resultVar = lhsAccess.getTensorVar(); + + for (auto item : indexExprBuilder.indexDimensionsMap) { + auto indexVar = item.first; + cout << "var: " << indexVar << " "; + for (auto elem : item.second) { + cout << elem.first << " " << elem.second << " " ; + } + cout << endl; + } + + + // now I have the iteration graph + IterationGraph iterationGraph = IterationGraph::make(assignment); + std::cout << "/*******************************************/\n"; + std::cout << "/********** ITERATION GRAPH ****************/\n"; + std::cout << "/*******************************************/\n"; + std::cout << iterationGraph << std::endl; + + const TensorPath& resultTensorPath = iterationGraph.getResultTensorPath(); + const std::vector& tensorPaths = iterationGraph.getTensorPaths(); + + + string removedAccessNode; + vector producerVars; // producer accessed index variables + vector consumerVars; // consumer accessed index variables + vector fusedVars; + vector modifiedResultIndexesAccessed; + bool fissionFromBack = false; + if (side == "b") { + fissionFromBack = true; + } + + if (fissionFromBack) { + fissionFromBack = checkFromBack(resultTensorPath, tensorPaths, + removedAccessNode, producerVars, consumerVars, + modifiedResultIndexesAccessed, sortedIndexVars.sortedIndexes + ); + } + + vector newAccessDims{}; + for (auto var : modifiedResultIndexesAccessed) { + auto item = indexExprBuilder.indexDimensionsMap[var]; + cout << "shared vars: " << var << endl; + newAccessDims.emplace_back(item[0].first); + } + TensorVar newAccessVar(resultVar.getName() + "_inner", + Type(resultVar.getType().getDataType(), newAccessDims)); + Access newResultAccess(newAccessVar, modifiedResultIndexesAccessed); + cout << "new access variable for iterative apply: " << newResultAccess << std::endl; + + bool fissionFromFront = false; + if (side == "f") { + fissionFromFront = true; + } + if (fissionFromBack == false && fissionFromFront) { + fissionFromFront = checkFromFront(resultTensorPath, tensorPaths, + removedAccessNode, producerVars, consumerVars, + modifiedResultIndexesAccessed, sortedIndexVars.sortedIndexes + ); + } + + if (!fissionFromBack && !fissionFromFront) { + cout << "fission operation cannot be performed from the back\n"; + return stmt; + } + + if (fissionFromBack) { + std::cout << "fission from the back is possible\n"; + } + if (fissionFromFront) { + std::cout << "fission from the front is possible\n"; + } + + // // check from the front + // struct IndexExprSeparator : public IndexNotationVisitor { + + // using IndexNotationVisitor::visit; + // vector accessLeftToRight; + + // void visit(const MulNode* node) { + // Mul mulNode(node); + // IndexExpr lhs = mulNode.getA(); + // IndexExpr rhs = mulNode.getB(); + // std::cout << "access node: " << accessNode << std::endl; + // accessLeftToRight.push_back(accessNode); + // } + + // }; + + + cout << "\n\nProducer accessed index variables\n"; + auto it = producerVars.begin(); + for (; it != producerVars.end(); it++) { + cout << *it << endl; + } + cout << "\n\nConsumer accessed index variables\n"; + it = consumerVars.begin(); + for (; it != consumerVars.end(); it++) { + cout << *it << endl; + } + cout << endl << endl; + + // check common vars that can be fused + for (auto var : sortedIndexVars.sortedIndexes) { + if (find(producerVars.begin(), producerVars.end(), var) != producerVars.end() && + find(consumerVars.begin(), consumerVars.end(), var) != consumerVars.end()) { + fusedVars.emplace_back(var); + } + else { + break; + } + } + + for (auto& fv : fusedVars) { + std::cout << "fusable vars: " << fv << std::endl; + } + + vector sharedVars; + for (auto var : sortedIndexVars.sortedIndexes) { + if (find(fusedVars.begin(), fusedVars.end(), var) == fusedVars.end() && + find(producerVars.begin(), producerVars.end(), var) != producerVars.end() && + find(consumerVars.begin(), consumerVars.end(), var) != consumerVars.end() + ) { + sharedVars.emplace_back(var); + } + } + + for (auto& sv : sharedVars) { + std::cout << "shared vars: " << sv << std::endl; + } + + vector sharedDims{}; + for (auto var : sharedVars) { + auto item = indexExprBuilder.indexDimensionsMap[var]; + cout << "shared vars: " << var << endl; + sharedDims.emplace_back(item[0].first); + } + + + // get removing tensorvars and workspace dimension + const Type& type = resultTensorPath.getAccess().getTensorVar().getType(); + const Format& format = resultTensorPath.getAccess().getTensorVar().getFormat(); + TensorVar intermediateTensor("ws", type, format); + cout << intermediateTensor << endl; + + // TensorVar A("A", Type(), taco::dense); + TensorVar tempVar("t" + resultVar.getName(), + Type(resultVar.getType().getDataType(), sharedDims)); + cout << "tensor order: " << tempVar.getOrder() << endl; + cout << "tensor format: " << tempVar.getFormat() << endl; + cout << "format order: " << tempVar.getFormat().getOrder() << endl; + + // TensorVar* a = new TensorVar("A", Type()); + // TensorVar ws("ws", Type(type(), {jdim}) ); + + // get removing indexExpr and the rest of the indexExpr + Access workspace(tempVar, sharedVars); + std::cout << "workspace access tensor: " << workspace << std::endl; + + + + // construct producer expression right hand side + cout << "generating consumer expression\n"; + IndexExpr producerExpr; + int num_muls = 0; + for (Access accessNode : indexExprBuilder.accessLeftToRight) { + std::cout << "accessNodes: " << accessNode << endl; + if (removedAccessNode != accessNode.getTensorVar().getName()) { + if (producerExpr == NULL) { + std::cout << "index expression is null"; + producerExpr = accessNode; + std::cout << "producerExpr: " << producerExpr << std::endl; + } else { + num_muls++; + producerExpr = producerExpr * accessNode; + std::cout << "producerExpr: " << producerExpr << std::endl; + } + } + } + std::cout << producerExpr << std::endl; + Assignment producerAssignment(newResultAccess, + producerExpr); + std::cout << "new inner assignment statement: " << producerAssignment << std::endl; + Assignment producerInnerBody(workspace, + producerExpr, + sortedIndexVars.innerBody.getOperator() + ); + std::cout << "producerInnerBody: " << producerInnerBody << std::endl; + + // construct consumer expression right hand side + IndexExpr consumerExpr; + if (fissionFromBack) { + consumerExpr = workspace; + } + cout << "generating consumer expression: " << consumerExpr << std::endl; + for (Access accessNode : indexExprBuilder.accessLeftToRight) { + TensorVar tv = accessNode.getTensorVar(); + std::cout << "accessNodes: " << accessNode << endl; + if (removedAccessNode == accessNode.getTensorVar().getName()) { + if (consumerExpr == NULL) { + std::cout << "index expression is null"; + consumerExpr = accessNode; + std::cout << "consumerExpr: " << consumerExpr << std::endl; + } else { + consumerExpr = consumerExpr * accessNode; + std::cout << "consumerExpr: " << consumerExpr << std::endl; + } + } + } + if (fissionFromFront) { + consumerExpr = consumerExpr * workspace; + } + Assignment consumerInnerBody(lhsAccess, + consumerExpr, + sortedIndexVars.innerBody.getOperator() + ); + + cout << "Producer inner body: " << producerInnerBody << endl; + cout << "Consumer inner body: " << consumerInnerBody << endl; + + // rewrite indexstmt + // Reorder Foralls use a rewriter in case new nodes introduced outside of Forall + struct ProducerConsumerRewriter : public IndexNotationRewriter { + using IndexNotationRewriter::visit; + + const vector& producerConsumerVars; + const vector& fusedVars; + IndexStmt innerBody; + const map forallParallelUnit; + const map forallOutputRaceStrategy; + + ProducerConsumerRewriter(const vector& producerConsumerVars, + const vector& fusedVars, IndexStmt innerBody, + const map forallParallelUnit, + const map forallOutputRaceStrategy) + : producerConsumerVars(producerConsumerVars), fusedVars(fusedVars), innerBody(innerBody), + forallParallelUnit(forallParallelUnit), forallOutputRaceStrategy(forallOutputRaceStrategy) { + } + + void visit(const ForallNode* node) { + Forall foralli(node); + IndexVar i = foralli.getIndexVar(); + cout << "going through var: " << i << endl; + + // first forall must be in collected variables + // taco_iassert(util::contains(producerVars, i)); + // std::cout << "\ninner body of the statement\n" << innerBody; + // // done in reverse order? + // for (auto it = sortedVars.rbegin(); it != sortedVars.rend(); ++it) { + // stmt = forall(*it, stmt, forallParallelUnit.at(*it), forallOutputRaceStrategy.at(*it), foralli.getUnrollFactor()); + // } + stmt = rewrite(foralli.getStmt()); + cout << "after rewrite statement: " << stmt << endl; + + // omit the index variables in the fusedVar list + if (find(fusedVars.begin(), fusedVars.end(), i) == fusedVars.end() && + find(producerConsumerVars.begin(), producerConsumerVars.end(), i) != producerConsumerVars.end()) { + stmt = forall(i, stmt, forallParallelUnit.at(i), forallOutputRaceStrategy.at(i), foralli.getUnrollFactor()); + } + } + + void visit (const AssignmentNode* node) { + cout << "assignment node: " << node << endl; + stmt = innerBody; + cout << "producerStmt: " << innerBody << endl; + cout << "stmt: " << stmt << endl; + } + + }; + ProducerConsumerRewriter producerRewriter(producerVars, fusedVars, + producerInnerBody, + sortedIndexVars.forallParallelUnit, + sortedIndexVars.forallOutputRaceStrategy); + IndexStmt producerStmt = producerRewriter.rewrite(stmt); + std::cout << "\nAfter Producer rewriter\n"; + std::cout << producerStmt << std::endl; + if (num_muls > 1) { + producerStmt = loopFusionOverFission(producerStmt, producerInnerBody, + side, iters-1); + } + + + ProducerConsumerRewriter consumerRewriter(consumerVars, fusedVars, + consumerInnerBody, + sortedIndexVars.forallParallelUnit, + sortedIndexVars.forallOutputRaceStrategy); + IndexStmt consumerStmt = consumerRewriter.rewrite(stmt); + std::cout << "\nAfter Consumer rewriter\n"; + std::cout << consumerStmt << std::endl; + + + struct CombineProducerConsumerRewriter : public IndexNotationRewriter { + + const vector& fusedVars; + IndexStmt consumerStmt; + IndexStmt producerStmt; + const map forallParallelUnit; + const map forallOutputRaceStrategy; + + CombineProducerConsumerRewriter(const vector& fusedVars, + IndexStmt producerStmt, IndexStmt consumerStmt, + const map forallParallelUnit, + const map forallOutputRaceStrategy) + : fusedVars(fusedVars), consumerStmt(consumerStmt), producerStmt(producerStmt), + forallParallelUnit(forallParallelUnit), + forallOutputRaceStrategy(forallOutputRaceStrategy) {} + + using IndexNotationRewriter::visit; + + void visit(const ForallNode* node) { + Forall foralli(node); + IndexVar i = foralli.getIndexVar(); + cout << "going through var: " << i << endl; + + // omit the index variables in the fusedVar list + if (find(fusedVars.begin(), fusedVars.end(), i) != fusedVars.end()) { + cout << "fused var in stmt\n"; + stmt = rewrite(foralli.getStmt()); + cout << "rewritten stmt: " << stmt << endl; + stmt = forall(i, stmt, forallParallelUnit.at(i), forallOutputRaceStrategy.at(i), foralli.getUnrollFactor()); + } + else { + cout << "fused var not in stmt\n"; + cout << "producerStmt: " << producerStmt << endl; + cout << "consumerStmt: " << consumerStmt << endl; + stmt = where(consumerStmt, producerStmt); + cout << "where stmt: " << stmt << endl; + } + + cout << "after rewrite statement: " << stmt << endl; + } + + }; + + CombineProducerConsumerRewriter combineRewriter(fusedVars, + producerStmt, consumerStmt, + sortedIndexVars.forallParallelUnit, + sortedIndexVars.forallOutputRaceStrategy); + IndexStmt combinedStmt = combineRewriter.rewrite(stmt); + std::cout << "\nAfter Combine rewriter\n"; + std::cout << combinedStmt << std::endl; + + + return combinedStmt; + +} + IndexStmt reorderLoopsTopologically(IndexStmt stmt) { + std::cout << "executing reorderLoopsTopologically\n"; // Collect tensorLevelVars which stores the pairs of IndexVar and tensor // level that each tensor is accessed at struct DAGBuilder : public IndexNotationVisitor { @@ -1382,8 +2081,11 @@ IndexStmt reorderLoopsTopologically(IndexStmt stmt) { }; Iterators iterators(stmt); + std::cout << "DAG builder with iterators" << std::endl; DAGBuilder dagBuilder(iterators); stmt.accept(&dagBuilder); + std::cout << "After DAGBuilder\n"; + std::cout << stmt << std::endl; // Construct tensor dependencies (sorted list of IndexVars) from tensorLevelVars map>> tensorVarOrders; @@ -1391,6 +2093,7 @@ IndexStmt reorderLoopsTopologically(IndexStmt stmt) { tensorVarOrders[tensorLevelVar.first] = varOrderFromTensorLevels(tensorLevelVar.second); } + // hard dependencies const auto hardDeps = depsFromVarOrders(tensorVarOrders); struct CollectSoftDependencies : public IndexNotationVisitor { @@ -1412,12 +2115,17 @@ IndexStmt reorderLoopsTopologically(IndexStmt stmt) { } } }; + // soft dependencies CollectSoftDependencies collectSoftDeps; stmt.accept(&collectSoftDeps); + std::cout << "After CollectSoftDependencies\n"; + std::cout << stmt << std::endl; + // topological sort const auto sortedVars = topologicallySort(hardDeps, collectSoftDeps.softDeps, dagBuilder.indexVarOriginalOrder); + // rewrite indexstmt // Reorder Foralls use a rewriter in case new nodes introduced outside of Forall struct TopoReorderRewriter : public IndexNotationRewriter { using IndexNotationRewriter::visit; @@ -1440,7 +2148,9 @@ IndexStmt reorderLoopsTopologically(IndexStmt stmt) { // first forall must be in collected variables taco_iassert(util::contains(sortedVars, i)); + std::cout << "\ninner body of the statement\n" << innerBody; stmt = innerBody; + // done in reverse order? for (auto it = sortedVars.rbegin(); it != sortedVars.rend(); ++it) { stmt = forall(*it, stmt, forallParallelUnit.at(*it), forallOutputRaceStrategy.at(*it), foralli.getUnrollFactor()); } @@ -1450,7 +2160,11 @@ IndexStmt reorderLoopsTopologically(IndexStmt stmt) { }; TopoReorderRewriter rewriter(sortedVars, dagBuilder.innerBody, dagBuilder.forallParallelUnit, dagBuilder.forallOutputRaceStrategy); - return rewriter.rewrite(stmt); + IndexStmt stmtChanged = rewriter.rewrite(stmt); + std::cout << "After TopoReorderRewriter\n"; + std::cout << stmtChanged << std::endl; + + return stmtChanged; } IndexStmt scalarPromote(IndexStmt stmt, ProvenanceGraph provGraph, @@ -1478,6 +2192,7 @@ IndexStmt scalarPromote(IndexStmt stmt, ProvenanceGraph provGraph, void visit(const ForallNode* node) { Forall foralli(node); + std::cout << "scalar promote: " << foralli << std::endl; IndexVar i = foralli.getIndexVar(); // Don't allow hoisting out of forall's for GPU warp and block reduction diff --git a/src/ir/ir_printer.cpp b/src/ir/ir_printer.cpp index a1997a9b7..eddca3f29 100644 --- a/src/ir/ir_printer.cpp +++ b/src/ir/ir_printer.cpp @@ -1,6 +1,7 @@ #include #include +#include "taco/cuda.h" #include "taco/ir/ir.h" #include "taco/ir/ir_printer.h" #include "taco/ir/simplify.h" @@ -34,7 +35,11 @@ IRPrinter::IRPrinter(ostream &s) : IRPrinter(s, false, false) { } IRPrinter::IRPrinter(ostream &s, bool color, bool simplify) - : stream(s), indent(0), color(color), simplify(simplify) { + : stream(s), stream2(s), indent(0), color(color), simplify(simplify) { +} + +IRPrinter::IRPrinter(ostream &s, ostream &s2, bool color, bool simplify) + : stream(s), stream2(s2), indent(0), color(color), simplify(simplify) { } IRPrinter::~IRPrinter() { @@ -59,79 +64,169 @@ void IRPrinter::print(Stmt stmt) { } void IRPrinter::visit(const Literal* op) { - if (color) { - stream << blue ; - } - - switch (op->type.getKind()) { - case Datatype::Bool: - stream << op->getValue(); - break; - case Datatype::UInt8: - stream << static_cast(op->getValue()); - break; - case Datatype::UInt16: - stream << op->getValue(); - break; - case Datatype::UInt32: - stream << op->getValue(); - break; - case Datatype::UInt64: - stream << op->getValue(); - break; - case Datatype::UInt128: - taco_not_supported_yet; - break; - case Datatype::Int8: - stream << static_cast(op->getValue()); - break; - case Datatype::Int16: - stream << op->getValue(); - break; - case Datatype::Int32: - stream << op->getValue(); - break; - case Datatype::Int64: - stream << op->getValue(); - break; - case Datatype::Int128: - taco_not_supported_yet; - break; - case Datatype::Float32: - stream << ((op->getValue() != 0.0) - ? util::toString(op->getValue()) : "0.0"); - break; - case Datatype::Float64: - stream << ((op->getValue()!=0.0) - ? util::toString(op->getValue()) : "0.0"); - break; - case Datatype::Complex64: { - std::complex val = op->getValue>(); - stream << val.real() << " + I*" << val.imag(); - } - break; - case Datatype::Complex128: { - std::complex val = op->getValue>(); - stream << val.real() << " + I*" << val.imag(); + if (is_ISPC_code_stream_enabled()) { + if (color) { + stream2 << blue ; + } + + // It seems this is where all the types get printed in the final code generation. + // Come up with a way to generate different values if stream2 is used to generate ispc code + switch (op->type.getKind()) { + case Datatype::Bool: + stream2 << op->getValue(); + break; + case Datatype::UInt8: + stream2 << static_cast(op->getValue()); + break; + case Datatype::UInt16: + stream2 << op->getValue(); + break; + case Datatype::UInt32: + stream2 << op->getValue(); + break; + case Datatype::UInt64: + stream2 << op->getValue(); + break; + case Datatype::UInt128: + taco_not_supported_yet; + break; + case Datatype::Int8: + stream2 << static_cast(op->getValue()); + break; + case Datatype::Int16: + stream2 << op->getValue(); + break; + case Datatype::Int32: + stream2 << op->getValue(); + break; + case Datatype::Int64: + stream2 << op->getValue(); + break; + case Datatype::Int128: + taco_not_supported_yet; + break; + case Datatype::Float32: + stream2 << ((op->getValue() != 0.0) + ? util::toString(op->getValue()) : "0.0"); + break; + case Datatype::Float64: + stream2 << ((op->getValue()!=0.0) + ? util::toString(op->getValue()) : "0.0"); + break; + case Datatype::Complex64: { + std::complex val = op->getValue>(); + stream2 << val.real() << " + I*" << val.imag(); + } + break; + case Datatype::Complex128: { + std::complex val = op->getValue>(); + stream2 << val.real() << " + I*" << val.imag(); + } + break; + case Datatype::Undefined: + taco_ierror << "Undefined type in IR"; + break; + } + + if (color) { + stream2 << nc; + } } - break; - case Datatype::Undefined: - taco_ierror << "Undefined type in IR"; - break; - } - if (color) { - stream << nc; + + + else { + + if (color) { + stream << blue ; + } + + // It seems this is where all the types get printed in the final code generation. + // Come up with a way to generate different values if stream2 is used to generate ispc code + switch (op->type.getKind()) { + case Datatype::Bool: + stream << op->getValue(); + break; + case Datatype::UInt8: + stream << static_cast(op->getValue()); + break; + case Datatype::UInt16: + stream << op->getValue(); + break; + case Datatype::UInt32: + stream << op->getValue(); + break; + case Datatype::UInt64: + stream << op->getValue(); + break; + case Datatype::UInt128: + taco_not_supported_yet; + break; + case Datatype::Int8: + stream << static_cast(op->getValue()); + break; + case Datatype::Int16: + stream << op->getValue(); + break; + case Datatype::Int32: + stream << op->getValue(); + break; + case Datatype::Int64: + stream << op->getValue(); + break; + case Datatype::Int128: + taco_not_supported_yet; + break; + case Datatype::Float32: + stream << ((op->getValue() != 0.0) + ? util::toString(op->getValue()) : "0.0"); + break; + case Datatype::Float64: + stream << ((op->getValue()!=0.0) + ? util::toString(op->getValue()) : "0.0"); + break; + case Datatype::Complex64: { + std::complex val = op->getValue>(); + stream << val.real() << " + I*" << val.imag(); + } + break; + case Datatype::Complex128: { + std::complex val = op->getValue>(); + stream << val.real() << " + I*" << val.imag(); + } + break; + case Datatype::Undefined: + taco_ierror << "Undefined type in IR"; + break; + } + + if (color) { + stream << nc; + } + + } + } void IRPrinter::visit(const Var* op) { - if (varNames.contains(op)) { - stream << varNames.get(op); + if (is_ISPC_code_stream_enabled()) { + if (varNames.contains(op)) { + stream2 << varNames.get(op); + } + else { + stream2 << op->name; + } } else { - stream << op->name; + if (varNames.contains(op)) { + stream << varNames.get(op); + } + else { + stream << op->name; + } } + } void IRPrinter::visit(const Neg* op) { @@ -238,51 +333,101 @@ void IRPrinter::visit(const Cast* op) { } void IRPrinter::visit(const Call* op) { - stream << op->func << "("; - parentPrecedence = Precedence::CALL; - acceptJoin(this, stream, op->args, ", "); - stream << ")"; + if (!is_ISPC_code_stream_enabled()) { + stream << op->func << "("; + parentPrecedence = Precedence::CALL; + acceptJoin(this, stream, op->args, ", "); + stream << ")"; + } else { + // statically added function to the ispc file has __ in the front + stream2 << "__" << op->func << "("; + parentPrecedence = Precedence::CALL; + acceptJoin(this, stream2, op->args, ", "); + stream2 << ")"; + } } void IRPrinter::visit(const IfThenElse* op) { taco_iassert(op->cond.defined()); taco_iassert(op->then.defined()); doIndent(); - stream << keywordString("if "); - stream << "("; - parentPrecedence = Precedence::TOP; - op->cond.accept(this); - stream << ")"; + if (is_ISPC_code_stream_enabled()) { + stream2 << keywordString("if "); + stream2 << "("; + parentPrecedence = Precedence::TOP; + op->cond.accept(this); + stream2 << ")"; + + Stmt scopedStmt = Stmt(to(op->then)->scopedStmt); + if (isa(scopedStmt)) { + stream2 << " {" << endl; + op->then.accept(this); + doIndent(); + stream2 << "}"; + } + else if (isa(scopedStmt)) { + int tmp = indent; + indent = 0; + stream2 << " "; + scopedStmt.accept(this); + indent = tmp; + } + else { + stream2 << endl; + op->then.accept(this); + } - Stmt scopedStmt = Stmt(to(op->then)->scopedStmt); - if (isa(scopedStmt)) { - stream << " {" << endl; - op->then.accept(this); - doIndent(); - stream << "}"; - } - else if (isa(scopedStmt)) { - int tmp = indent; - indent = 0; - stream << " "; - scopedStmt.accept(this); - indent = tmp; + if (op->otherwise.defined()) { + stream2 << "\n"; + doIndent(); + stream2 << keywordString("else"); + stream2 << " {\n"; + op->otherwise.accept(this); + doIndent(); + stream2 << "}"; + } + stream2 << endl; } + + else { - stream << endl; - op->then.accept(this); - } + stream << keywordString("if "); + stream << "("; + parentPrecedence = Precedence::TOP; + op->cond.accept(this); + stream << ")"; - if (op->otherwise.defined()) { - stream << "\n"; - doIndent(); - stream << keywordString("else"); - stream << " {\n"; - op->otherwise.accept(this); - doIndent(); - stream << "}"; + Stmt scopedStmt = Stmt(to(op->then)->scopedStmt); + if (isa(scopedStmt)) { + stream << " {" << endl; + op->then.accept(this); + doIndent(); + stream << "}"; + } + else if (isa(scopedStmt)) { + int tmp = indent; + indent = 0; + stream << " "; + scopedStmt.accept(this); + indent = tmp; + } + else { + stream << endl; + op->then.accept(this); + } + + if (op->otherwise.defined()) { + stream << "\n"; + doIndent(); + stream << keywordString("else"); + stream << " {\n"; + op->otherwise.accept(this); + doIndent(); + stream << "}"; + } + stream << endl; } - stream << endl; + } void IRPrinter::visit(const Case* op) { @@ -345,12 +490,22 @@ void IRPrinter::visit(const Switch* op) { } void IRPrinter::visit(const Load* op) { - parentPrecedence = Precedence::LOAD; - op->arr.accept(this); - stream << "["; - parentPrecedence = Precedence::LOAD; - op->loc.accept(this); - stream << "]"; + if (is_ISPC_code_stream_enabled()) { + parentPrecedence = Precedence::LOAD; + op->arr.accept(this); + stream2 << "["; + parentPrecedence = Precedence::LOAD; + op->loc.accept(this); + stream2 << "]"; + } + else { + parentPrecedence = Precedence::LOAD; + op->arr.accept(this); + stream << "["; + parentPrecedence = Precedence::LOAD; + op->loc.accept(this); + stream << "]"; + } } void IRPrinter::visit(const Malloc* op) { @@ -367,66 +522,149 @@ void IRPrinter::visit(const Sizeof* op) { } void IRPrinter::visit(const Store* op) { - doIndent(); - op->arr.accept(this); - stream << "["; - parentPrecedence = Precedence::TOP; - op->loc.accept(this); - stream << "] = "; - parentPrecedence = Precedence::TOP; - op->data.accept(this); - stream << ";"; - stream << endl; + if (is_ISPC_code_stream_enabled()) { + doIndent(); + op->arr.accept(this); + stream2 << "["; + parentPrecedence = Precedence::TOP; + op->loc.accept(this); + stream2 << "] = "; + parentPrecedence = Precedence::TOP; + op->data.accept(this); + stream2 << ";"; + stream2 << endl; + } + else { + doIndent(); + op->arr.accept(this); + stream << "["; + parentPrecedence = Precedence::TOP; + op->loc.accept(this); + stream << "] = "; + parentPrecedence = Precedence::TOP; + op->data.accept(this); + stream << ";"; + stream << endl; + } + } void IRPrinter::visit(const For* op) { - doIndent(); - stream << keywordString("for") << " (" - << keywordString(util::toString(op->var.type())) << " "; - op->var.accept(this); - stream << " = "; - op->start.accept(this); - stream << keywordString("; "); - op->var.accept(this); - stream << " < "; - parentPrecedence = BOTTOM; - op->end.accept(this); - stream << keywordString("; "); - op->var.accept(this); + // std::cout << "This is IRPrinter::visit For op method\n"; + if (is_ISPC_code_stream_enabled()) { + doIndent(); + stream2 << keywordString("for") << " (" + << keywordString(util::toString(op->var.type())) << " "; + op->var.accept(this); + stream2 << " = "; + op->start.accept(this); + stream2 << keywordString("; "); + op->var.accept(this); + stream2 << " < "; + parentPrecedence = BOTTOM; + op->end.accept(this); + stream2 << keywordString("; "); + op->var.accept(this); + + auto lit = op->increment.as(); + if (lit != nullptr && ((lit->type.isInt() && lit->equalsScalar(1)) || + (lit->type.isUInt() && lit->equalsScalar(1)))) { + stream2 << "++"; + } + else { + stream2 << " += "; + op->increment.accept(this); + } + stream2 << ") {\n"; - auto lit = op->increment.as(); - if (lit != nullptr && ((lit->type.isInt() && lit->equalsScalar(1)) || - (lit->type.isUInt() && lit->equalsScalar(1)))) { - stream << "++"; + op->contents.accept(this); + doIndent(); + stream2 << "}"; + stream2 << endl; } + + else { - stream << " += "; - op->increment.accept(this); + doIndent(); + stream << keywordString("for") << " (" + << keywordString(util::toString(op->var.type())) << " "; + op->var.accept(this); + stream << " = "; + op->start.accept(this); + stream << keywordString("; "); + op->var.accept(this); + stream << " < "; + parentPrecedence = BOTTOM; + op->end.accept(this); + stream << keywordString("; "); + op->var.accept(this); + + auto lit = op->increment.as(); + if (lit != nullptr && ((lit->type.isInt() && lit->equalsScalar(1)) || + (lit->type.isUInt() && lit->equalsScalar(1)))) { + stream << "++"; + } + else { + stream << " += "; + op->increment.accept(this); + } + stream << ") {\n"; + + op->contents.accept(this); + doIndent(); + stream << "}"; + stream << endl; } - stream << ") {\n"; - op->contents.accept(this); - doIndent(); - stream << "}"; - stream << endl; +} + +void IRPrinter::sendToStream(std::stringstream &stream) { + if (is_ISPC_code_stream_enabled()) { + this->stream2 << stream.str(); + } + else { + this->stream << stream.str(); + } } void IRPrinter::visit(const While* op) { - doIndent(); - stream << keywordString("while "); - stream << "("; - parentPrecedence = Precedence::TOP; - op->cond.accept(this); - stream << ")"; - stream << " {\n"; - op->contents.accept(this); - doIndent(); - stream << "}"; - stream << endl; + // std::stringstream stream; + if (is_ISPC_code_stream_enabled()) { + doIndent(); + stream2 << keywordString("while "); + stream2 << "("; + parentPrecedence = Precedence::TOP; + op->cond.accept(this); + stream2 << ")"; + stream2 << " {\n"; + op->contents.accept(this); + doIndent(); + stream2 << "}"; + stream2 << endl; + } + else { + doIndent(); + stream << keywordString("while "); + stream << "("; + parentPrecedence = Precedence::TOP; + op->cond.accept(this); + stream << ")"; + stream << " {\n"; + op->contents.accept(this); + doIndent(); + stream << "}"; + stream << endl; + } + // sendToStream(stream); } void IRPrinter::visit(const Block* op) { - acceptJoin(this, stream, op->contents, ""); + if (is_ISPC_code_stream_enabled()) { + acceptJoin(this, stream2, op->contents, ""); + } + else { + acceptJoin(this, stream, op->contents, ""); + } } void IRPrinter::visit(const Scope* op) { @@ -438,85 +676,140 @@ void IRPrinter::visit(const Scope* op) { } void IRPrinter::visit(const Function* op) { - stream << keywordString("void ") << op->name; - stream << "("; - if (op->outputs.size() > 0) stream << "Tensor "; - acceptJoin(this, stream, op->outputs, ", Tensor "); - if (op->outputs.size() > 0 && op->inputs.size()) stream << ", "; - if (op->inputs.size() > 0) stream << "Tensor "; - acceptJoin(this, stream, op->inputs, ", Tensor "); - stream << ") {" << endl; + if (is_ISPC_code_stream_enabled()) { + stream2 << keywordString("void ") << op->name; + stream2 << "("; + if (op->outputs.size() > 0) stream2 << "Tensor "; + acceptJoin(this, stream2, op->outputs, ", Tensor "); + if (op->outputs.size() > 0 && op->inputs.size()) stream2 << ", "; + if (op->inputs.size() > 0) stream2 << "Tensor "; + acceptJoin(this, stream2, op->inputs, ", Tensor "); + stream2 << ") {" << endl; + + resetNameCounters(); + op->body.accept(this); - resetNameCounters(); - op->body.accept(this); + doIndent(); + stream2 << "}"; + } + else { + stream << keywordString("void ") << op->name; + stream << "("; + if (op->outputs.size() > 0) stream << "Tensor "; + acceptJoin(this, stream, op->outputs, ", Tensor "); + if (op->outputs.size() > 0 && op->inputs.size()) stream << ", "; + if (op->inputs.size() > 0) stream << "Tensor "; + acceptJoin(this, stream, op->inputs, ", Tensor "); + stream << ") {" << endl; + + resetNameCounters(); + op->body.accept(this); + + doIndent(); + stream << "}"; + } - doIndent(); - stream << "}"; } void IRPrinter::visit(const VarDecl* op) { - doIndent(); - stream << keywordString(util::toString(op->var.type())); - taco_iassert(isa(op->var)); - if (to(op->var)->is_ptr) { - stream << "* restrict"; - } - stream << " "; - string varName = varNameGenerator.getUniqueName(util::toString(op->var)); - varNames.insert({op->var, varName}); - op->var.accept(this); - parentPrecedence = Precedence::TOP; - stream << " = "; - op->rhs.accept(this); - stream << ";"; - stream << endl; + if (is_ISPC_code_stream_enabled()) { + doIndent(); + if (op->var.type() == Int32) { + stream2 << keywordString("int32"); + } + else if (op->var.type() == Int64) { + stream2 << keywordString("int64"); + } else { + stream2 << keywordString(util::toString(op->var.type())); + } + taco_iassert(isa(op->var)); + if (to(op->var)->is_ptr) { + stream2 << "* "; // removed restrict keyword from here + } + stream2 << " "; + string varName = varNameGenerator.getUniqueName(util::toString(op->var)); + varNames.insert({op->var, varName}); + op->var.accept(this); + parentPrecedence = Precedence::TOP; + stream2 << " = "; + op->rhs.accept(this); + stream2 << ";"; + stream2 << endl; + } + else { + doIndent(); + stream << keywordString(util::toString(op->var.type())); + taco_iassert(isa(op->var)); + if (to(op->var)->is_ptr) { + stream << "* restrict"; + } + stream << " "; + string varName = varNameGenerator.getUniqueName(util::toString(op->var)); + varNames.insert({op->var, varName}); + op->var.accept(this); + parentPrecedence = Precedence::TOP; + stream << " = "; + op->rhs.accept(this); + stream << ";"; + stream << endl; + } + } void IRPrinter::visit(const Assign* op) { - doIndent(); - op->lhs.accept(this); - parentPrecedence = Precedence::TOP; - bool printed = false; - if (simplify) { - if (isa(op->rhs)) { - auto add = to(op->rhs); - if (add->a == op->lhs) { - const Literal* lit = add->b.as(); - if (lit != nullptr && ((lit->type.isInt() && lit->equalsScalar(1)) || - (lit->type.isUInt() && lit->equalsScalar(1)))) { - stream << "++"; + if (is_ISPC_code_stream_enabled()) { + + } + + + + else { + doIndent(); + op->lhs.accept(this); + parentPrecedence = Precedence::TOP; + bool printed = false; + if (simplify) { + if (isa(op->rhs)) { + auto add = to(op->rhs); + if (add->a == op->lhs) { + const Literal* lit = add->b.as(); + if (lit != nullptr && ((lit->type.isInt() && lit->equalsScalar(1)) || + (lit->type.isUInt() && lit->equalsScalar(1)))) { + stream << "++"; + } + else { + stream << " += "; + add->b.accept(this); + } + printed = true; } - else { - stream << " += "; - add->b.accept(this); + } + else if (isa(op->rhs)) { + auto mul = to(op->rhs); + if (mul->a == op->lhs) { + stream << " *= "; + mul->b.accept(this); + printed = true; } - printed = true; } - } - else if (isa(op->rhs)) { - auto mul = to(op->rhs); - if (mul->a == op->lhs) { - stream << " *= "; - mul->b.accept(this); - printed = true; + else if (isa(op->rhs)) { + auto bitOr = to(op->rhs); + if (bitOr->a == op->lhs) { + stream << " |= "; + bitOr->b.accept(this); + printed = true; + } } } - else if (isa(op->rhs)) { - auto bitOr = to(op->rhs); - if (bitOr->a == op->lhs) { - stream << " |= "; - bitOr->b.accept(this); - printed = true; - } + if (!printed) { + stream << " = "; + op->rhs.accept(this); } - } - if (!printed) { - stream << " = "; - op->rhs.accept(this); + + stream << ";"; + stream << endl; } - stream << ";"; - stream << endl; } void IRPrinter::visit(const Yield* op) { @@ -544,12 +837,22 @@ void IRPrinter::visit(const Allocate* op) { } void IRPrinter::visit(const Free* op) { - doIndent(); - stream << "free("; - parentPrecedence = Precedence::TOP; - op->var.accept(this); - stream << ");"; - stream << endl; + if (is_ISPC_code_stream_enabled()) { + doIndent(); + stream2 << "delete[] "; + parentPrecedence = Precedence::TOP; + op->var.accept(this); + stream2 << ";"; + stream2 << endl; + } + else { + doIndent(); + stream << "free("; + parentPrecedence = Precedence::TOP; + op->var.accept(this); + stream << ");"; + stream << endl; + } } void IRPrinter::visit(const Comment* op) { @@ -559,17 +862,32 @@ void IRPrinter::visit(const Comment* op) { } void IRPrinter::visit(const BlankLine*) { - stream << endl; + if (is_ISPC_code_stream_enabled()) { + stream2 << endl; + } + else { + stream << endl; + } } void IRPrinter::visit(const Continue*) { doIndent(); - stream << "continue;" << endl; + if (!is_ISPC_code_stream_enabled()) { + stream << "continue;" << endl; + } + else { + stream2 << "continue;" << endl; + } } void IRPrinter::visit(const Break*) { doIndent(); - stream << "break;" << endl; + if (!is_ISPC_code_stream_enabled()) { + stream << "break;" << endl; + } + else { + stream2 << "break;" << endl; + } } void IRPrinter::visit(const Print* op) { @@ -585,7 +903,12 @@ void IRPrinter::visit(const Print* op) { } void IRPrinter::visit(const GetProperty* op) { - stream << op->name; + if (is_ISPC_code_stream_enabled()) { + stream2 << op->name; + } + else { + stream << op->name; + } } void IRPrinter::visit(const Sort* op) { @@ -643,23 +966,47 @@ void IRPrinter::resetNameCounters() { } void IRPrinter::doIndent() { - for (int i=0; ivar); Expr start = rewrite(op->start); Expr end = rewrite(op->end); diff --git a/src/ir_tags.cpp b/src/ir_tags.cpp index af3dbd775..e7365d6c2 100644 --- a/src/ir_tags.cpp +++ b/src/ir_tags.cpp @@ -2,7 +2,7 @@ namespace taco { -const char *ParallelUnit_NAMES[] = {"NotParallel", "DefaultUnit", "GPUBlock", "GPUWarp", "GPUThread", "CPUThread", "CPUVector", "CPUThreadGroupReduction", "GPUBlockReduction", "GPUWarpReduction"}; +const char *ParallelUnit_NAMES[] = {"NotParallel", "DefaultUnit", "GPUBlock", "GPUWarp", "GPUThread", "CPUThread", "CPUVector", "CPUThreadGroupReduction", "GPUBlockReduction", "GPUWarpReduction", "CPUSimd", "CPUSpmd"}; const char *OutputRaceStrategy_NAMES[] = {"IgnoreRaces", "NoRaces", "Atomics", "Temporary", "ParallelReduction"}; const char *BoundType_NAMES[] = {"MinExact", "MinConstraint", "MaxExact", "MaxConstraint"}; const char *AssembleStrategy_NAMES[] = {"Append", "Insert"}; diff --git a/src/lower/iteration_graph.cpp b/src/lower/iteration_graph.cpp index 77735a8d2..482d84aae 100644 --- a/src/lower/iteration_graph.cpp +++ b/src/lower/iteration_graph.cpp @@ -48,6 +48,8 @@ struct IterationGraph::Content { IterationGraph::IterationGraph() { } +// remember that iteration graph does not have an ordering +// I got the ordering from topologically reorder index Ryan wrote IterationGraph IterationGraph::make(Assignment assignment) { TensorVar tensor = assignment.getLhs().getTensorVar(); IndexExpr expr = assignment.getRhs(); @@ -64,8 +66,16 @@ IterationGraph IterationGraph::make(Assignment assignment) { oldToSplitVar.insert({indexVar, indexVar}); } + // access nodes of right hand side match(expr, function([&](const AccessNode* op) { + std::cout << "access node: " << op->tensorVar << " <- " << IndexExpr(op) << std::endl; + std::cout << "index var: "; + for (auto indexVar : op->indexVars) { + std::cout << indexVar << " "; + } + std::cout << std::endl; + auto type = op->tensorVar.getType(); taco_iassert((size_t)type.getShape().getOrder() == op->indexVars.size()) << "Tensor access " << IndexExpr(op) << " but tensor format only has " diff --git a/src/lower/iterator.cpp b/src/lower/iterator.cpp index 0f0c024c5..eb3d8ac3b 100644 --- a/src/lower/iterator.cpp +++ b/src/lower/iterator.cpp @@ -569,6 +569,9 @@ void Iterators::createAccessIterators(Access access, Format format, Expr tensorI ProvenanceGraph provGraph, const map &tensorVars) { TensorVar tensorConcrete = access.getTensorVar(); + cout << "tensor: " << tensorConcrete << " " ; + cout << "tensorConcrete order: " << tensorConcrete.getOrder(); + cout << ", format order: " << format.getOrder() << endl; taco_iassert(tensorConcrete.getOrder() == format.getOrder()) << tensorConcrete << ", Format" << format; Shape shape = tensorConcrete.getType().getShape(); diff --git a/src/lower/lowerer_impl_imperative.cpp b/src/lower/lowerer_impl_imperative.cpp index b4c9ea710..cce8f2166 100644 --- a/src/lower/lowerer_impl_imperative.cpp +++ b/src/lower/lowerer_impl_imperative.cpp @@ -1,4 +1,6 @@ #include +#include "taco/cuda.h" +#include "taco/ir_tags.h" #include "taco/lower/lowerer_impl_imperative.h" #include "taco/lower/lowerer_impl.h" @@ -26,6 +28,7 @@ class LowererImplImperative::Visitor : public IndexNotationVisitorStrict { public: Visitor(LowererImplImperative* impl) : impl(impl) {} Stmt lower(IndexStmt stmt) { + // std::cout << "lowering IndexStmt to ir:Stmt - IndexStmt: " << stmt << std::endl; this->stmt = Stmt(); impl->accessibleIterators.scope(); IndexStmtVisitorStrict::visit(stmt); @@ -200,6 +203,7 @@ static std::set hasSparseInserts(IndexStmt stmt, Iterators iterators, return ret; } + Stmt LowererImplImperative::lower(IndexStmt stmt, string name, bool assemble, bool compute, bool pack, bool unpack) @@ -414,6 +418,7 @@ LowererImplImperative::lower(IndexStmt stmt, string name, Stmt LowererImplImperative::lowerAssignment(Assignment assignment) { + // std::cout << "\n\n converting assignment IndexStmt============================================ Assignment\n"; taco_iassert(generateAssembleCode() || generateComputeCode()); Stmt computeStmt; @@ -421,7 +426,7 @@ Stmt LowererImplImperative::lowerAssignment(Assignment assignment) Expr var = getTensorVar(result); const bool needComputeAssign = util::contains(needCompute, result); - + // std::cout << "does assignment need compute assign: " << needComputeAssign << std::endl; Expr rhs; if (needComputeAssign) { rhs = lower(assignment.getRhs()); @@ -429,20 +434,51 @@ Stmt LowererImplImperative::lowerAssignment(Assignment assignment) // Assignment to scalar variables. if (isScalar(result.getType())) { + // std::cout << "assignment to scalar variables\n"; if (needComputeAssign) { + // std::cout << "compute assign\n"; if (!assignment.getOperator().defined()) { + // std::cout << "assignment operator is not defined\n"; + // std::cout << "var: " << var << ", rhs, : " << rhs << std::endl; computeStmt = Assign::make(var, rhs); } else { taco_iassert(isa(assignment.getOperator())); - bool useAtomics = markAssignsAtomicDepth > 0 && - !util::contains(whereTemps, result); + + // std::cout << "assignment depth -- loopDepth: " << loopDepth << std::endl; + // std::cout << "is markAssignsAtomicDepth > 0: " << (markAssignsAtomicDepth > 0) << std::endl; + // for (auto &tensors_ : whereTemps) { + // // std::cout << tensors_ << ", "; + // } + // std::cout << std::endl; + // std::cout << result << std::endl; + int tempVarInitLoopDepth = whereTempsWithLoopDepth.find(result)->second; + // std::cout << "tempInitLoopDepth: " << tempVarInitLoopDepth << std::endl; + + bool reduction = false; + std::map::iterator itr; + for (itr = forUnits.begin(); itr!=forUnits.end(); ++itr) { + if (itr->first<=loopDepth && itr->first>tempVarInitLoopDepth && itr->second == ParallelUnit::CPUSimd) { + reduction = true; + } + // std::cout << itr->first << "\t" << ParallelUnit_NAMES[(int) itr->second] << std::endl; + } + + // less than or equal to loopDepth but greater than temp variable initialized loop depth + bool useAtomics = markAssignsAtomicDepth > 0 && (!util::contains(whereTemps, result) || reduction); + // std::cout << "whereTemps and result: " << !util::contains(whereTemps, result) << std::endl; + // std::cout << "assignment to scalar variables useAtomics: " << useAtomics << std::endl; computeStmt = compoundAssign(var, rhs, useAtomics, atomicParallelUnit); + // std::cout << "computeStatment: " << computeStmt << std::endl; } } + else { + // std::cout << "not compute assign\n"; + } } // Assignments to tensor variables (non-scalar). else { + // std::cout << "assignment to tensor variables\n"; Expr values = getValuesArray(result); Expr loc = generateValueLocExpr(assignment.getLhs()); @@ -476,6 +512,7 @@ Stmt LowererImplImperative::lowerAssignment(Assignment assignment) } if (needComputeAssign && values.defined()) { + // std::cout << "assign compute statement\n"; if (!assignment.getOperator().defined()) { computeStmt = Store::make(values, loc, rhs); } @@ -586,19 +623,39 @@ LowererImplImperative::splitAppenderAndInserters(const vector& results } +// important function +/* +* This is the for loop lowering part +*/ + Stmt LowererImplImperative::lowerForall(Forall forall) { + loopDepth++; + forUnits.insert(std::pair(loopDepth,forall.getParallelUnit())); + // std::cout << "doing lowerForall: " << forall << std::endl; bool hasExactBound = provGraph.hasExactBound(forall.getIndexVar()); bool forallNeedsUnderivedGuards = !hasExactBound && emitUnderivedGuards; + + + // std::cout << "printing temporary variables with their atomic depths\n"; + map::iterator itr; + for (itr = whereTempsWithLoopDepth.begin(); itr != whereTempsWithLoopDepth.end(); ++itr) { + // std::cout << itr->first << "\t" << itr->second << "\n"; + } + + if (!ignoreVectorize && forallNeedsUnderivedGuards && (forall.getParallelUnit() == ParallelUnit::CPUVector || forall.getUnrollFactor() > 0)) { + // std::cout << "calling lowerForallCloned(forall)\n"; return lowerForallCloned(forall); } + // std::cout << "inParallelLoopDepth: " << inParallelLoopDepth << "========================\n"; if (forall.getParallelUnit() != ParallelUnit::NotParallel) { inParallelLoopDepth++; } + // std::cout << "inParallelLoopDepth: " << inParallelLoopDepth << "========================\n"; // Recover any available parents that were not recoverable previously vector recoverySteps; @@ -786,19 +843,23 @@ Stmt LowererImplImperative::lowerForall(Forall forall) } if (!isWhereProducer && hasPosDescendant && underivedAncestors.size() > 1 && provGraph.isPosVariable(iterator.getIndexVar()) && posDescendant == forall.getIndexVar()) { + // std::cout << "calling lowerForallFusedPosition(forall\n"; loops = lowerForallFusedPosition(forall, iterator, locators, inserters, appenders, reducedAccesses, recoveryStmt); } else if (canAccelWithSparseIteration) { + // std::cout << "calling lowerForallDenseAcceleration(forall\n"; loops = lowerForallDenseAcceleration(forall, locators, inserters, appenders, reducedAccesses, recoveryStmt); } // Emit dimension coordinate iteration loop else if (iterator.isDimensionIterator()) { + // std::cout << "calling lowerForallDimension(forall\n"; loops = lowerForallDimension(forall, point.locators(), inserters, appenders, reducedAccesses, recoveryStmt); } // Emit position iteration loop else if (iterator.hasPosIter()) { + // std::cout << "calling lowerForallPosition(forall\n"; loops = lowerForallPosition(forall, iterator, locators, inserters, appenders, reducedAccesses, recoveryStmt); } @@ -816,6 +877,10 @@ Stmt LowererImplImperative::lowerForall(Forall forall) loops = lowerMergeLattice(lattice, underivedAncestors[0], forall.getStmt(), reducedAccesses); } + + // std::cout << "printing loops ----------------------------------------------------------------------------------------------\n"; + // std::cout << loops << std::endl; + // std::cout << "loops printed -----------------------------------------------------------------------------------------------\n"; // taco_iassert(loops.defined()); if (!generateComputeCode() && !hasStores(loops)) { @@ -832,6 +897,9 @@ Stmt LowererImplImperative::lowerForall(Forall forall) parallelUnitIndexVars.erase(forall.getParallelUnit()); parallelUnitSizes.erase(forall.getParallelUnit()); } + + forUnits.erase(loopDepth); + loopDepth--; return Block::blanks(preInitValues, temporaryValuesInitFree[0], loops, @@ -1136,13 +1204,22 @@ Stmt LowererImplImperative::lowerForallDimension(Forall forall, set reducedAccesses, ir::Stmt recoveryStmt) { + // std::cout << "1 Stmt LowererImplImperative::lowerForallDimension\n"; + // std::cout << "1 Stmt LowererImplImperative::lowerForallDimension markAssignsAtomicDepth: " << markAssignsAtomicDepth << std::endl; Expr coordinate = getCoordinateVar(forall.getIndexVar()); if (forall.getParallelUnit() != ParallelUnit::NotParallel && forall.getOutputRaceStrategy() == OutputRaceStrategy::Atomics) { markAssignsAtomicDepth++; + // std::cout << "1 Stmt LowererImplImperative::lowerForallDimension getParallelUnit() is Not NotParallel and outputRaceStrategy is Atomics\n"; + // std::cout << "markAssignsAtomicDepth: " << markAssignsAtomicDepth << std::endl; atomicParallelUnit = forall.getParallelUnit(); } + else { + // std::cout << "1 Stmt LowererImplImperative::lowerForallDimension getParallelUnit() is NotParallel or outputRaceStrategy is not Atomics\n"; + } + // std::cout << "original forall : " << forall << std::endl; + // std::cout << "inside IndexStmt: " << forall.getStmt() << std::endl; Stmt body = lowerForallBody(coordinate, forall.getStmt(), locators, inserters, appenders, reducedAccesses); @@ -1158,7 +1235,18 @@ Stmt LowererImplImperative::lowerForallDimension(Forall forall, std::vector bounds = provGraph.deriveIterBounds(forall.getIndexVar(), definedIndexVarsOrdered, underivedBounds, indexVarToExprMap, iterators); LoopKind kind = LoopKind::Serial; - if (forall.getParallelUnit() == ParallelUnit::CPUVector && !ignoreVectorize) { + if (should_use_ISPC_codegen()) { + // std::cout << "Foreach compatible loop\n"; + if (forall.getParallelUnit() == ParallelUnit::CPUSimd) { + kind = LoopKind::Foreach; + } + else if (forall.getParallelUnit() == ParallelUnit::CPUSpmd + && forall.getOutputRaceStrategy() != OutputRaceStrategy::ParallelReduction + ) { + kind = LoopKind::Mul_Thread; + } + } + else if (forall.getParallelUnit() == ParallelUnit::CPUVector && !ignoreVectorize) { kind = LoopKind::Vectorized; } else if (forall.getParallelUnit() != ParallelUnit::NotParallel @@ -1166,6 +1254,7 @@ Stmt LowererImplImperative::lowerForallDimension(Forall forall, kind = LoopKind::Runtime; } + // std::cout << "2 Stmt LowererImplImperative::lowerForallDimension\n"; return Block::blanks(For::make(coordinate, bounds[0], bounds[1], 1, body, kind, ignoreVectorize ? ParallelUnit::NotParallel : forall.getParallelUnit(), ignoreVectorize ? 0 : forall.getUnrollFactor()), @@ -1179,6 +1268,7 @@ Stmt LowererImplImperative::lowerForallDimension(Forall forall, set reducedAccesses, ir::Stmt recoveryStmt) { + // std::cout << "1 Stmt LowererImplImperative::lowerForallDenseAcceleration\n"; taco_iassert(locators.size() == 1) << "Optimizing a dense workspace is only supported when the consumer is the only RHS tensor"; taco_iassert(provGraph.isFullyDerived(forall.getIndexVar())) << "Sparsely accelerating a dense workspace only works with fully derived index vars"; taco_iassert(forall.getParallelUnit() == ParallelUnit::NotParallel) << "Sparsely accelerating a dense workspace only works within serial loops"; @@ -1204,6 +1294,8 @@ Stmt LowererImplImperative::lowerForallDimension(Forall forall, } Stmt declareVar = VarDecl::make(coordinate, Load::make(indexList, loopVar)); + // std::cout << "original forall : " << forall << std::endl; + // std::cout << "inside IndexStmt: " << forall.getStmt() << std::endl; Stmt body = lowerForallBody(coordinate, forall.getStmt(), locators, inserters, appenders, reducedAccesses); Stmt resetGuard = ir::Store::make(bitGuard, coordinate, ir::Literal::make(false), markAssignsAtomicDepth > 0, atomicParallelUnit); @@ -1216,7 +1308,12 @@ Stmt LowererImplImperative::lowerForallDimension(Forall forall, Stmt posAppend = generateAppendPositions(appenders); LoopKind kind = LoopKind::Serial; - if (forall.getParallelUnit() == ParallelUnit::CPUVector && !ignoreVectorize) { + if (should_use_ISPC_codegen()) { + if (forall.getParallelUnit() == ParallelUnit::CPUSimd) { + kind = LoopKind::Foreach; + } + } + else if (forall.getParallelUnit() == ParallelUnit::CPUVector && !ignoreVectorize) { kind = LoopKind::Vectorized; } else if (forall.getParallelUnit() != ParallelUnit::NotParallel @@ -1224,6 +1321,7 @@ Stmt LowererImplImperative::lowerForallDimension(Forall forall, kind = LoopKind::Runtime; } + // std::cout << "2 Stmt LowererImplImperative::lowerForallDenseAcceleration\n"; return Block::blanks(For::make(loopVar, 0, indexListSize, 1, body, kind, ignoreVectorize ? ParallelUnit::NotParallel : forall.getParallelUnit(), ignoreVectorize ? 0 : forall.getUnrollFactor()), @@ -1247,6 +1345,8 @@ Stmt LowererImplImperative::lowerForallPosition(Forall forall, Iterator iterator set reducedAccesses, ir::Stmt recoveryStmt) { + // std::cout << "1 Stmt LowererImplImperative::lowerForallPosition\n" << std::endl; + Expr coordinate = getCoordinateVar(forall.getIndexVar()); Stmt declareCoordinate = Stmt(); Stmt strideGuard = Stmt(); @@ -1278,6 +1378,11 @@ Stmt LowererImplImperative::lowerForallPosition(Forall forall, Iterator iterator markAssignsAtomicDepth++; } + // see we are inside a forall. ex: forall(i, forall(j, y(i) += A(i,j) * x(j))) + // when you call forall.getStmt it returns forall(j, y(i) += A(i,j) * x(j)) which is the + // IndexStmt inside the forall IndexStmt + // std::cout << "original forall : " << forall << std::endl; + // std::cout << "inside IndexStmt: " << forall.getStmt() << std::endl; Stmt body = lowerForallBody(coordinate, forall.getStmt(), locators, inserters, appenders, reducedAccesses); @@ -1339,6 +1444,7 @@ Stmt LowererImplImperative::lowerForallPosition(Forall forall, Iterator iterator kind = LoopKind::Runtime; } + // std::cout << "2 Stmt LowererImplImperative::lowerForallPosition\n" << std::endl; // Loop with preamble and postamble return Block::blanks( boundsCompute, @@ -1357,6 +1463,7 @@ Stmt LowererImplImperative::lowerForallFusedPosition(Forall forall, Iterator ite set reducedAccesses, ir::Stmt recoveryStmt) { + // std::cout << "1 Stmt LowererImplImperative::lowerForallFusedPosition\n" << std::endl; Expr coordinate = getCoordinateVar(forall.getIndexVar()); Stmt declareCoordinate = Stmt(); if (provGraph.isCoordVariable(forall.getIndexVar())) { @@ -1447,6 +1554,8 @@ Stmt LowererImplImperative::lowerForallFusedPosition(Forall forall, Iterator ite markAssignsAtomicDepth++; } + // std::cout << "original forall : " << forall << std::endl; + // std::cout << "inside IndexStmt: " << forall.getStmt() << std::endl; Stmt body = lowerForallBody(coordinate, forall.getStmt(), locators, inserters, appenders, reducedAccesses); @@ -1503,6 +1612,8 @@ Stmt LowererImplImperative::lowerForallFusedPosition(Forall forall, Iterator ite && forall.getOutputRaceStrategy() != OutputRaceStrategy::ParallelReduction && !ignoreVectorize) { kind = LoopKind::Runtime; } + + // std::cout << "2 Stmt LowererImplImperative::lowerForallFusedPosition\n" << std::endl; // Loop with preamble and postamble return Block::blanks(boundsCompute, Block::make(Block::make(searchForUnderivedStart), @@ -1765,6 +1876,9 @@ Stmt LowererImplImperative::lowerForallBody(Expr coordinate, IndexStmt stmt, vector inserters, vector appenders, const set& reducedAccesses) { + + // std::cout << "lowering a forall body----------------------------------------------------\n"; + Stmt initVals = resizeAndInitValues(appenders, reducedAccesses); // Inserter positions @@ -1780,6 +1894,7 @@ Stmt LowererImplImperative::lowerForallBody(Expr coordinate, IndexStmt stmt, // Code of loop body statement Stmt body = lower(stmt); + // std::cout << "\nBefore: [" << stmt << "]\nAfter : [" << body << "]\n"; // Code to append coordinates Stmt appendCoords = appendCoordinate(appenders, coordinate); @@ -1889,6 +2004,7 @@ vector LowererImplImperative::codeToInitializeDenseAcceleratorArrays(Where Expr p = Var::make("p" + temporary.getName(), Int()); Stmt guardZeroInit = Store::make(alreadySetArr, p, ir::Literal::zero(bitGuardType)); + // std::cout << "vector LowererImplImperative::codeToInitializeDenseAcceleratorArrays\n" << std::endl; Stmt zeroInitLoop = For::make(p, 0, bitGuardSize, 1, guardZeroInit, LoopKind::Serial); Stmt inits = Block::make(alreadySetDecl, indexListDecl, allocateAlreadySet, allocateIndexList, zeroInitLoop); return {inits, freeTemps}; @@ -2144,6 +2260,7 @@ vector LowererImplImperative::codeToInitializeTemporary(Where where) { } Stmt LowererImplImperative::lowerWhere(Where where) { + // std::cout << "\n--------------------------------------- lowering where statement: " << where << "\n\n\n"; TensorVar temporary = where.getTemporary(); bool accelerateDenseWorkSpace, sortAccelerator; std::tie(accelerateDenseWorkSpace, sortAccelerator) = @@ -2180,6 +2297,7 @@ Stmt LowererImplImperative::lowerWhere(Where where) { }) ); + // std::cout << "\ninitiating lowering of where consumer: " << where.getConsumer() << std::endl; Stmt consumer = lower(where.getConsumer()); if (accelerateDenseWorkSpace && sortAccelerator) { // We need to sort the indices array @@ -2203,11 +2321,13 @@ Stmt LowererImplImperative::lowerWhere(Where where) { true, false); Expr size = getTemporarySize(where); Stmt zeroInit = Store::make(values, p, ir::Literal::zero(temporary.getType().getDataType())); + // std::cout << "Stmt LowererImplImperative::lowerWhere\n"; Stmt loopInit = For::make(p, 0, size, 1, zeroInit, LoopKind::Serial); initializeTemporary = Block::make(initializeTemporary, loopInit); } whereConsumers.push_back(consumer); + // std::cout << "\nwhere temporaries: " << where.getTemporary() << std::endl; whereTemps.push_back(where.getTemporary()); captureNextLocatePos = true; @@ -2218,6 +2338,9 @@ Stmt LowererImplImperative::lowerWhere(Where where) { restoreAtomicDepth = true; } + whereTempsWithLoopDepth.insert(std::pair(where.getTemporary(), loopDepth)); + + // std::cout << "\ninitiating lowering of where producer: " << where.getConsumer() << std::endl; Stmt producer = lower(where.getProducer()); if (accelerateDenseWorkSpace) { const Expr indexListSizeExpr = tempToIndexListSize.at(temporary); @@ -2225,6 +2348,8 @@ Stmt LowererImplImperative::lowerWhere(Where where) { initializeTemporary = Block::make(indexListSizeDecl, initializeTemporary); } + whereTempsWithLoopDepth.erase(where.getTemporary()); + if (restoreAtomicDepth) { markAssignsAtomicDepth++; } @@ -2334,6 +2459,7 @@ Stmt LowererImplImperative::lowerAssemble(Assemble assemble) { resultModeOrdering[iter.getMode().getLevel() - 1]); Expr pos = iter.getPosVar(); Stmt initPos = VarDecl::make(pos, iter.locate(locateCoords)[0]); + // std::cout << "Stmt LowererImplImperative::lowerAssemble\n"; insertEdgeLoop = For::make(coords.back(), 0, dim, 1, Block::make(initPos, insertEdgeLoop)); } else { @@ -2371,7 +2497,7 @@ Stmt LowererImplImperative::lowerAssemble(Assemble assemble) { initAssembleStmts.push_back(initValues); } } else if (zeroInit) { - initAssembleStmts.push_back(zeroInitValues(resultTensorVar, 0, prevSize)); + initAssembleStmts.push_back(zeroInitValues(resultTensorVar, 0, prevSize)); // init values } } Stmt initAssemble = Block::make(initAssembleStmts); @@ -2415,6 +2541,7 @@ Stmt LowererImplImperative::lowerMulti(Multi multi) { } Stmt LowererImplImperative::lowerSuchThat(SuchThat suchThat) { + // std::cout << "lowering such that statement\n"; Stmt stmt = lower(suchThat.getStmt()); return Block::make(stmt); } @@ -2744,7 +2871,7 @@ Stmt LowererImplImperative::initResultArrays(vector writes, // iteration of all the iterators is not full. We can check this by seeing if we can recover a // full iterator from our set of iterators. Expr size = generateAssembleCode() ? getCapacityVar(tensor) : parentSize; - result.push_back(zeroInitValues(tensor, 0, size)); + result.push_back(zeroInitValues(tensor, 0, size)); // init values } } return result.empty() ? Stmt() : Block::blanks(result); @@ -2895,7 +3022,7 @@ Stmt LowererImplImperative::initResultArrays(IndexVar var, vector writes util::contains(reducedAccesses, write)) { // Zero-initialize values array if might not assign to every element // in values array during compute - result.push_back(zeroInitValues(tensor, resultParentPos, stride)); + result.push_back(zeroInitValues(tensor, resultParentPos, stride)); // init values } } } @@ -2942,6 +3069,7 @@ Stmt LowererImplImperative::resizeAndInitValues(const std::vector& app Stmt LowererImplImperative::zeroInitValues(Expr tensor, Expr begin, Expr size) { + // std::cout << "1 Stmt LowererImplImperative::zeroInitValues\n"; Expr lower = simplify(ir::Mul::make(begin, size)); Expr upper = simplify(ir::Mul::make(ir::Add::make(begin, 1), size)); Expr p = Var::make("p" + util::toString(tensor), Int()); @@ -2954,6 +3082,11 @@ Stmt LowererImplImperative::zeroInitValues(Expr tensor, Expr begin, Expr size) { return ir::VarDecl::make(ir::Var::make("status", Int()), ir::Call::make("cudaMemset", {values, ir::Literal::make(0, Int()), ir::Mul::make(ir::Sub::make(upper, lower), ir::Literal::make(values.type().getNumBytes()))}, Int())); } + // std::cout << "2 Stmt LowererImplImperative::zeroInitValues\n"; + // if generating ispc code, we will keep the LoopKind as Init so that we can initializa it if tasks are used + if (should_use_ISPC_codegen()) { + return For::make(p, lower, upper, 1, zeroInit, LoopKind::Init); + } return For::make(p, lower, upper, 1, zeroInit, parallel); } diff --git a/src/lower/tensor_path.h b/src/lower/tensor_path.h index 4f5dc49af..da52fb782 100644 --- a/src/lower/tensor_path.h +++ b/src/lower/tensor_path.h @@ -2,6 +2,7 @@ #define TACO_TENSOR_PATH_H #include +#include #include #include "taco/util/comparable.h" @@ -47,14 +48,13 @@ class TensorPath : public util::Comparable { friend bool operator==(const TensorPath&, const TensorPath&); friend bool operator<(const TensorPath&, const TensorPath&); + friend std::ostream& operator<<(std::ostream&, const TensorPath&); private: struct Content; std::shared_ptr content; }; -std::ostream& operator<<(std::ostream&, const TensorPath&); - /// A step along a tensor path. class TensorPathStep : public util::Comparable { diff --git a/src/tensor.cpp b/src/tensor.cpp index fab437ff1..1c95851c5 100644 --- a/src/tensor.cpp +++ b/src/tensor.cpp @@ -10,6 +10,7 @@ #include #include +#include "../test/util.h" #include "taco/cuda.h" #include "taco/format.h" #include "taco/taco_tensor_t.h" @@ -278,6 +279,7 @@ static size_t unpackTensorData(const taco_tensor_t& tensorData, /// Pack coordinates into a data structure given by the tensor format. void TensorBase::pack() { + std::cout << "TensorBase::Pack() method\n"; if (!needsPack()) { return; } @@ -346,6 +348,7 @@ void TensorBase::pack() { taco_iassert((content->coordinateBufferUsed % content->coordinateSize) == 0); const size_t numCoordinates = content->coordinateBufferUsed / content->coordinateSize; + std::cout << "call helperFuncs\n"; const auto helperFuncs = getHelperFunctions(getFormat(), getComponentType(), dimensions); @@ -619,10 +622,12 @@ void TensorBase::compile() { IndexStmt stmt = makeConcreteNotation(makeReductionNotation(assignment)); stmt = reorderLoopsTopologically(stmt); stmt = insertTemporaries(stmt); + std::cout << "calling parallelizeOuterLoop(stmt)\n"; stmt = parallelizeOuterLoop(stmt); compile(stmt, content->assembleWhileCompute); } void TensorBase::compile(taco::IndexStmt stmt, bool assembleWhileCompute) { + std::cout << "TensorBase::compile\n"; if (!needsCompile()) { return; } @@ -804,9 +809,9 @@ void TensorBase::assemble() { void TensorBase::compute() { taco_uassert(!needsCompile()) << error::compute_without_compile; - if (!needsCompute()) { - return; - } + // if (!needsCompute()) { + // return; + // } setNeedsCompute(false); // Sync operand tensors if needed. auto operands = getTensors(getAssignment().getRhs()); @@ -816,7 +821,12 @@ void TensorBase::compute() { } auto arguments = packArguments(*this); - this->content->module->callFuncPacked("compute", arguments.data()); + + taco::util::TimeResults timevalue; + bool time = true; + TOOL_BENCHMARK_TIMER(this->content->module->callFuncPacked("compute", arguments.data()), + "\n\nkernel execution time: ", timevalue); + // this->content->module->callFuncPacked("compute", arguments.data()); if (content->assembleWhileCompute) { setNeedsAssemble(false); @@ -934,6 +944,7 @@ TensorBase::getHelperFunctions(const Format& format, Datatype ctype, }; const auto dims = util::map(dimensions, getDim); + set_ISPC_code_stream_enabled(false); if (format.getOrder() > 0) { const Format bufferFormat = COO(format.getOrder(), false, true, false, format.getModeOrdering()); @@ -951,6 +962,7 @@ TensorBase::getHelperFunctions(const Format& format, Datatype ctype, } // Lower packing and iterator code. + std::cout << "1 Lower packing and iterator code\n"; helperModule->addFunction(lower(packStmt, "pack", true, true)); helperModule->addFunction(lower(iterateStmt, "iterate", false, true)); } else { @@ -964,12 +976,14 @@ TensorBase::getHelperFunctions(const Format& format, Datatype ctype, IndexVar indexVar; IndexStmt assignment = (packedScalar() = bufferVector(indexVar)); IndexStmt packStmt= makeConcreteNotation(makeReductionNotation(assignment)); + std::cout << "2 Lower packing and iterator code\n"; helperModule->addFunction(lower(packStmt, "pack", true, true)); // Define and lower iterator code. IndexStmt iterateStmt = Yield({}, packedScalar()); helperModule->addFunction(lower(iterateStmt, "iterate", false, true)); } + std::cout << "Compiling the helperModule\n"; helperModule->compile(); helperFunctionsMutex.lock(); diff --git a/taco-uml.wsd b/taco-uml.wsd new file mode 100644 index 000000000..4b8e39802 --- /dev/null +++ b/taco-uml.wsd @@ -0,0 +1,411 @@ +@startuml taco +scale 1 + + +class IntrusivePtr { + +T *ptr +} +class Uncopyable {} + +class IRNode { + +virtual void accept(IRVisitorStrict *v) const = 0 + +virtual IRNodeType type_info() const = 0; +} + +class BaseStmtNode {} +class BaseExprNode { + +Datatype type +} + +class StmtNode { + +void accept(IRVisitorStrict *v) const +} +class ExprNode { + +void accept(IRVisitorStrict *v) const +} + +Uncopyable <|-- IRNode +IRNode <|-- BaseStmtNode +IRNode <|-- BaseExprNode +BaseStmtNode <|-- StmtNode +BaseExprNode <|-- ExprNode + +class IRHandle { + +void accept(IRVisitorStrict *v) const +} +class Expr {} +class Stmt {} + +IntrusivePtr <|-- IRHandle +IRHandle <|-- Expr +IRHandle <|-- Stmt + +IRHandle "1" *-- "1" IRNode : contains + + + +' this class is abstract but plantuml version does not support interface keyword +interface IRVisitorStrict { + +virtual void visit(const IRNode*) const = 0 +} + +/' +IRVisitor is not an interface or abstract because it +has not pure virtual methods +'/ +class IRVisitor { + +virtual void visit(const IRNode*) +} + +class IRRewriter { + ' protected fields and methods + #Expr expr + #Stmt stmt + + #virtual void visit(const ExprNode* op) + #virtual void visit(const StmtNode* op) + + ' public fields and methods + +Expr rewrite(Expr) + +Stmt rewrite(Stmt) +} +class IRPrinter { + #std::ostream &stream + #std::ostream &stream2 + #int indent + #bool color + #bool simplify + #enum Precedence + #Precedence parentPrecedence = BOTTOM + #NameGenerator varNameGenerator + #scopedMap varNames + + #void doIndent() + #void printBinOp(Expr a, Expr b, std::string op, Precedence precedence) + #void fewMoreMethods() + + #virtual void visit(const ExprNode*) + #virtual void visit(const StmtNode*) + + +setColor(bool color) + +print(Stmt) +} +class IRVerifier {} + +IRVisitorStrict <|-- IRVisitor +IRVisitorStrict <|-- IRPrinter +IRVisitorStrict <|-- IRRewriter +IRVisitor <|-- IRVerifier + +' Inheritance from IRRewriter +' simplifier for ir::Expr +class ExpressionSimplifier {} +IRRewriter <|-- ExpressionSimplifier + +' simplifiers for ir::Stmt +class RemoveRedundantStatements {} +class RemoveRedundantLoops {} +class RemoveDuplicateBody {} + +IRRewriter <|-- RemoveRedundantStatements +IRRewriter <|-- RemoveRedundantLoops +IRRewriter <|-- RemoveDuplicateBody + + +' Inheritance from IRPrinter +class CodeGen {} +class CodeGen_C {} +class CodeGen_CUDA {} +class CodeGen_ISPC { + -class FindVars +} + +class FindVars {} + +IRPrinter <|-- CodeGen +CodeGen <|-- CodeGen_C +CodeGen <|-- CodeGen_ISPC +CodeGen <|-- CodeGen_CUDA + +IRVisitor <|-- FindVars +CodeGen_ISPC +-- FindVars + +class Manageable {} +class IndexStmtNode { + -virtual void accept(IndexStmtVisitorStrict*) const = 0 +} +class IndexExprNode { + -virtual void accept(IndexStmtVisitorStrict*) const = 0 +} + + +Manageable <|-- IndexStmtNode +Uncopyable <|-- IndexStmtNode +Manageable <|-- IndexExprNode +Uncopyable <|-- IndexExprNode + +class IndexStmt {} +class IndexExpr {} + +IntrusivePtr <|-- IndexStmt +IndexStmt "1" *-- "1" IndexStmtNode +IntrusivePtr <|-- IndexExpr +IndexExpr "1" *-- "1" IndexExprNode + + +abstract class IndexExprVisitorStrict { + +void visit(const IndexStmt&) + +virtual void visit(const AccessNode*) = 0 + +virtual void visit(const LiteralNode*) = 0 + +virtual void visit(const NegNode*) = 0 + +virtual void visit(const AddNode*) = 0 + +virtual void visit(const SubNode*) = 0 + +virtual void visit(const MulNode*) = 0 + +virtual void visit(const DivNode*) = 0 + +virtual void visit(const SqrtNode*) = 0 + +virtual void visit(const CastNode*) = 0 + +virtual void visit(const CallIntrinsicNode*) = 0 + +virtual void visit(const ReductionNode*) = 0 +} +abstract class IndexStmtVisitorStrict { + +void visit(const IndexStmt&) + +virtual void visit(const AssignmentNode*) = 0 + +virtual void visit(const YieldNode*) = 0 + +virtual void visit(const ForallNode*) = 0 + +virtual void visit(const WhereNode*) = 0 + +virtual void visit(const SequenceNode*) = 0 + +virtual void visit(const AssembleNode*) = 0 + +virtual void visit(const MultiNode*) = 0 + +virtual void visit(const SuchThatNode*) = 0 +} + +abstract class IndexNotationVisitorStrict {} +class IndexNotationPrinter { + +void print(const IndexExpr& expr) + +void print(const IndexStmt& expr) + + ' Index Expressions visit() + +void visit(const AccessNode* node) + +void visit(const LiteralNode* node) + + void visit(const NegNode* node) + + void visit(const AddNode* node) + + void visit(const SubNode* node) + + void visit(const MulNode* node) + + void visit(const DivNode* node) + + void visit(const SqrtNode* node) + + void visit(const CastNode* node) + + void visit(const CallIntrinsicNode* node) + + void visit(const UnaryExprNode* node) + + void visit(const BinaryExprNode* node) + + void visit(const ReductionNode* node) + + ' Index Statement visit() + + void visit(const AssignmentNode* node) + + void visit(const YieldNode* node) + + void visit(const ForallNode* node) + + void visit(const WhereNode* node) + + void visit(const SequenceNode* node) + + void visit(const AssembleNode* node) + + void visit(const MultiNode* node) + + void visit(const SuchThatNode* node) +} +class IndexNotationVisitor { + ' Index Expressions visit() + +virtual void visit(const AccessNode* node) + +virtual void visit(const LiteralNode* node) + +virtual void visit(const NegNode* node) + +virtual void visit(const AddNode* node) + +virtual void visit(const SubNode* node) + +virtual void visit(const MulNode* node) + +virtual void visit(const DivNode* node) + +virtual void visit(const SqrtNode* node) + +virtual void visit(const CastNode* node) + +virtual void visit(const CallIntrinsicNode* node) + +virtual void visit(const UnaryExprNode* node) + +virtual void visit(const BinaryExprNode* node) + +virtual void visit(const ReductionNode* node) + + ' Index Statement visit() + +virtual void visit(const AssignmentNode* node) + +virtual void visit(const YieldNode* node) + +virtual void visit(const ForallNode* node) + +virtual void visit(const WhereNode* node) + +virtual void visit(const SequenceNode* node) + +virtual void visit(const AssembleNode* node) + +virtual void visit(const MultiNode* node) + +virtual void visit(const SuchThatNode* node) +} +class Matcher { + +} + +abstract class IndexExprRewriterStrict { + +IndexExpr rewrite(IndexExpr) + + #IndexExpr expr + + #virtual void visit(const AccessNode* op) = 0 + #virtual void visit(const LiteralNode* op) = 0 + #virtual void visit(const NegNode* op) = 0 + #virtual void visit(const SqrtNode* op) = 0 + #virtual void visit(const AddNode* op) = 0 + #virtual void visit(const SubNode* op) = 0 + #virtual void visit(const MulNode* op) = 0 + #virtual void visit(const DivNode* op) = 0 + #virtual void visit(const CastNode* op) = 0 + #virtual void visit(const CallIntrinsicNode* op) = 0 + #virtual void visit(const ReductionNode* op) = 0 +} +abstract class IndexStmtRewriterStrict { + +IndexStmt rewrite(IndexStmt) + + #IndexStmt stmt + + #virtual void visit(const AssignmentNode* op) = 0 + #virtual void visit(const YieldNode* op) = 0 + #virtual void visit(const ForallNode* op) = 0 + #virtual void visit(const WhereNode* op) = 0 + #virtual void visit(const SequenceNode* op) = 0 + #virtual void visit(const AssembleNode* op) = 0 + #virtual void visit(const MultiNode* op) = 0 + #virtual void visit(const SuchThatNode* op) = 0 +} +abstract class IndexNotationRewriterStrict {} +class IndexNotationRewriter { + ' Index Expressions visit() + +virtual void visit(const AccessNode* node) + +virtual void visit(const LiteralNode* node) + +virtual void visit(const NegNode* node) + +virtual void visit(const AddNode* node) + +virtual void visit(const SubNode* node) + +virtual void visit(const MulNode* node) + +virtual void visit(const DivNode* node) + +virtual void visit(const SqrtNode* node) + +virtual void visit(const CastNode* node) + +virtual void visit(const CallIntrinsicNode* node) + +virtual void visit(const UnaryExprNode* node) + +virtual void visit(const BinaryExprNode* node) + +virtual void visit(const ReductionNode* node) + + ' Index Statement visit() + +virtual void visit(const AssignmentNode* node) + +virtual void visit(const YieldNode* node) + +virtual void visit(const ForallNode* node) + +virtual void visit(const WhereNode* node) + +virtual void visit(const SequenceNode* node) + +virtual void visit(const AssembleNode* node) + +virtual void visit(const MultiNode* node) + +virtual void visit(const SuchThatNode* node) +} + + +IndexExprVisitorStrict <|-- IndexNotationVisitorStrict +IndexStmtVisitorStrict <|-- IndexNotationVisitorStrict +IndexNotationVisitorStrict <|-- IndexNotationVisitor +IndexNotationVisitorStrict <|-- IndexNotationPrinter +IndexNotationVisitor <|-- Matcher + +IndexExprVisitorStrict <|-- IndexExprRewriterStrict +IndexStmtVisitorStrict <|-- IndexStmtRewriterStrict +IndexExprRewriterStrict <|-- IndexNotationRewriterStrict +IndexStmtRewriterStrict <|-- IndexNotationRewriterStrict + +IndexNotationRewriterStrict <|-- IndexNotationRewriter + +' - private +' # protected +' ~ package private +' + public + +' {static} +' {abstract} virtual methods + +' lowering part -- convertion from IndexExpr and IndexStmt to ir::Expr and ir::Stmt +class Lowerer { + +std::shared_ptr impl; +} +abstract class LowererImpl { + ' protected fields and methods + #class Visitor; + #friend class Visitor; + #std::shared_ptr visitor; + + #virtual ir::Stmt lower(IndexStmt stmt); + #virtual ir::Expr lower(IndexExpr expr); + + #virtual ir::Expr lowerExpr(IndexExpr expr) = 0; + #virtual ir::Stmt lowerStmt(IndexStmt stmt) = 0; + + ' public fields and methods + +virtual ir::Stmt lower(IndexStmt stmt, std::string name, + bool assemble, bool compute, bool pack, bool unpack) = 0; +} + +class LowererImplImperative { + ' private fields and methods + -class Visitor + -fiend class Visitor + -std::shared_ptr visitor + -bool assemble + -bool compute + -vars a_bunch_of_other_fields + + ' protected fields and methods + #virtual ir::Stmt lowerExpr(IndexExpr expr); + #virtual ir::Stmt lowerStmt(IndexStmt stmt); + + ' public fields and methods + +ir::Stmt lower(IndexStmt stmt, std::string name, + bool assemble, bool compute, bool pack, bool unpack) + +} +note bottom of LowererImplImperative : Stmt LowererImplImperative::lower(IndexStmt stmt) {\n return visitor->lower(stmt);\n} + +Uncopyable <|-- LowererImpl +Lowerer "1" *-- "1" LowererImpl : contains + + +' visitor that does the lowering +class Visitor { + ' private fields and methods + -LowererImpl* impl + -Expr expr + -Stmt stmt + + -void visit(const AssignmentNode* node) + -void visit(const YieldNode* node) + -void visit(const ForallNode* node) + -void visit(const WhereNode* node) + -void visit(const MultiNode* node) + -void visit(const SuchThatNode* node) + -void visit(const SequenceNode* node) + -void visit(const AssembleNode* node) + -void visit(const AccessNode* node) + -void visit(const LiteralNode* node) + -void visit(const NegNode* node) + -void visit(const AddNode* node) + -void visit(const SubNode* node) + -void visit(const MulNode* node) + -void visit(const DivNode* node) + -void visit(const SqrtNode* node) + -void visit(const CastNode* node) + -void visit(const CallIntrinsicNode* node) + -void visit(const ReductionNode* node) + + ' public fields and methods + +Visitor(LowererImplImperative* impl) + +Stmt lower(IndexStmt stmt) + +Expr lower(IndexExpr expr) +} + +note bottom of Visitor: Stmt lower(IndexStmt stmt) {\n this->stmt = Stmt();\n impl->accessibleIterators.scope();\n IndexStmtVisitorStrict::visit(stmt);\n impl->accessibleIterators.unscope();\n return this->stmt;\n} + +IndexNotationVisitorStrict <|-- Visitor +LowererImpl "1" +-- "1" Visitor : contains +Visitor "1" *-- "1" LowererImpl : contains + +LowererImpl <|-- LowererImplImperative +LowererImplImperative "1" +-- "1" Visitor : contains +Visitor "1" *-- "1" LowererImplImperative : contains + +@enduml \ No newline at end of file diff --git a/test/test.cpp b/test/test.cpp index a49f10ff7..851493b7f 100644 --- a/test/test.cpp +++ b/test/test.cpp @@ -38,6 +38,20 @@ void ASSERT_TENSOR_EQ(TensorBase expected, TensorBase actual) { ASSERT_TRUE(equals(expected, actual)); } +// void ASSERT_TENSOR_VAL(TensorBase expected, TensorBase actual) { +// std::cout << "order: " << expected.getOrder(); +// std::vector modes{}; +// for (int mode = 0; mode < expected.getOrder(); mode++) { +// if (expected.getDimension(mode) != actual.getDimension(mode)) { +// ASSERT_TRUE(false); +// } + +// for (int i=0; i expected, void ASSERT_STORAGE_EQ(TensorStorage expected, TensorStorage actual); void ASSERT_TENSOR_EQ(TensorBase expected, TensorBase actual); +// void ASSERT_TENSOR_VAL(TensorBase expected, TensorBase actual); template void ASSERT_COMPONENTS_EQUALS(vector>> expectedIndices, diff --git a/test/tests-indexstmt.cpp b/test/tests-indexstmt.cpp index e2a972430..123bea3e6 100644 --- a/test/tests-indexstmt.cpp +++ b/test/tests-indexstmt.cpp @@ -1,10 +1,13 @@ +#include "taco/index_notation/kernel.h" +#include "taco/type.h" #include "test.h" #include "test_tensors.h" #include "taco/tensor.h" #include "taco/index_notation/index_notation.h" +#include "taco/index_notation/transformations.h" using namespace taco; -const IndexVar i("i"), j("j"), k("k"); +const IndexVar i("i"), j("j"), k("k"), l("l"), m("m"); TEST(indexstmt, assignment) { Type t(type(), {3}); @@ -84,4 +87,193 @@ TEST(indexstmt, spmm) { } +TEST(indexstmt, sddmm) { + Type t(type(), {3,3}); + TensorVar A("A", t, {Sparse, Dense}); + TensorVar B("B", t, {Sparse, Dense}); + TensorVar C("C", t, {Dense, Dense}); + TensorVar w("w", Type(type(),{3}), Dense); + + // the below expression is the concrete index notation + // where (consumer, producer) + IndexStmt spmm = forall(i, + forall(k, + where(forall(j, A(i,j) = w(j)), + forall(j, w(j) += B(i,k)*C(k,j)) + ) + ) + ); + + // after adding scheduling transformations to this concrete-topologically sorted index stmt + // + + std::cout << spmm << std::endl; + spmm = reorderLoopsTopologically(spmm); + std::cout << "topologically reordered loops statement: " << spmm << std::endl; + + Kernel kernel = compile(spmm); + kernel.compute(); +} + +TEST(indexstmt, sddmmPlusSpmm) { + + // Y(i,l) = B(i,j)*C(i,k)*D(k,j) * F(j,l); + // indexstmt order i, j, k, l + //topologically reordered loops statement: forall(i, forall(k, forall(j, forall(l, Y(i,l) += B(i,j) * C(i,k) * D(k,j) * F(j,l), NotParallel, IgnoreRaces), NotParallel, IgnoreRaces), NotParallel, IgnoreRaces), NotParallel, IgnoreRaces) + + Type t(type(), {3,3}); + TensorVar Y("Y", t, {Dense, Dense}); + TensorVar B("B", t, {Dense, Sparse}); + TensorVar C("C", t, {Dense, Dense}); + TensorVar D("D", t, {Dense, Dense}); + TensorVar E("E", t, {Dense, Dense}); + + // TensorVar A("A", Type(type(),{3}), ); + TensorVar A("A", Type()); + + IndexStmt fused1 = + forall(i, + forall(j, + forall(k, + forall(l, Y(i,l) += B(i,j) * C(i,k) * D(j,k) * E(j,l)) + ) + ) + ); + + std::cout << "before topological sort" << fused1 << std::endl; + fused1 = reorderLoopsTopologically(fused1); + std::cout << "after topological sort" << fused1 << std::endl; + + Kernel kernel = compile(fused1); + + + IndexStmt fused2 = + forall(i, + forall(j, + where( + forall(l, Y(i,l) += A * E(j,l)), // consumer + forall(k, A += B(i,j)*C(i,k)*D(j,k)) // producer + ) + ) + ); + + Kernel kernel2 = compile(fused2); + +} + + + +TEST(indexstmt, mttkrpPlusSpmm) { + + // ./bin/taco "A(i,m)=B(i,k,l)*C(k,j)*D(l,j)*E(j,m)" -f=A:dd:0,1 -f=B:sss:0,1,2 -f=C:dd:0,1 -f=D:dd:0,1 -f=E:dd:0,1 + + // i = 11, k = 5, l = 7, j = 8; + long unsigned int idim = 11, kdim = 5, ldim = 7, jdim = 8, mdim = 6; + + Type atype(type(), {idim, mdim}); + Type btype(type(), {idim, kdim, ldim}); + Type ctype(type(), {kdim, jdim}); + Type dtype(type(), {ldim, jdim}); + Type etype(type(), {jdim, mdim}); + + TensorVar A("A", atype, {Dense, Dense}); + TensorVar B("B", btype, {Sparse, Sparse, Sparse}); + TensorVar C("C", ctype, {Dense, Dense}); + TensorVar D("D", dtype, {Dense, Dense}); + TensorVar E("E", etype, {Dense, Dense}); + + TensorVar ws("ws", Type(type(), {jdim}) ); + + IndexStmt fused1 = + forall(i, + forall(k, + forall(l, + forall(j, + forall(m, A(i,m) += B(i,k,l) * C(k,j) * D(l,j) * E(j,m)) + ) + ) + ) + ); + + std::cout << "before topological sort" << fused1 << std::endl; + fused1 = reorderLoopsTopologically(fused1); + std::cout << "after topological sort" << fused1 << std::endl; + + Kernel kernel = compile(fused1); + + IndexStmt fused2 = + forall(i, + where( + forall(j, + forall(m, + A(i,m) += ws(j) * E(j,m) + ) + ) + , + forall(k, + forall(l, + forall(j, + ws(j) += B(i,k,l) * C(k,j) * D(l,j) + ) + ) + ) + ) + ); + + Kernel kernel2 = compile(fused2); + +} + +// ./bin/taco "y(i)=A(i,j)*B(j,k)*v(k)" -f=y:d:0 -f=A:dd:0,1 -f=B:dd:0,1 -f=v:d:0 +TEST(indexstmt, mmPlusSpmv) { + + // + + long unsigned int idim = 11, jdim = 8, kdim = 5; + + Type ytype(type(), {idim}); + Type atype(type(), {idim, jdim}); + Type btype(type(), {jdim, kdim}); + Type vtype(type(), {kdim}); + + TensorVar y("y", ytype, {Dense}); + TensorVar A("A", atype, {Dense, Dense}); + TensorVar B("B", btype, {Dense, Dense}); + TensorVar v("v", vtype, {Dense}); + + TensorVar ws("ws", Type(type(), {jdim}) ); + + IndexStmt fused1 = + forall(i, + forall(j, + forall(k, + forall(m, y(i) += A(i,j) * B(j,k) * v(k)) + ) + ) + ); + + std::cout << "before topological sort" << fused1 << std::endl; + fused1 = reorderLoopsTopologically(fused1); + std::cout << "after topological sort" << fused1 << std::endl; + + Kernel kernel = compile(fused1); + + IndexStmt fused2 = + where( + forall(i, + forall(j, + y(i) += A(i,j) * ws(j) + ) + ) + , + forall(j, + forall(k, + ws(j) += B(j,k) * v(k) + ) + ) + ); + + Kernel kernel2 = compile(fused2); +} + diff --git a/test/tests-scheduling-eval.cpp b/test/tests-scheduling-eval.cpp index 52bd74ab4..29a7e512e 100644 --- a/test/tests-scheduling-eval.cpp +++ b/test/tests-scheduling-eval.cpp @@ -1,42 +1,8 @@ -#include -#include -#include -#include -#include "test.h" -#include "test_tensors.h" -#include "taco/tensor.h" -#include "taco/index_notation/index_notation.h" -#include "taco/index_notation/transformations.h" -#include "codegen/codegen.h" -#include "taco/lower/lower.h" - -using namespace taco; +#include "util.h" + const IndexVar i("i"), j("j"), k("k"), l("l"), m("m"), n("n"); int WARP_SIZE = 32; -void printToCout(IndexStmt stmt) { - std::shared_ptr codegen = ir::CodeGen::init_default(cout, ir::CodeGen::ImplementationGen); - ir::Stmt compute = lower(stmt, "compute", false, true); - codegen->compile(compute, true); -} - -void printToFile(string filename, IndexStmt stmt) { - stringstream source; - - string file_path = "eval_generated/"; - mkdir(file_path.c_str(), 0777); - - std::shared_ptr codegen = ir::CodeGen::init_default(source, ir::CodeGen::ImplementationGen); - ir::Stmt compute = lower(stmt, "compute", false, true); - codegen->compile(compute, true); - - ofstream source_file; - string file_ending = should_use_CUDA_codegen() ? ".cu" : ".c"; - source_file.open(file_path + filename + file_ending); - source_file << source.str(); - source_file.close(); -} - IndexStmt scheduleSpMVCPU(IndexStmt stmt, int CHUNK_SIZE=16) { IndexVar i0("i0"), i1("i1"), kpos("kpos"), kpos0("kpos0"), kpos1("kpos1"); return stmt.split(i, i0, i1, CHUNK_SIZE) @@ -44,6 +10,14 @@ IndexStmt scheduleSpMVCPU(IndexStmt stmt, int CHUNK_SIZE=16) { .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces); } +IndexStmt scheduleSpMVISPC(IndexStmt stmt, int CHUNK_SIZE=16) { + IndexVar i0("i0"), i1("i1"), kpos("kpos"), kpos0("kpos0"), kpos1("kpos1"); + // return stmt; + return stmt.split(i, i0, i1, CHUNK_SIZE) + .reorder({i0, i1, j}) + .parallelize(i0, ParallelUnit::CPUSimd, OutputRaceStrategy::NoRaces); +} + IndexStmt scheduleSpMMCPU(IndexStmt stmt, Tensor A, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { IndexVar i0("i0"), i1("i1"), kbounded("kbounded"), k0("k0"), k1("k1"), jpos("jpos"), jpos0("jpos0"), jpos1("jpos1"); return stmt.split(i, i0, i1, CHUNK_SIZE) @@ -54,6 +28,80 @@ IndexStmt scheduleSpMMCPU(IndexStmt stmt, Tensor A, int CHUNK_SIZE=16, i .parallelize(k, ParallelUnit::CPUVector, OutputRaceStrategy::IgnoreRaces); } +IndexStmt scheduleSpMMISPC1(IndexStmt stmt, Tensor A, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { + IndexVar i0("i0"), i1("i1"), kbounded("kbounded"), k0("k0"), k1("k1"), jpos("jpos"), jpos0("jpos0"), jpos1("jpos1"); + return stmt.split(i, i0, i1, CHUNK_SIZE) + .pos(j, jpos, A(i,j)) + .split(jpos, jpos0, jpos1, UNROLL_FACTOR) + .reorder({i0, i1, jpos0, k, jpos1}) + // .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces) + .parallelize(k, ParallelUnit::CPUSimd, OutputRaceStrategy::IgnoreRaces); +} + +IndexStmt scheduleSpMMISPCOMP1(IndexStmt stmt, Tensor A, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { + IndexVar i0("i0"), i1("i1"), kbounded("kbounded"), k0("k0"), k1("k1"), jpos("jpos"), jpos0("jpos0"), jpos1("jpos1"); + return stmt.split(i, i0, i1, CHUNK_SIZE) + .pos(j, jpos, A(i,j)) + .split(jpos, jpos0, jpos1, UNROLL_FACTOR) + .reorder({i0, i1, jpos0, k, jpos1}) + .parallelize(i0, ParallelUnit::CPUSpmd, OutputRaceStrategy::NoRaces) + .parallelize(k, ParallelUnit::CPUSimd, OutputRaceStrategy::IgnoreRaces); +} + +IndexStmt scheduleSpMMISPC1_2(IndexStmt stmt, Tensor A, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { + IndexVar i0("i0"), i1("i1"), kbounded("kbounded"), k0("k0"), k1("k1"), jpos("jpos"), jpos0("jpos0"), jpos1("jpos1"); + return stmt.split(i, i0, i1, CHUNK_SIZE) + .pos(j, jpos, A(i,j)) + .split(jpos, jpos0, jpos1, UNROLL_FACTOR) + .reorder({i0, i1, jpos0, k, jpos1}) + // .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces) + .parallelize(i0, ParallelUnit::CPUSimd, OutputRaceStrategy::IgnoreRaces); +} + +IndexStmt scheduleSpMMISPC1_3(IndexStmt stmt, Tensor A, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { + IndexVar i0("i0"), i1("i1"), kbounded("kbounded"), k0("k0"), k1("k1"), jpos("jpos"), jpos0("jpos0"), jpos1("jpos1"); + return stmt.split(i, i0, i1, CHUNK_SIZE) + .pos(j, jpos, A(i,j)) + .split(jpos, jpos0, jpos1, UNROLL_FACTOR) + .reorder({i0, i1, jpos0, k, jpos1}) + // .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces) + .parallelize(i1, ParallelUnit::CPUSimd, OutputRaceStrategy::IgnoreRaces); +} + +IndexStmt scheduleSpMMISPC2(IndexStmt stmt, Tensor A, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { + IndexVar i0("i0"), i1("i1"), kbounded("kbounded"), k0("k0"), k1("k1"), jpos("jpos"), jpos0("jpos0"), jpos1("jpos1"); + return stmt + .parallelize(k, ParallelUnit::CPUSimd, OutputRaceStrategy::IgnoreRaces); +} + +IndexStmt scheduleSpMMISPC2_2(IndexStmt stmt, Tensor A, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { + IndexVar i0("i0"), i1("i1"), kbounded("kbounded"), k0("k0"), k1("k1"), jpos("jpos"), jpos0("jpos0"), jpos1("jpos1"); + return stmt + .parallelize(i, ParallelUnit::CPUSimd, OutputRaceStrategy::IgnoreRaces); +} + +IndexStmt scheduleSpMMISPC3(IndexStmt stmt, Tensor A, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { + IndexVar i0("i0"), i1("i1"), kbounded("kbounded"), k0("k0"), k1("k1"), jpos("jpos"), jpos0("jpos0"), jpos1("jpos1"); + return stmt + // .split(i, i0, i1, CHUNK_SIZE) + // .pos(j, jpos, A(i,j)) + // .split(jpos, jpos0, jpos1, UNROLL_FACTOR) + .reorder({j, k}) + // .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces) + .parallelize(k, ParallelUnit::CPUSimd, OutputRaceStrategy::IgnoreRaces); +} + +IndexStmt scheduleSpMMISPC3_2(IndexStmt stmt, Tensor A, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { + IndexVar i0("i0"), i1("i1"), kbounded("kbounded"), k0("k0"), k1("k1"), jpos("jpos"), jpos0("jpos0"), jpos1("jpos1"); + return stmt + // .split(i, i0, i1, CHUNK_SIZE) + // .pos(j, jpos, A(i,j)) + // .split(jpos, jpos0, jpos1, UNROLL_FACTOR) + .reorder({j, k}) + // .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces) + .parallelize(i, ParallelUnit::CPUSimd, OutputRaceStrategy::IgnoreRaces); +} + IndexStmt scheduleSpGEMMCPU(IndexStmt stmt, bool doPrecompute) { Assignment assign = stmt.as().getStmt().as().getStmt() .as().getStmt().as(); @@ -107,6 +155,68 @@ IndexStmt scheduleSDDMMCPU(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, .parallelize(kpos1, ParallelUnit::CPUVector, OutputRaceStrategy::ParallelReduction); } +IndexStmt scheduleSDDMMCSRCPU(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { + IndexVar i0("i0"), i1("i1"), kpos("kpos"), kpos0("kpos0"), kpos1("kpos1"); + return stmt; + // return stmt.split(i, i0, i1, CHUNK_SIZE) + // .pos(k, kpos, B(i,k)) + // .split(kpos, kpos0, kpos1, UNROLL_FACTOR) + // .reorder({i0, i1, kpos0, j, kpos1}); + // .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces); + // .parallelize(k, ParallelUnit::CPUVector, OutputRaceStrategy::IgnoreRaces); +} + +IndexStmt scheduleSDDMM2CPU(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { + IndexVar i0("i0"), i1("i1"), jpos("jpos"), jpos0("jpos0"), jpos1("jpos1"); + return stmt.split(i, i0, i1, CHUNK_SIZE) + .pos(j, jpos, B(i,j)) + .split(jpos, jpos0, jpos1, UNROLL_FACTOR) + .reorder({i0, i1, jpos0, k, jpos1}) + .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces) + .parallelize(jpos1, ParallelUnit::CPUVector, OutputRaceStrategy::ParallelReduction); +} + +IndexStmt scheduleSDDMMISPC(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { + IndexVar i0("i0"), i1("i1"), kpos("kpos"), kpos0("kpos0"), kpos1("kpos1"); + return stmt.split(i, i0, i1, CHUNK_SIZE) + .pos(k, kpos, B(i,k)) + .split(kpos, kpos0, kpos1, UNROLL_FACTOR) + .reorder({i0, i1, kpos0, j, kpos1}) + // .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces); + .parallelize(kpos1, ParallelUnit::CPUSimd, OutputRaceStrategy::ParallelReduction); +} + +IndexStmt scheduleSDDMM2ISPC(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { + IndexVar i0("i0"), i1("i1"), jpos("jpos"), jpos0("jpos0"), jpos1("jpos1"); + return stmt.split(i, i0, i1, CHUNK_SIZE) + .pos(j, jpos, B(i,j)) + .split(jpos, jpos0, jpos1, UNROLL_FACTOR) + .reorder({i0, i1, jpos0, k, jpos1}) + // .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces); + .parallelize(jpos1, ParallelUnit::CPUSimd, OutputRaceStrategy::ParallelReduction); +} + +IndexStmt scheduleSDDMMISPC1(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { + IndexVar i0("i0"), i1("i1"), kpos("kpos"), kpos0("kpos0"), kpos1("kpos1"); + return stmt.split(i, i0, i1, CHUNK_SIZE) + .pos(k, kpos, B(i,k)) + .split(kpos, kpos0, kpos1, UNROLL_FACTOR) + .reorder({i0, i1, kpos0, j, kpos1}) + .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces) + .parallelize(kpos1, ParallelUnit::CPUSimd, OutputRaceStrategy::ParallelReduction); +} + +IndexStmt scheduleSDDMMISPC2(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { + IndexVar i0("i0"), i1("i1"), kpos("kpos"), kpos0("kpos0"), kpos1("kpos1"); + return stmt; + // .split(i, i0, i1, CHUNK_SIZE) + // .pos(k, kpos, B(i,k)) + // .split(kpos, kpos0, kpos1, UNROLL_FACTOR) + // .reorder({i0, i1, kpos0, j, kpos1}) + // .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces) + // .parallelize(kpos1, ParallelUnit::CPUSimd, OutputRaceStrategy::ParallelReduction); +} + IndexStmt scheduleTTVCPU(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16) { IndexVar f("f"), fpos("fpos"), chunk("chunk"), fpos2("fpos2"); return stmt.fuse(i, j, f) @@ -116,6 +226,16 @@ IndexStmt scheduleTTVCPU(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16) { .parallelize(chunk, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces); } +IndexStmt scheduleTTVISPC(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16) { + IndexVar f("f"), fpos("fpos"), chunk("chunk"), fpos2("fpos2"); + // return stmt; + return stmt.fuse(i, j, f) + .pos(f, fpos, B(i,j,k)) + .split(fpos, chunk, fpos2, CHUNK_SIZE) + .reorder({chunk, fpos2, k}) + .parallelize(chunk, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces); +} + IndexStmt scheduleTTVCPUCSR(IndexStmt stmt) { TensorVar result = stmt.as().getStmt().as().getStmt() .as().getStmt().as().getLhs() @@ -125,6 +245,25 @@ IndexStmt scheduleTTVCPUCSR(IndexStmt stmt) { OutputRaceStrategy::NoRaces); } +IndexStmt scheduleTTVCPUCSR_ST(IndexStmt stmt) { + TensorVar result = stmt.as().getStmt().as().getStmt() + .as().getStmt().as().getLhs() + .getTensorVar(); + return stmt.assemble(result, AssembleStrategy::Insert); +} + +IndexStmt scheduleTTVISPCCSR(IndexStmt stmt) { + TensorVar result = stmt.as().getStmt().as().getStmt() + .as().getStmt().as().getLhs() + .getTensorVar(); + return stmt.assemble(result, AssembleStrategy::Insert) + .parallelize(i, ParallelUnit::CPUSimd, OutputRaceStrategy::NoRaces); +} + +IndexStmt scheduleTTVISPCCSR2(IndexStmt stmt) { + return stmt; +} + IndexStmt scheduleTTMCPU(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { IndexVar f("f"), fpos("fpos"), chunk("chunk"), fpos2("fpos2"), kpos("kpos"), kpos1("kpos1"), kpos2("kpos2"); return stmt.fuse(i, j, f) @@ -149,12 +288,47 @@ IndexStmt scheduleMTTKRPCPU(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, .parallelize(i1, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces); } +IndexStmt scheduleMTTKRPCPU_ST(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { + IndexVar i1("i1"), i2("i2"); + IndexExpr precomputeExpr = stmt.as().getStmt().as().getStmt() + .as().getStmt().as().getStmt() + .as().getRhs().as().getA(); + TensorVar w("w", Type(Float64, {Dimension(j)}), taco::dense); + return stmt.split(i, i1, i2, CHUNK_SIZE) + .reorder({i1, i2, k, l, j}) + .precompute(precomputeExpr, j, j, w); + // .parallelize(j, ParallelUnit::CPUVector, OutputRaceStrategy::Atomics); // gives error when lowering for IgnoreRaces, NoRaces and Atomics + // .parallelize(i1, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces); +} + +IndexStmt scheduleMTTKRPISPC(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { + IndexVar i1("i1"), i2("i2"); + IndexExpr precomputeExpr = stmt.as().getStmt().as().getStmt() + .as().getStmt().as().getStmt() + .as().getRhs().as().getA(); + TensorVar w("w", Type(Float64, {Dimension(j)}), taco::dense); + return stmt.split(i, i1, i2, CHUNK_SIZE) + .reorder({i1, i2, k, l, j}) + .precompute(precomputeExpr, j, j, w) + .parallelize(j, ParallelUnit::CPUSimd, OutputRaceStrategy::NoRaces); +} + IndexStmt scheduleMTTKRPPrecomputedCPU(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { IndexVar i1("i1"), i2("i2"), j_pre("j_pre"); return stmt.split(i, i1, i2, CHUNK_SIZE) .parallelize(i1, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces); } +IndexStmt scheduleMTTKRPPrecomputedCPU_ST(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { + IndexVar i1("i1"), i2("i2"), j_pre("j_pre"); + return stmt.split(i, i1, i2, CHUNK_SIZE); +} + +IndexStmt scheduleMTTKRPPrecomputedISPC_ST(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { + IndexVar i1("i1"), i2("i2"), j_pre("j_pre"); + return stmt.parallelize(j, ParallelUnit::CPUSimd, OutputRaceStrategy::NoRaces); +} + IndexStmt scheduleMTTKRP4CPU(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { IndexVar i1("i1"), i2("i2"); return stmt.split(i, i1, i2, CHUNK_SIZE) @@ -162,6 +336,19 @@ IndexStmt scheduleMTTKRP4CPU(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16 .parallelize(i1, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces); } +IndexStmt scheduleMTTKRP4CPU_ST(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { + IndexVar i1("i1"), i2("i2"); + return stmt.split(i, i1, i2, CHUNK_SIZE) + .reorder({i1, i2, k, l, m, j}); +} + +IndexStmt scheduleMTTKRP4ISPC_ST(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { + IndexVar i1("i1"), i2("i2"); + return stmt.split(i, i1, i2, CHUNK_SIZE) + .reorder({i1, i2, k, l, m, j}) + .parallelize(j, ParallelUnit::CPUSimd, OutputRaceStrategy::NoRaces); +} + IndexStmt scheduleMTTKRP5CPU(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { IndexVar i1("i1"), i2("i2"); return stmt.split(i, i1, i2, CHUNK_SIZE) @@ -576,6 +763,92 @@ TEST(scheduling_eval, spmmCPU) { ASSERT_TENSOR_EQ(expected, C); } +TEST(scheduling_eval, spmmISPC) { + taco::util::TimeResults timevalue; + bool time = true; + + set_ISPC_codegen_enabled(false); + set_CUDA_codegen_enabled(false); + + int NUM_I = 1021/10; + int NUM_J = 1039/10; + int NUM_K = 128; + float SPARSITY = .1; + Tensor A("A", {NUM_I, NUM_J}, CSR); + Tensor B("B", {NUM_J, NUM_K}, {Dense, Dense}); + Tensor C("C", {NUM_I, NUM_K}, {Dense, Dense}); + + srand(75883); + for (int i = 0; i < NUM_I; i++) { + for (int j = 0; j < NUM_J; j++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + if (rand_float < SPARSITY) { + A.insert({i, j}, (double) ((int) (rand_float*3/SPARSITY))); + } + } + } + + for (int j = 0; j < NUM_J; j++) { + for (int k = 0; k < NUM_K; k++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + B.insert({j, k}, (double) ((int) (rand_float*3/SPARSITY))); + } + } + + A.pack(); + B.pack(); + + set_ISPC_codegen_enabled(true); + C(i, k) = A(i, j) * B(j, k); + + IndexStmt stmt = C.getAssignment().concretize(); + // stmt = scheduleSpMMISPC1(stmt, A); + // stmt = scheduleSpMMISPC1_2(stmt, A); + stmt = scheduleSpMMISPC1_3(stmt, A); + + // stmt = scheduleSpMMISPC2(stmt, A); + // stmt = scheduleSpMMISPC2_2(stmt, A); + + // stmt = scheduleSpMMISPC3(stmt, A); + // stmt = scheduleSpMMISPC3_2(stmt, A); + + //printToFile("spmm_cpu", stmt); + + C.compile(stmt); + C.assemble(); + C.compute(); + + set_ISPC_codegen_enabled(false); + Tensor expected("expected", {NUM_I, NUM_K}, {Dense, Dense}); + expected(i, k) = A(i, j) * B(j, k); + IndexStmt stmt_taco = expected.getAssignment().concretize(); + stmt_taco = scheduleSpMMCPU(stmt_taco, A); + + expected.compile(stmt_taco); + expected.assemble(); + expected.compute(); + ASSERT_TENSOR_EQ(expected, C); + + // float ERROR_MARGIN = 0.01; + // ASSERT_TENSOR_VAL(expected, y); + for (int i = 0; i < NUM_I; i++) { + for (int k = 0; k < NUM_K; k++) { + if (expected(i,k) <= C(i,k) + ERROR_MARGIN && expected(i,k) >= C(i,k) - ERROR_MARGIN) { + // std::cout << "matched values: expected -> " << expected(j) << " == " << y(j) << " <- actual\n"; + } + else { + std::cout << "unmatched values: expected -> " << expected(i,k) << " != " << C(i,k) << " <- actual\n"; + ASSERT_TRUE(false); + }; + } + } + + for (int i=0; i<10; i++) { + TOOL_BENCHMARK_TIMER(C.compute(), "Compute ISPC: ", timevalue); + TOOL_BENCHMARK_TIMER(expected.compute(), "Compute TACO: ", timevalue); + } +} + struct spgemm : public TestWithParam> {}; TEST_P(spgemm, scheduling_eval) { @@ -805,7 +1078,7 @@ TEST(scheduling_eval, sddmmCPU) { IndexStmt stmt = A.getAssignment().concretize(); stmt = scheduleSDDMMCPU(stmt, B); - //printToFile("sddmm_cpu", stmt); + printToFile("sddmm_cpu_ryan2", stmt); A.compile(stmt); A.assemble(); @@ -819,55 +1092,69 @@ TEST(scheduling_eval, sddmmCPU) { ASSERT_TENSOR_EQ(expected, A); } -TEST(scheduling_eval, spmvCPU) { - if (should_use_CUDA_codegen()) { +TEST(scheduling_eval, sddmmSPMMFusedCPU) { + if (should_use_CUDA_codegen() || should_use_ISPC_codegen()) { return; } + int NUM_I = 1021/10; int NUM_J = 1039/10; + int NUM_K = 1057/10; float SPARSITY = .3; - Tensor A("A", {NUM_I, NUM_J}, CSR); - Tensor x("x", {NUM_J}, Format({Dense})); - Tensor y("y", {NUM_I}, Format({Dense})); + Tensor A("A", {NUM_I, NUM_K}, {Dense, Dense}); + Tensor B("B", {NUM_I, NUM_K}, CSR); + Tensor C("C", {NUM_I, NUM_J}, {Dense, Dense}); + Tensor D("D", {NUM_J, NUM_K}, {Dense, Dense}); - srand(120); + srand(268238); for (int i = 0; i < NUM_I; i++) { for (int j = 0; j < NUM_J; j++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + C.insert({i, j}, (double) ((int) (rand_float*3/SPARSITY))); + } + } + + for (int i = 0; i < NUM_I; i++) { + for (int k = 0; k < NUM_K; k++) { float rand_float = (float)rand()/(float)(RAND_MAX); if (rand_float < SPARSITY) { - A.insert({i, j}, (double) ((int) (rand_float * 3 / SPARSITY))); + B.insert({i, k}, (double) ((int) (rand_float*3/SPARSITY))); } } } for (int j = 0; j < NUM_J; j++) { - float rand_float = (float)rand()/(float)(RAND_MAX); - x.insert({j}, (double) ((int) (rand_float*3/SPARSITY))); + for (int k = 0; k < NUM_K; k++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + D.insert({j, k}, (double) ((int) (rand_float*3/SPARSITY))); + } } - x.pack(); - A.pack(); + B.pack(); + C.pack(); + D.pack(); - y(i) = A(i, j) * x(j); + A(i,k) = B(i,k) * C(i,j) * D(j,k); - IndexStmt stmt = y.getAssignment().concretize(); - stmt = scheduleSpMVCPU(stmt); + IndexStmt stmt = A.getAssignment().concretize(); + stmt = scheduleSDDMMCPU(stmt, B); - //printToFile("spmv_cpu", stmt); + printToFile("sddmm_cpu_ryan2", stmt); - y.compile(stmt); - y.assemble(); - y.compute(); + A.compile(stmt); + A.assemble(); + A.compute(); - Tensor expected("expected", {NUM_I}, Format({Dense})); - expected(i) = A(i, j) * x(j); + Tensor expected("expected", {NUM_I, NUM_K}, {Dense, Dense}); + expected(i,k) = B(i,k) * C(i,j) * D(j,k); expected.compile(); expected.assemble(); expected.compute(); - ASSERT_TENSOR_EQ(expected, y); + ASSERT_TENSOR_EQ(expected, A); } -TEST(scheduling_eval, ttvCPU) { + +TEST(scheduling_eval, sddmmcsrCPU) { if (should_use_CUDA_codegen()) { return; } @@ -875,7 +1162,495 @@ TEST(scheduling_eval, ttvCPU) { int NUM_J = 1039/10; int NUM_K = 1057/10; float SPARSITY = .3; - Tensor A("A", {NUM_I, NUM_J}, {Dense, Dense}); // TODO: change to sparse outputs + Tensor A("A", {NUM_I, NUM_K}, CSR); + Tensor B("B", {NUM_I, NUM_K}, CSR); + Tensor C("C", {NUM_I, NUM_J}, {Dense, Dense}); + Tensor D("D", {NUM_J, NUM_K}, {Dense, Dense}); + + srand(268238); + for (int i = 0; i < NUM_I; i++) { + for (int j = 0; j < NUM_J; j++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + C.insert({i, j}, (double) ((int) (rand_float*3/SPARSITY))); + } + } + + for (int i = 0; i < NUM_I; i++) { + for (int k = 0; k < NUM_K; k++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + if (rand_float < SPARSITY) { + B.insert({i, k}, (double) ((int) (rand_float*3/SPARSITY))); + } + } + } + + for (int j = 0; j < NUM_J; j++) { + for (int k = 0; k < NUM_K; k++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + D.insert({j, k}, (double) ((int) (rand_float*3/SPARSITY))); + } + } + + B.pack(); + C.pack(); + D.pack(); + + A(i,k) = B(i,k) * C(i,j) * D(j,k); + + IndexStmt stmt = A.getAssignment().concretize(); + stmt = scheduleSDDMMCSRCPU(stmt, B); + + printToFile("sddmm_cpu", stmt); + + A.compile(stmt); + A.assemble(); + A.compute(); + + Tensor expected("expected", {NUM_I, NUM_K}, CSR); + expected(i,k) = B(i,k) * C(i,j) * D(j,k); + + IndexStmt stmt_ref = expected.getAssignment().concretize(); + printToFile("sddmm_cpu_ref", stmt_ref); + + expected.compile(stmt_ref); + expected.assemble(); + expected.compute(); + ASSERT_TENSOR_EQ(expected, A); +} + + +TEST(scheduling_eval, sddmm2CPU) { + if (should_use_CUDA_codegen()) { + return; + } + int NUM_I = 1021/10; + int NUM_J = 1021/10; + int NUM_K = 18; + float SPARSITY = .3; + Tensor Y("Y", {NUM_I, NUM_J}, {Dense, Compressed(ModeFormat::UNIQUE)}); + Tensor A("A", {NUM_I, NUM_J}, {Dense, Compressed(ModeFormat::UNIQUE)}); + Tensor X("X", {NUM_I, NUM_K}, {Dense, Dense}); + + srand(268238); + + for (int i = 0; i < NUM_I; i++) { + for (int j = 0; j < NUM_J; j++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + if (rand_float < SPARSITY) { + A.insert({i, j}, (double) ((int) (rand_float*3/SPARSITY))); + } + } + } + + for (int i = 0; i < NUM_J; i++) { + for (int k = 0; k < NUM_K; k++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + X.insert({i, k}, (double) ((int) (rand_float*3/SPARSITY))); + } + } + + A.pack(); + X.pack(); + + Y(i,j) = A(i,j) * X(i,k) * X(k,j); + + // IndexStmt stmt = A.getAssignment().concretize(); + // // stmt = scheduleSDDMMCPU(stmt, A); + + // printToFile("sddmm2_cpu", stmt); + + // A.compile(stmt); + // A.assemble(); + // A.compute(); + + // Tensor expected("expected", {NUM_I, NUM_J}, {Dense, Dense}); + // expected(i,j) = A(i,j) * X(i,k) * X(j,k); + // expected.compile(); + // expected.assemble(); + // expected.compute(); + // ASSERT_TENSOR_EQ(expected, A); +} + + + +// bin/taco-test --gtest_filter=scheduling_eval.sddmmISPC +TEST(scheduling_eval, sddmmISPC) { + + taco::util::TimeResults timevalue; + bool time = true; + + set_CUDA_codegen_enabled(false); + set_ISPC_codegen_enabled(false); + + int NUM_I = 1021/10; + int NUM_J = 1039/10; + int NUM_K = 1057/10; + float SPARSITY = .3; + Tensor A("A", {NUM_I, NUM_K}, {Dense, Dense}); + Tensor B("B", {NUM_I, NUM_K}, CSR); + Tensor C("C", {NUM_I, NUM_J}, {Dense, Dense}); + Tensor D("D", {NUM_J, NUM_K}, {Dense, Dense}); + + srand(268238); + for (int i = 0; i < NUM_I; i++) { + for (int j = 0; j < NUM_J; j++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + C.insert({i, j}, (double) ((int) (rand_float*3/SPARSITY))); + } + } + + for (int i = 0; i < NUM_I; i++) { + for (int k = 0; k < NUM_K; k++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + if (rand_float < SPARSITY) { + B.insert({i, k}, (double) ((int) (rand_float*3/SPARSITY))); + } + } + } + + for (int j = 0; j < NUM_J; j++) { + for (int k = 0; k < NUM_K; k++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + D.insert({j, k}, (double) ((int) (rand_float*3/SPARSITY))); + } + } + + B.pack(); + C.pack(); + D.pack(); + + set_ISPC_codegen_enabled(true); + A(i,k) = B(i,k) * C(i,j) * D(j,k); + + IndexStmt stmt = A.getAssignment().concretize(); + stmt = scheduleSDDMMISPC(stmt, B); + + //printToFile("sddmm_cpu", stmt); + + A.compile(stmt); + A.assemble(); + // A.compute(); + + set_ISPC_codegen_enabled(false); + Tensor expected("expected", {NUM_I, NUM_K}, {Dense, Dense}); + expected(i,k) = B(i,k) * C(i,j) * D(j,k); + IndexStmt stmt_taco = A.getAssignment().concretize(); + stmt_taco = scheduleSDDMMCPU(stmt_taco, B); + expected.compile(stmt_taco); + expected.assemble(); + // expected.compute(); + + TOOL_BENCHMARK_TIMER(A.compute(), "Compute ISPC: ", timevalue); + TOOL_BENCHMARK_TIMER(expected.compute(), "Compute TACO: ", timevalue); + + ASSERT_TENSOR_EQ(expected, A); + + + // float ERROR_MARGIN = 0.01; + // ASSERT_TENSOR_VAL(expected, y); + for (int i = 0; i < NUM_I; i++) { + for (int k = 0; k < NUM_K; k++) { + if (expected(i,k) <= A(i,k) + ERROR_MARGIN && expected(i,k) >= A(i,k) - ERROR_MARGIN) { + // std::cout << "matched values: expected -> " << expected(j) << " == " << y(j) << " <- actual\n"; + } + else { + std::cout << "unmatched values: expected -> " << expected(i,k) << " != " << A(i,k) << " <- actual\n"; + ASSERT_TRUE(false); + }; + } + } + std::cout << "test scheduling_eval.sddmmISPC passed\n"; + +} + + +// bin/taco-test --gtest_filter=scheduling_eval.sddmmISPC +TEST(scheduling_eval, sddmm2ISPC) { + + taco::util::TimeResults timevalue; + bool time = true; + + set_CUDA_codegen_enabled(false); + set_ISPC_codegen_enabled(false); + + int NUM_I = 1021/10; + int NUM_K = 1039/10; + int NUM_J = 1021/10; + float SPARSITY = .3; + Tensor A("A", {NUM_I, NUM_J}, {Dense, Dense}); + Tensor B("B", {NUM_I, NUM_J}, CSR); + Tensor C("C", {NUM_I, NUM_K}, {Dense, Dense}); + + srand(268238); + for (int i = 0; i < NUM_I; i++) { + for (int k = 0; k < NUM_K; k++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + C.insert({i, k}, (double) ((int) (rand_float*3/SPARSITY))); + } + } + + for (int i = 0; i < NUM_I; i++) { + for (int j = 0; j < NUM_J; j++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + if (rand_float < SPARSITY) { + B.insert({i, j}, (double) ((int) (rand_float*3/SPARSITY))); + } + } + } + + B.pack(); + C.pack(); + + set_ISPC_codegen_enabled(true); + A(i,j) = B(i,j) * C(i,k) * C(j,k); + + IndexStmt stmt = A.getAssignment().concretize(); + stmt = scheduleSDDMM2ISPC(stmt, B); + + //printToFile("sddmm_cpu", stmt); + + A.compile(stmt); + A.assemble(); + // A.compute(); + + set_ISPC_codegen_enabled(false); + Tensor expected("expected", {NUM_I, NUM_J}, {Dense, Dense}); + expected(i,j) = B(i,j) * C(i,k) * C(j,k); + IndexStmt stmt_taco = A.getAssignment().concretize(); + stmt_taco = scheduleSDDMM2CPU(stmt_taco, B); + expected.compile(stmt_taco); + expected.assemble(); + // expected.compute(); + + TOOL_BENCHMARK_TIMER(A.compute(), "Compute ISPC: ", timevalue); + TOOL_BENCHMARK_TIMER(expected.compute(), "Compute TACO: ", timevalue); + + ASSERT_TENSOR_EQ(expected, A); + + + // float ERROR_MARGIN = 0.01; + // ASSERT_TENSOR_VAL(expected, y); + for (int i = 0; i < NUM_I; i++) { + for (int j = 0; j < NUM_J; j++) { + if (expected(i,j) <= A(i,j) + ERROR_MARGIN && expected(i,j) >= A(i,j) - ERROR_MARGIN) { + // std::cout << "matched values: expected -> " << expected(j) << " == " << y(j) << " <- actual\n"; + } + else { + std::cout << "unmatched values: expected -> " << expected(i,j) << " != " << A(i,j) << " <- actual\n"; + ASSERT_TRUE(false); + }; + } + } + std::cout << "test scheduling_eval.sddmmISPC passed\n"; + +} + + +TEST(scheduling_eval, spmvCPU) { + if (should_use_CUDA_codegen()) { + return; + } + int NUM_I = 1021/10; + int NUM_J = 1039/10; + float SPARSITY = .3; + Tensor A("A", {NUM_I, NUM_J}, CSR); + Tensor x("x", {NUM_J}, Format({Dense})); + Tensor y("y", {NUM_I}, Format({Dense})); + + srand(120); + for (int i = 0; i < NUM_I; i++) { + for (int j = 0; j < NUM_J; j++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + if (rand_float < SPARSITY) { + A.insert({i, j}, (double) ((int) (rand_float * 3 / SPARSITY))); + } + } + } + + for (int j = 0; j < NUM_J; j++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + x.insert({j}, (double) ((int) (rand_float*3/SPARSITY))); + } + + x.pack(); + A.pack(); + + y(i) = A(i, j) * x(j); + + IndexStmt stmt = y.getAssignment().concretize(); + stmt = scheduleSpMVCPU(stmt); + + //printToFile("spmv_cpu", stmt); + + y.compile(stmt); + y.assemble(); + y.compute(); + + Tensor expected("expected", {NUM_I}, Format({Dense})); + expected(i) = A(i, j) * x(j); + expected.compile(); + expected.assemble(); + expected.compute(); + ASSERT_TENSOR_EQ(expected, y); +} + + +TEST(scheduling_eval, spmvISPC) { + + taco::util::TimeResults timevalue; + bool time = true; + + set_ISPC_codegen_enabled(false); + set_CUDA_codegen_enabled(false); + + int NUM_I = 200021/10; + int NUM_J = 200039/10; + float SPARSITY = .2; + Tensor A("A", {NUM_I, NUM_J}, CSR); + Tensor x("x", {NUM_J}, Format({Dense})); + Tensor y("y", {NUM_I}, Format({Dense})); + + srand(120); + for (int i = 0; i < NUM_I; i++) { + for (int j = 0; j < NUM_J; j++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + if (rand_float < SPARSITY) { + A.insert({i, j}, (double) ((int) (rand_float * 3 / SPARSITY))); + } + } + } + + for (int j = 0; j < NUM_J; j++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + x.insert({j}, (double) ((int) (rand_float*3/SPARSITY))); + } + + x.pack(); + A.pack(); + + set_ISPC_codegen_enabled(true); + + y(i) = A(i, j) * x(j); + + IndexStmt stmt = y.getAssignment().concretize(); + // stmt = scheduleSpMVISPC(stmt); + + printToFile("spmv_cpu", stmt); + + y.compile(stmt); + y.assemble(); + // y.compile(); + + set_ISPC_codegen_enabled(false); + + // Tensor expected("expected", {NUM_I}, Format({Dense})); + // expected(i) = A(i, j) * x(j); + // expected.compile(); + // expected.assemble(); + // expected.compute(); + + + Tensor expected("expected", {NUM_I}, Format({Dense})); + expected(i) = A(i, j) * x(j); + IndexStmt stmt_taco = expected.getAssignment().concretize(); + stmt_taco = scheduleSpMVCPU(stmt_taco); + + expected.compile(stmt_taco); + expected.assemble(); + // expected.compile(); + + + TOOL_BENCHMARK_TIMER(y.compute(), "Compute ISPC: ", timevalue); + TOOL_BENCHMARK_TIMER(expected.compute(), "Compute TACO: ", timevalue); + + + ASSERT_TENSOR_EQ(expected, y); + + // float ERROR_MARGIN = 0.01; + // ASSERT_TENSOR_VAL(expected, y); + for (int j = 0; j < NUM_J; j++) { + if (expected(j) <= y(j) + ERROR_MARGIN && expected(j) >= y(j) - ERROR_MARGIN) { + // std::cout << "matched values: expected -> " << expected(j) << " == " << y(j) << " <- actual\n"; + } + else { + std::cout << "unmatched values: expected -> " << expected(j) << " != " << y(j) << " <- actual\n"; + ASSERT_TRUE(false); + }; + } + + std::cout << "test scheduling_eval.spmvISPC passed\n"; + + for (int i=0; i<10; i++) { + TOOL_BENCHMARK_TIMER(y.compute(), "Compute ISPC: ", timevalue); + TOOL_BENCHMARK_TIMER(expected.compute(), "Compute TACO: ", timevalue); + } + + +} + +TEST(scheduling_eval, ttvCPU) { + if (should_use_CUDA_codegen()) { + return; + } + int NUM_I = 1021/10; + int NUM_J = 1039/10; + int NUM_K = 1057/10; + float SPARSITY = .3; + Tensor A("A", {NUM_I, NUM_J}, {Dense, Dense}); // TODO: change to sparse outputs + Tensor B("B", {NUM_I, NUM_J, NUM_K}, {Sparse, Sparse, Sparse}); + Tensor c("c", {NUM_K}, Format({Dense})); + + srand(9536); + for (int i = 0; i < NUM_I; i++) { + for (int j = 0; j < NUM_J; j++) { + for (int k = 0; k < NUM_K; k++) { + float rand_float = (float) rand() / (float) (RAND_MAX); + if (rand_float < SPARSITY) { + B.insert({i, j, k}, (double) ((int) (rand_float * 3 / SPARSITY))); + } + } + } + } + + for (int k = 0; k < NUM_K; k++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + c.insert({k}, (double) ((int) (rand_float*3))); + } + + B.pack(); + c.pack(); + + A(i,j) = B(i,j,k) * c(k); + + IndexStmt stmt = A.getAssignment().concretize(); + stmt = scheduleTTVCPU(stmt, B); + + printToFile("ttv_cpu", stmt); + + A.compile(stmt); + A.assemble(); + A.compute(); + + Tensor expected("expected", {NUM_I, NUM_J}, {Dense, Dense}); + expected(i,j) = B(i,j,k) * c(k); + expected.compile(); + expected.assemble(); + expected.compute(); + ASSERT_TENSOR_EQ(expected, A); +} + + +TEST(scheduling_eval, ttvISPC) { + if (should_use_CUDA_codegen()) { + return; + } + set_CUDA_codegen_enabled(false); + set_ISPC_codegen_enabled(false); + int NUM_I = 1021/10; + int NUM_J = 1039/10; + int NUM_K = 1057/10; + float SPARSITY = .3; + Tensor A("A", {NUM_I, NUM_J}, {Dense, Dense}); // TODO: change to sparse outputs Tensor B("B", {NUM_I, NUM_J, NUM_K}, {Sparse, Sparse, Sparse}); Tensor c("c", {NUM_K}, Format({Dense})); @@ -899,25 +1674,30 @@ TEST(scheduling_eval, ttvCPU) { B.pack(); c.pack(); + set_ISPC_codegen_enabled(true); A(i,j) = B(i,j,k) * c(k); IndexStmt stmt = A.getAssignment().concretize(); - stmt = scheduleTTVCPU(stmt, B); + stmt = scheduleTTVISPC(stmt, B); - //printToFile("ttv_cpu", stmt); + printToFile("ttv_ispc", "__ttv_ispc", stmt); A.compile(stmt); A.assemble(); A.compute(); + set_ISPC_codegen_enabled(false); Tensor expected("expected", {NUM_I, NUM_J}, {Dense, Dense}); expected(i,j) = B(i,j,k) * c(k); + IndexStmt stmt_taco = expected.getAssignment().concretize(); + stmt_taco = scheduleTTVCPU(stmt_taco, B); expected.compile(); expected.assemble(); expected.compute(); ASSERT_TENSOR_EQ(expected, A); } + TEST(scheduling_eval, ttvCPU_CSR) { if (should_use_CUDA_codegen()) { return; @@ -928,7 +1708,7 @@ TEST(scheduling_eval, ttvCPU_CSR) { int NUM_K = 1057/10; float SPARSITY = .3; Tensor A("A", {NUM_I, NUM_J}, {Dense, Sparse}); - Tensor B("B", {NUM_I, NUM_J, NUM_K}, {Sparse, Sparse, Sparse}); + Tensor B("B", {NUM_I, NUM_J, NUM_K}, {Dense, Sparse, Sparse}); Tensor c("c", {NUM_K}, Format({Dense})); srand(9536); @@ -956,11 +1736,13 @@ TEST(scheduling_eval, ttvCPU_CSR) { IndexStmt stmt = A.getAssignment().concretize(); stmt = scheduleTTVCPUCSR(stmt); + printToFile("ttv_cpu_csr", stmt); + A.compile(stmt); A.assemble(); A.compute(); - Tensor expected("expected", {NUM_I, NUM_J}, {Dense, Dense}); + Tensor expected("expected", {NUM_I, NUM_J}, {Dense, Sparse}); expected(i,j) = B(i,j,k) * c(k); expected.compile(); expected.assemble(); @@ -968,6 +1750,82 @@ TEST(scheduling_eval, ttvCPU_CSR) { ASSERT_TENSOR_EQ(expected, A); } +TEST(scheduling_eval, ttvISPC_CSR) { + if (should_use_CUDA_codegen()) { + return; + } + + int NUM_I = 10000; + int NUM_J = 1039/10; + int NUM_K = 128; + float SPARSITY = .3; + Tensor A("A", {NUM_I, NUM_J}, {Dense, Sparse}); + Tensor B("B", {NUM_I, NUM_J, NUM_K}, {Dense, Sparse, Sparse}); + Tensor c("c", {NUM_K}, Format({Dense})); + + srand(9536); + for (int i = 0; i < NUM_I; i++) { + for (int j = 0; j < NUM_J; j++) { + for (int k = 0; k < NUM_K; k++) { + float rand_float = (float) rand() / (float) (RAND_MAX); + if (rand_float < SPARSITY) { + B.insert({i, j, k}, (double) ((int) (rand_float * 3 / SPARSITY))); + } + } + } + } + + for (int k = 0; k < NUM_K; k++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + c.insert({k}, (double) ((int) (rand_float*3))); + } + + B.pack(); + c.pack(); + + set_ISPC_codegen_enabled(true); + A(i,j) = B(i,j,k) * c(k); + + IndexStmt stmt = A.getAssignment().concretize(); + stmt = scheduleTTVISPCCSR(stmt); + printToFile("ttv_ispc_csr", "__ttv_ispc_csr", stmt); + + A.compile(stmt); + A.assemble(); + A.compute(); + + set_ISPC_codegen_enabled(false); + Tensor expected("expected", {NUM_I, NUM_J}, {Dense, Sparse}); + expected(i,j) = B(i,j,k) * c(k); + IndexStmt taco_stmt = expected.getAssignment().concretize(); + taco_stmt = scheduleTTVCPUCSR_ST(taco_stmt); + expected.compile(taco_stmt); + expected.assemble(); + expected.compute(); + ASSERT_TENSOR_EQ(expected, A); + + Tensor A2("A2", {NUM_I, NUM_J}, {Dense, Sparse}); + set_ISPC_codegen_enabled(true); + A2(i,j) = B(i,j,k) * c(k); + + IndexStmt stmt2 = A2.getAssignment().concretize(); + + A2.compile(stmt2); + A2.assemble(); + A2.compute(); + + taco::util::TimeResults timevalue; + bool time = true; + + for (int i=0; i<3; i++) { + TOOL_BENCHMARK_TIMER(expected.compute(), "Compute TACO1: ", timevalue); + TOOL_BENCHMARK_TIMER(A.compute(), "Compute ISPC1: ", timevalue); + TOOL_BENCHMARK_TIMER(A2.compute(), "Compute ISPC2: ", timevalue); + } + + +} + TEST(scheduling_eval, ttmCPU) { if (should_use_CUDA_codegen()) { return; @@ -1010,39 +1868,318 @@ TEST(scheduling_eval, ttmCPU) { //printToFile("ttm_cpu", stmt); - A.compile(stmt); - A.assemble(); - A.compute(); + A.compile(stmt); + A.assemble(); + A.compute(); + + Tensor expected("expected", {NUM_I, NUM_J, NUM_L}, {Dense, Dense, Dense}); + expected(i,j,l) = B(i,j,k) * C(k,l); + expected.compile(); + expected.assemble(); + expected.compute(); + ASSERT_TENSOR_EQ(expected, A); +} + +TEST(scheduling_eval, ttmISPC) { + if (should_use_CUDA_codegen()) { + return; + } + int NUM_I = 1021/40; + int NUM_J = 1039/40; + int NUM_K = 1057/40; + int NUM_L = 1232/40; + float SPARSITY = .1; + Tensor A("A", {NUM_I, NUM_J, NUM_L}, {Dense, Dense, Dense}); // TODO: change to sparse outputs + Tensor B("B", {NUM_I, NUM_J, NUM_K}, {Sparse, Sparse, Sparse}); + Tensor C("C", {NUM_K, NUM_L}, {Dense, Dense}); + + srand(935); + for (int i = 0; i < NUM_I; i++) { + for (int j = 0; j < NUM_J; j++) { + for (int k = 0; k < NUM_K; k++) { + float rand_float = (float) rand() / (float) (RAND_MAX); + if (rand_float < SPARSITY) { + B.insert({i, j, k}, (double) ((int) (rand_float * 3 / SPARSITY))); + } + } + } + } + + for (int k = 0; k < NUM_K; k++) { + for (int l = 0; l < NUM_L; l++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + C.insert({k, l}, (double) ((int) (rand_float*3))); + } + } + + B.pack(); + C.pack(); + + A(i,j,l) = B(i,j,k) * C(k,l); + + IndexStmt stmt = A.getAssignment().concretize(); + stmt = scheduleTTMCPU(stmt, B); + + //printToFile("ttm_cpu", stmt); + + A.compile(stmt); + A.assemble(); + A.compute(); + + Tensor expected("expected", {NUM_I, NUM_J, NUM_L}, {Dense, Dense, Dense}); + expected(i,j,l) = B(i,j,k) * C(k,l); + expected.compile(); + expected.assemble(); + expected.compute(); + ASSERT_TENSOR_EQ(expected, A); +} + +TEST(scheduling_eval, mttkrpCPU) { + if (should_use_CUDA_codegen()) { + return; + } + int NUM_I = 1021/20; + int NUM_J = 1039/20; + int NUM_K = 1057/20; + int NUM_L = 1232/20; + float SPARSITY = .1; + Tensor A("A", {NUM_I, NUM_J}, {Dense, Dense}); + Tensor B("B", {NUM_I, NUM_K, NUM_L}, {Dense, Sparse, Sparse}); + Tensor C("C", {NUM_K, NUM_J}, {Dense, Dense}); + Tensor D("D", {NUM_L, NUM_J}, {Dense, Dense}); + + srand(549694); + for (int i = 0; i < NUM_I; i++) { + for (int k = 0; k < NUM_K; k++) { + for (int l = 0; l < NUM_L; l++) { + float rand_float = (float) rand() / (float) (RAND_MAX); + if (rand_float < SPARSITY) { + B.insert({i, k, l}, (double) ((int) (rand_float * 3 / SPARSITY))); + } + } + } + } + + for (int k = 0; k < NUM_K; k++) { + for (int j = 0; j < NUM_J; j++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + C.insert({k, j}, (double) ((int) (rand_float*3))); + } + } + + for (int l = 0; l < NUM_L; l++) { + for (int j = 0; j < NUM_J; j++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + D.insert({l, j}, (double) ((int) (rand_float*3))); + } + } + + B.pack(); + C.pack(); + D.pack(); + + A(i,j) = B(i,k,l) * C(k,j) * D(l,j); + + IndexStmt stmt = A.getAssignment().concretize(); + stmt = scheduleMTTKRPCPU(stmt, B); + //printToFile("mttkrp_cpu", stmt); + + A.compile(stmt); + A.assemble(); + A.compute(); + + Tensor expected("expected", {NUM_I, NUM_J}, {Dense, Dense}); + expected(i,j) = B(i,k,l) * C(k,j) * D(l,j); + expected.compile(); + expected.assemble(); + expected.compute(); + ASSERT_TENSOR_EQ(expected, A); +} + +TEST(scheduling_eval, temp) { + if (should_use_CUDA_codegen() || should_use_ISPC_codegen()) { + return; + } + std::default_random_engine gen(0); + std::uniform_real_distribution unif(0.0, 1.0); + // Predeclare the storage formats that the inputs and output will be stored as. + // To define a format, you must specify whether each dimension is dense or sparse + // and (optionally) the order in which dimensions should be stored. The formats + // declared below correspond to doubly compressed sparse row (dcsr), row-major + // dense (rm), and column-major dense (dm). + Format dcsr({Sparse,Sparse}); + Format rm({Dense,Dense}); + Format cm({Dense,Dense}, {1,0}); + + // Load a sparse matrix from file (stored in the Matrix Market format) and + // store it as a doubly compressed sparse row matrix. Matrices correspond to + // order-2 tensors in taco. The matrix in this example can be download from: + // https://www.cise.ufl.edu/research/sparse/MM/Williams/webbase-1M.tar.gz + Tensor B = read("/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx", dcsr); + // Generate a random dense matrix and store it in row-major (dense) format. + Tensor C({B.getDimension(0), 1000}, rm); + for (int i = 0; i < C.getDimension(0); ++i) { + for (int j = 0; j < C.getDimension(1); ++j) { + C.insert({i,j}, unif(gen)); + } + } + C.pack(); + + // Generate another random dense matrix and store it in column-major format. + Tensor D({1000, B.getDimension(1)}, cm); + for (int i = 0; i < D.getDimension(0); ++i) { + for (int j = 0; j < D.getDimension(1); ++j) { + D.insert({i,j}, unif(gen)); + } + } + D.pack(); + + // Declare the output matrix to be a sparse matrix with the same dimensions as + // input matrix B, to be also stored as a doubly compressed sparse row matrix. + Tensor A(B.getDimensions(), dcsr); + + // Define the SDDMM computation using index notation. + IndexVar i, j, k; + A(i,j) = B(i,j) * C(i,k) * D(k,j); + + // At this point, we have defined how entries in the output matrix should be + // computed from entries in the input matrices but have not actually performed + // the computation yet. To do so, we must first tell taco to generate code that + // can be executed to compute the SDDMM operation. + A.compile(); + // We can now call the functions taco generated to assemble the indices of the + // output matrix and then actually compute the SDDMM. + A.assemble(); + A.compute(); + // Write the output of the computation to file (stored in the Matrix Market format). + write("A.mtx", A); +} + +TEST(scheduling_eval, mttkrpISPC) { + if (should_use_CUDA_codegen()) { + return; + } + set_ISPC_codegen_enabled(false); + set_CUDA_codegen_enabled(false); + int NUM_I = 10000; // 1021/20; + int NUM_J = 256; + int NUM_K = 1057/20; + int NUM_L = 1232/20; + float SPARSITY = .1; + Tensor B("B", {NUM_I, NUM_K, NUM_L}, {Dense, Sparse, Sparse}); + Tensor C("C", {NUM_K, NUM_J}, {Dense, Dense}); + Tensor D("D", {NUM_L, NUM_J}, {Dense, Dense}); + + srand(549694); + for (int i = 0; i < NUM_I; i++) { + for (int k = 0; k < NUM_K; k++) { + for (int l = 0; l < NUM_L; l++) { + float rand_float = (float) rand() / (float) (RAND_MAX); + if (rand_float < SPARSITY) { + B.insert({i, k, l}, (double) ((int) (rand_float * 3 / SPARSITY))); + } + } + } + } + + for (int k = 0; k < NUM_K; k++) { + for (int j = 0; j < NUM_J; j++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + C.insert({k, j}, (double) ((int) (rand_float*3))); + } + } + + for (int l = 0; l < NUM_L; l++) { + for (int j = 0; j < NUM_J; j++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + D.insert({l, j}, (double) ((int) (rand_float*3))); + } + } + + B.pack(); + C.pack(); + D.pack(); - Tensor expected("expected", {NUM_I, NUM_J, NUM_L}, {Dense, Dense, Dense}); - expected(i,j,l) = B(i,j,k) * C(k,l); - expected.compile(); - expected.assemble(); - expected.compute(); - ASSERT_TENSOR_EQ(expected, A); + set_ISPC_codegen_enabled(true); + + Tensor A1("A1", {NUM_I, NUM_J}, {Dense, Dense}); + A1(i,j) = B(i,k,l) * C(k,j) * D(l,j); + IndexStmt stmt1 = A1.getAssignment().concretize(); + stmt1 = scheduleMTTKRPISPC(stmt1, B); + // printToFile("mttkrp1_cpu_ispc", stmt1); + A1.compile(stmt1); + A1.assemble(); + A1.compute(); + + set_ISPC_codegen_enabled(false); + Tensor expected1("expected1", {NUM_I, NUM_J}, {Dense, Dense}); + expected1(i,j) = B(i,k,l) * C(k,j) * D(l,j); + IndexStmt taco_stmt1 = expected1.getAssignment().concretize(); + taco_stmt1 = scheduleMTTKRPCPU(taco_stmt1, B); + expected1.compile(taco_stmt1); + expected1.assemble(); + expected1.compute(); + ASSERT_TENSOR_EQ(expected1, A1); + + set_ISPC_codegen_enabled(true); + Tensor A2("A2", {NUM_I, NUM_J}, {Dense, Dense}); + A2(i,j) = B(i,k,l) * C(k,j) * D(l,j); + IndexStmt stmt2 = A1.getAssignment().concretize(); + stmt2 = scheduleMTTKRPPrecomputedISPC_ST(stmt2, B); + // printToFile("mttkrp_cpu_ispc", stmt); + A2.compile(stmt2); + A2.assemble(); + A2.compute(); + ASSERT_TENSOR_EQ(expected1, A2); + + set_ISPC_codegen_enabled(false); + Tensor expected2("expected2", {NUM_I, NUM_J}, {Dense, Dense}); + expected2(i,j) = B(i,k,l) * C(k,j) * D(l,j); + IndexStmt taco_stmt2 = expected2.getAssignment().concretize(); + taco_stmt2 = scheduleMTTKRPPrecomputedCPU_ST(taco_stmt2, B); + expected2.compile(taco_stmt2); + expected2.assemble(); + expected2.compute(); + ASSERT_TENSOR_EQ(expected1, expected2); + + taco::util::TimeResults timevalue; + bool time = true; + + for (int i=0; i<3; i++) { + TOOL_BENCHMARK_TIMER(expected1.compute(), "Compute TACO1: ", timevalue); + TOOL_BENCHMARK_TIMER(A1.compute(), "Compute ISPC1: ", timevalue); + TOOL_BENCHMARK_TIMER(expected2.compute(), "Compute TACO2: ", timevalue); + TOOL_BENCHMARK_TIMER(A2.compute(), "Compute ISPC2: ", timevalue); + } } -TEST(scheduling_eval, mttkrpCPU) { + +TEST(scheduling_eval, mttkrp4ISPC) { if (should_use_CUDA_codegen()) { return; } - int NUM_I = 1021/20; - int NUM_J = 1039/20; + set_ISPC_codegen_enabled(false); + set_CUDA_codegen_enabled(false); + int NUM_I = 1000; // 1021/20; + int NUM_J = 16; int NUM_K = 1057/20; int NUM_L = 1232/20; + int NUM_M = 1124/20; float SPARSITY = .1; - Tensor A("A", {NUM_I, NUM_J}, {Dense, Dense}); - Tensor B("B", {NUM_I, NUM_K, NUM_L}, {Dense, Sparse, Sparse}); + Tensor B("B", {NUM_I, NUM_K, NUM_L, NUM_M}, {Dense, Sparse, Sparse, Sparse}); Tensor C("C", {NUM_K, NUM_J}, {Dense, Dense}); Tensor D("D", {NUM_L, NUM_J}, {Dense, Dense}); + Tensor E("E", {NUM_M, NUM_J}, {Dense, Dense}); srand(549694); for (int i = 0; i < NUM_I; i++) { for (int k = 0; k < NUM_K; k++) { for (int l = 0; l < NUM_L; l++) { - float rand_float = (float) rand() / (float) (RAND_MAX); - if (rand_float < SPARSITY) { - B.insert({i, k, l}, (double) ((int) (rand_float * 3 / SPARSITY))); + for (int m = 0; m < NUM_M; m++) { + float rand_float = (float) rand() / (float) (RAND_MAX); + if (rand_float < SPARSITY) { + B.insert({i, k, l, m}, (double) ((int) (rand_float * 3 / SPARSITY))); + } } } } @@ -1062,27 +2199,83 @@ TEST(scheduling_eval, mttkrpCPU) { } } + for (int m = 0; m < NUM_M; m++) { + for (int j = 0; j < NUM_J; j++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + E.insert({m, j}, (double) ((int) (rand_float*3))); + } + } + B.pack(); C.pack(); D.pack(); + E.pack(); + + set_ISPC_codegen_enabled(true); + Tensor A1("A1", {NUM_I, NUM_J}, {Dense, Dense}); + A1(i,j) = B(i,k,l,m) * C(k,j) * D(l,j) * E(m,j); + IndexStmt stmt1 = A1.getAssignment().concretize(); + stmt1 = scheduleMTTKRP4ISPC_ST(stmt1, B); + // printToFile("mttkrp1_cpu_ispc", stmt1); + A1.compile(stmt1); + A1.assemble(); + A1.compute(); + + set_ISPC_codegen_enabled(false); + Tensor expected1("expected1", {NUM_I, NUM_J}, {Dense, Dense}); + expected1(i,j) = B(i,k,l,m) * C(k,j) * D(l,j) * E(m,j); + IndexStmt taco_stmt1 = expected1.getAssignment().concretize(); + taco_stmt1 = scheduleMTTKRP4CPU_ST(taco_stmt1, B); + expected1.compile(taco_stmt1); + expected1.assemble(); + expected1.compute(); + ASSERT_TENSOR_EQ(expected1, A1); + + // set_ISPC_codegen_enabled(true); + // Tensor A2("A2", {NUM_I, NUM_J}, {Dense, Dense}); + // A2(i,j) = B(i,k,l) * C(k,j) * D(l,j); + // IndexStmt stmt2 = A1.getAssignment().concretize(); + // stmt2 = scheduleMTTKRPPrecomputedISPC_ST(stmt2, B); + // // printToFile("mttkrp_cpu_ispc", stmt); + // A2.compile(stmt2); + // A2.assemble(); + // A2.compute(); + // ASSERT_TENSOR_EQ(expected1, A2); + + set_ISPC_codegen_enabled(false); + Tensor expected2("expected2", {NUM_I, NUM_J}, {Dense, Dense}); + expected2(i,j) = B(i,k,l,m) * C(k,j) * D(l,j) * E(m,j); + + IndexExpr BE = B(i,k,l,m) * E(m,j); + IndexExpr BDE = BE * D(l, j); + expected2(i,j) = BDE * C(k,j); + IndexStmt taco_stmt2 = expected2.getAssignment().concretize(); + TensorVar BE_workspace("BE_workspace", Type(Float64, {Dimension(j)}), taco::dense); + TensorVar BDE_workspace("BDE_workspace", Type(Float64, {Dimension(j)}), taco::dense); + + IndexStmt precomputed_stmt = forall(i, forall(k, + where(forall(j, expected2(i,j) += BDE_workspace(j) * C(k,j)), + forall(l, where(forall(j, BDE_workspace(j) += BE_workspace(j) * D(l,j)), + forall(m, forall(j, BE_workspace(j) += B(i,k,l,m) * E(m,j)))))))); + + // IndexStmt scheduled2 = scheduleMTTKRPPrecomputedCPU(precomputed_stmt, B, 64); + // expected2.compile(scheduled2); + // expected2.assemble(); + // expected2.compute(); + // ASSERT_TENSOR_EQ(expected1, expected2); + + taco::util::TimeResults timevalue; + bool time = true; + + for (int i=0; i<3; i++) { + TOOL_BENCHMARK_TIMER(expected1.compute(), "Compute TACO1: ", timevalue); + TOOL_BENCHMARK_TIMER(A1.compute(), "Compute ISPC1: ", timevalue); + // TOOL_BENCHMARK_TIMER(expected2.compute(), "Compute TACO2: ", timevalue); + // TOOL_BENCHMARK_TIMER(A2.compute(), "Compute ISPC2: ", timevalue); + } +} - A(i,j) = B(i,k,l) * C(k,j) * D(l,j); - - IndexStmt stmt = A.getAssignment().concretize(); - stmt = scheduleMTTKRPCPU(stmt, B); - //printToFile("mttkrp_cpu", stmt); - - A.compile(stmt); - A.assemble(); - A.compute(); - Tensor expected("expected", {NUM_I, NUM_J}, {Dense, Dense}); - expected(i,j) = B(i,k,l) * C(k,j) * D(l,j); - expected.compile(); - expected.assemble(); - expected.compute(); - ASSERT_TENSOR_EQ(expected, A); -} TEST(scheduling_eval, spmvGPU) { if (!should_use_CUDA_codegen()) { @@ -1463,7 +2656,336 @@ TEST(scheduling_eval, mttkrpGPU) { ASSERT_TENSOR_EQ(expected, A); } -TEST(generate_evaluation_files, DISABLED_cpu) { +TEST(generate_evaluation_files, ispc) { + std::cout << "Hi Adhitha!\n" << std::endl ; + set_CUDA_codegen_enabled(false); + set_ISPC_codegen_enabled(true); + + vector> spmv_parameters = {{32}}; + vector> spmspv_parameters = {{8}}; + + // 4 to 512 and 4, 8, 16 + vector> spmm_dcsr_parameters = {{16, 8}}; + vector> spmm_parameters = {{16,4}}; + + vector> mttkrp_parameters = {}; + mttkrp_parameters.push_back({64,0}); + + vector> sddmm_parameters = {{8, 8}}; + vector> ttv_parameters = {{32}}; + + int NUM_I = 100; + int NUM_J = 100; + int NUM_K = 100; + int NUM_L = 100; + + string c_file_ending = ".h"; + string file_ending = ".ispc"; + string file_path = "eval_prepared_ispc/"; + mkdir(file_path.c_str(), 0777); + + // spmv + { + stringstream source1; + stringstream source2; + std::shared_ptr codegen = ir::CodeGen::init_default(source1, source2, ir::CodeGen::ImplementationGen); + Tensor A("A", {NUM_I, NUM_J}, CSR); + Tensor x("x", {NUM_J}, {Dense}); + Tensor y("y", {NUM_I}, {Dense}); + y(i) = A(i, j) * x(j); + std::cout << "concretizing the assignment statement\n"; + IndexStmt stmt = y.getAssignment().concretize(); + std::cout << "Printing the original IndexStmt: " << stmt << std::endl; + + for (auto paramSet : spmv_parameters) { + std::cout << "param set: " << paramSet[0] << std::endl; + IndexStmt scheduled = scheduleSpMVISPC(stmt, paramSet[0]); + std::cout << "scheduled IndexStmt: " << scheduled << std::endl; + ir::Stmt compute = lower(scheduled, string("compute_") + util::join(paramSet, "_"), false, true); + std::cout << "computed statement: \n" << compute << std::endl; + codegen->compile(compute, false); + } + ofstream source_file; + source_file.open(file_path + "spmv_csr_ispc_taco" + c_file_ending); + source_file << source1.str(); + source_file.close(); + + ofstream ispc_source_file; + ispc_source_file.open(file_path + "__spmv_csr_ispc_taco" + file_ending); + ispc_source_file << source2.str(); + ispc_source_file.close(); + + } + + // spmm + { + stringstream source1; + stringstream source2; + std::shared_ptr codegen = ir::CodeGen::init_default(source1, source2, ir::CodeGen::ImplementationGen); + Tensor A("A", {NUM_I, NUM_J}, CSR); + Tensor X("X", {NUM_J, NUM_K}, {Dense, Dense}); + Tensor Y("Y", {NUM_I, NUM_K}, {Dense, Dense}); + Y(i, k) = A(i, j) * X(j, k); + IndexStmt stmt = Y.getAssignment().concretize(); + bool isFirst = true; + for (auto paramSet : spmm_parameters) { + IndexStmt scheduled = scheduleSpMMISPC1(stmt, A, paramSet[0], paramSet[1]); + ir::Stmt compute = lower(scheduled, string("compute1_") + util::join(paramSet, "_"), false, true); + codegen->compile(compute, isFirst); + isFirst = false; + } + ofstream source_file; + source_file.open(file_path + "spmm_csr_ispc_taco1" + c_file_ending); + source_file << source1.str(); + source_file.close(); + + ofstream ispc_source_file; + ispc_source_file.open(file_path + "__spmm_csr_ispc_taco1" + file_ending); + ispc_source_file << source2.str(); + ispc_source_file.close(); + } + + // spmm omp + { + stringstream source1; + stringstream source2; + std::shared_ptr codegen = ir::CodeGen::init_default(source1, source2, ir::CodeGen::ImplementationGen); + Tensor A("A", {NUM_I, NUM_J}, CSR); + Tensor X("X", {NUM_J, NUM_K}, {Dense, Dense}); + Tensor Y("Y", {NUM_I, NUM_K}, {Dense, Dense}); + Y(i, k) = A(i, j) * X(j, k); + IndexStmt stmt = Y.getAssignment().concretize(); + bool isFirst = true; + for (auto paramSet : spmm_parameters) { + IndexStmt scheduled = scheduleSpMMISPCOMP1(stmt, A, paramSet[0], paramSet[1]); + ir::Stmt compute = lower(scheduled, string("compute1_") + util::join(paramSet, "_"), false, true); + codegen->compile(compute, isFirst); + isFirst = false; + } + ofstream source_file; + source_file.open(file_path + "spmm_omp_ispc_taco1" + c_file_ending); + source_file << source1.str(); + source_file.close(); + + ofstream ispc_source_file; + ispc_source_file.open(file_path + "__spmm_omp_ispc_taco1" + file_ending); + ispc_source_file << source2.str(); + ispc_source_file.close(); + } + + // spmm2 + { + stringstream source1; + stringstream source2; + std::shared_ptr codegen = ir::CodeGen::init_default(source1, source2, ir::CodeGen::ImplementationGen); + Tensor A("A", {NUM_I, NUM_J}, CSR); + Tensor X("X", {NUM_J, NUM_K}, {Dense, Dense}); + Tensor Y("Y", {NUM_I, NUM_K}, {Dense, Dense}); + Y(i, k) = A(i, j) * X(j, k); + IndexStmt stmt = Y.getAssignment().concretize(); + bool isFirst = true; + for (auto paramSet : spmm_parameters) { + IndexStmt scheduled = scheduleSpMMISPC2(stmt, A, paramSet[0], paramSet[1]); + ir::Stmt compute = lower(scheduled, string("compute2_") + util::join(paramSet, "_"), false, true); + codegen->compile(compute, isFirst); + isFirst = false; + } + ofstream source_file; + source_file.open(file_path + "spmm_csr_ispc_taco2" + c_file_ending); + source_file << source1.str(); + source_file.close(); + + ofstream ispc_source_file; + ispc_source_file.open(file_path + "__spmm_csr_ispc_taco2" + file_ending); + ispc_source_file << source2.str(); + ispc_source_file.close(); + } + + // spmm + { + stringstream source1; + stringstream source2; + std::shared_ptr codegen = ir::CodeGen::init_default(source1, source2, ir::CodeGen::ImplementationGen); + Tensor A("A", {NUM_I, NUM_J}, CSR); + Tensor X("X", {NUM_J, NUM_K}, {Dense, Dense}); + Tensor Y("Y", {NUM_I, NUM_K}, {Dense, Dense}); + Y(i, k) = A(i, j) * X(j, k); + IndexStmt stmt = Y.getAssignment().concretize(); + bool isFirst = true; + for (auto paramSet : spmm_parameters) { + IndexStmt scheduled = scheduleSpMMISPC3(stmt, A, paramSet[0], paramSet[1]); + ir::Stmt compute = lower(scheduled, string("compute3_") + util::join(paramSet, "_"), false, true); + codegen->compile(compute, isFirst); + isFirst = false; + } + ofstream source_file; + source_file.open(file_path + "spmm_csr_ispc_taco3" + c_file_ending); + source_file << source1.str(); + source_file.close(); + + ofstream ispc_source_file; + ispc_source_file.open(file_path + "__spmm_csr_ispc_taco3" + file_ending); + ispc_source_file << source2.str(); + ispc_source_file.close(); + } + + // ttv + { + stringstream source; + stringstream source2; + std::shared_ptr codegen = ir::CodeGen::init_default(source, source2, ir::CodeGen::ImplementationGen); + Tensor A("A", {NUM_I, NUM_J}, {Dense, Dense}); // TODO: change to sparse outputs + Tensor B("B", {NUM_I, NUM_J, NUM_K}, {Sparse, Sparse, Sparse}); + Tensor c("c", {NUM_K}, Format({Dense})); + A(i,j) = B(i,j,k) * c(k); + IndexStmt stmt = A.getAssignment().concretize(); + bool isFirst = true; + for (auto paramSet : ttv_parameters) { + IndexStmt scheduled = scheduleTTVCPU(stmt, B, paramSet[0]); + ir::Stmt compute = lower(scheduled, string("compute_") + util::join(paramSet, "_"), false, true); + codegen->compile(compute, isFirst); + isFirst = false; + } + ofstream source_file; + source_file.open(file_path + "ttv_cpu" + c_file_ending); + source_file << source.str(); + source_file.close(); + + ofstream ispc_source_file; + ispc_source_file.open(file_path + "__ttv_cpu" + file_ending); + ispc_source_file << source2.str(); + ispc_source_file.close(); + } + + + // mttkrp3 + { + stringstream source; + stringstream source2; + std::shared_ptr codegen = ir::CodeGen::init_default(source, source2, ir::CodeGen::ImplementationGen); + Tensor A("A", {NUM_I, NUM_J}, {Dense, Dense}); + Tensor B("B", {NUM_I, NUM_K, NUM_L}, {Dense, Sparse, Sparse}); + Tensor C("C", {NUM_K, NUM_J}, {Dense, Dense}); + Tensor D("D", {NUM_L, NUM_J}, {Dense, Dense}); + A(i,j) = B(i,k,l) * C(k,j) * D(l,j); + IndexStmt stmt = A.getAssignment().concretize(); + bool isFirst = true; + for (auto paramSet : mttkrp_parameters) { + IndexStmt scheduled = scheduleMTTKRPCPU(stmt, B, paramSet[0], paramSet[1]); + ir::Stmt compute = lower(scheduled, string("compute_") + util::join(paramSet, "_"), false, true); + codegen->compile(compute, isFirst); + isFirst = false; + } + ofstream source_file; + source_file.open(file_path + "mttkrp3_cpu" + c_file_ending); + source_file << source.str(); + source_file.close(); + + ofstream ispc_source_file; + ispc_source_file.open(file_path + "__mttkrp3_cpu" + file_ending); + ispc_source_file << source2.str(); + ispc_source_file.close(); + } + + + return; +} + + + +TEST(generate_ispc_sddmm_evaluation_files, ispc) { + std::cout << "Hi Adhitha!\n" << std::endl ; + set_CUDA_codegen_enabled(false); + set_ISPC_codegen_enabled(true); + + vector> spmv_parameters = {{32}}; + vector> spmspv_parameters = {{8}}; + + // 4 to 512 and 4, 8, 16 + vector> spmm_dcsr_parameters = {{16, 8}}; + vector> spmm_parameters = {{16,4}}; + + vector> mttkrp_parameters = {}; + mttkrp_parameters.push_back({64,0}); + + vector> sddmm_parameters = {{8, 8}}; + vector> ttv_parameters = {{32}}; + + int NUM_I = 100; + int NUM_J = 100; + int NUM_K = 100; + + string c_file_ending = ".h"; + string file_ending = ".ispc"; + string file_path = "eval_prepared_ispc/sddmm/"; + mkdir(file_path.c_str(), 0777); + + // sddmm + { + stringstream source1; + stringstream source2; + std::shared_ptr codegen = ir::CodeGen::init_default(source1, source2, ir::CodeGen::ImplementationGen); + Tensor A("A", {NUM_I, NUM_K}, {Dense, Dense}); + Tensor B("B", {NUM_I, NUM_K}, CSR); + Tensor C("C", {NUM_I, NUM_J}, {Dense, Dense}); + Tensor D("D", {NUM_J, NUM_K}, {Dense, Dense}); + A(i,k) = B(i,k) * C(i,j) * D(j,k); + IndexStmt stmt = A.getAssignment().concretize(); + bool isFirst = true; + for (auto paramSet : sddmm_parameters) { + IndexStmt scheduled = scheduleSDDMMISPC1(stmt, B, paramSet[0], paramSet[1]); + ir::Stmt compute = lower(scheduled, string("compute1_") + util::join(paramSet, "_"), false, true); + codegen->compile(compute, isFirst); + isFirst = false; + } + ofstream source_file; + source_file.open(file_path + "sddmm_cpu_ispc_taco1" + file_ending); + source_file << source1.str(); + source_file.close(); + + ofstream ispc_source_file; + ispc_source_file.open(file_path + "__sddmm_cpu_ispc_taco1" + file_ending); + ispc_source_file << source2.str(); + ispc_source_file.close(); + } + + + // sddmm + { + stringstream source1; + stringstream source2; + std::shared_ptr codegen = ir::CodeGen::init_default(source1, source2, ir::CodeGen::ImplementationGen); + Tensor Y("Y", {NUM_I, NUM_K}, {Dense, Dense}); + Tensor A("A", {NUM_I, NUM_K}, CSR); + Tensor X("X", {NUM_I, NUM_J}, {Dense, Dense}); + Y(i,j) = A(i,j) * X(i,k) * X(j,k); + IndexStmt stmt = Y.getAssignment().concretize(); + bool isFirst = true; + for (auto paramSet : sddmm_parameters) { + IndexStmt scheduled = scheduleSDDMMISPC2(stmt, A, paramSet[0], paramSet[1]); + ir::Stmt compute = lower(scheduled, string("compute2_") + util::join(paramSet, "_"), false, true); + codegen->compile(compute, isFirst); + isFirst = false; + } + ofstream source_file; + source_file.open(file_path + "sddmm_cpu_ispc_taco2" + file_ending); + source_file << source1.str(); + source_file.close(); + + ofstream ispc_source_file; + ispc_source_file.open(file_path + "__sddmm_cpu_ispc_taco2" + file_ending); + ispc_source_file << source2.str(); + ispc_source_file.close(); + } + + + return; +} + + + + +TEST(generate_evaluation_files, cpu) { if (should_use_CUDA_codegen()) { return; } @@ -1779,10 +3301,63 @@ TEST(generate_evaluation_files, DISABLED_cpu) { } } -TEST(generate_evaluation_files, DISABLED_gpu) { - if (!should_use_CUDA_codegen()) { - return; +TEST(generate_evaluation_files, spmv_ispc) { + set_CUDA_codegen_enabled(false); + set_ISPC_codegen_enabled(true); + + std::cout << "executing generate_evaluation_file.ispc\n"; + + int NUM_I = 100; + int NUM_J = 100; + + vector> spmv_parameters = {}; // {NNZ_PER_THREAD, BLOCK_SIZE} + for (int i = 3; i <= 20; i++) { + spmv_parameters.push_back({i, 512}); + } + + string file_ending_c = ".c"; + string file_ending_ispc = ".ispc"; + string file_path = "eval_prepared_ispc/spmv/"; + mkdir(file_path.c_str(), 0777); + + // spmv + { + stringstream source1; + stringstream source2; + std::shared_ptr codegen = ir::CodeGen::init_default(source1, source2, ir::CodeGen::ImplementationGen); + Tensor A("A", {NUM_I, NUM_J}, CSR); + Tensor x("x", {NUM_J}, Format({Dense})); + Tensor y("y", {NUM_I}, Format({Dense})); + IndexExpr precomputed = A(i, j) * x(j); + y(i) = precomputed; + IndexStmt stmt = y.getAssignment().concretize(); + bool isFirst = true; + for (auto paramSet : spmv_parameters) { + IndexStmt scheduled = scheduleSpMVCPU(stmt); + ir::Stmt compute = lower(scheduled, string("compute_") + util::join(paramSet, "_"), false, true); + codegen->compile(compute, isFirst); + isFirst = false; + } + ofstream source_file1; + source_file1.open(file_path + "spmv_ispc" + file_ending_c); + source_file1 << source1.str(); + source_file1.close(); + + ofstream source_file2; + source_file2.open(file_path + "__spmv_ispc" + file_ending_ispc); + source_file2 << source2.str(); + source_file2.close(); } +} + +TEST(generate_evaluation_files, gpu) { + // if (!should_use_CUDA_codegen()) { + // return; + // } + set_CUDA_codegen_enabled(true); + set_ISPC_codegen_enabled(false); + + std::cout << "executing generate_evaluation_file.gpu\n"; vector> spmv_parameters = {}; // {NNZ_PER_THREAD, BLOCK_SIZE} for (int i = 3; i <= 20; i++) { diff --git a/test/tests-transformation.cpp b/test/tests-transformation.cpp index abfec3d45..9a472906f 100644 --- a/test/tests-transformation.cpp +++ b/test/tests-transformation.cpp @@ -255,6 +255,8 @@ INSTANTIATE_TEST_CASE_P(parallelize, apply, struct reorderLoopsTopologically : public TestWithParam {}; + +// TEST_P(reorderLoopsTopologically, test) { IndexStmt actual = taco::reorderLoopsTopologically(GetParam().actual); ASSERT_NOTATION_EQ(GetParam().expected, actual); diff --git a/tools/taco.cpp b/tools/taco.cpp index cd351a203..7384874ec 100644 --- a/tools/taco.cpp +++ b/tools/taco.cpp @@ -9,6 +9,7 @@ #include "taco.h" #include "taco/error.h" +#include "taco/index_notation/index_notation.h" #include "taco/parser/lexer.h" #include "taco/parser/parser.h" #include "taco/parser/schedule_parser.h" @@ -20,6 +21,7 @@ #include "taco/lower/lower.h" #include "taco/codegen/module.h" #include "codegen/codegen_c.h" +#include "codegen/codegen_ispc.h" #include "codegen/codegen_cuda.h" #include "codegen/codegen.h" #include "taco/util/strings.h" @@ -188,6 +190,8 @@ static void printUsageInfo() { cout << endl; printFlag("print-nocolor", "Print without colors."); cout << endl; + printFlag("ispc", "Generate ISPC code for Intel CPUs"); + cout << endl; printFlag("cuda", "Generate CUDA code for NVIDIA GPUs"); cout << endl; printFlag("schedule", "Specify parallel execution schedule"); @@ -262,7 +266,7 @@ static void printSchedulingHelp() { "an output race strategy `strat`. Since the other transformations " "expect serial code, parallelize must come last in a series of " "transformations. Possible parallel hardware units are: " - "NotParallel, GPUBlock, GPUWarp, GPUThread, CPUThread, CPUVector. " + "NotParallel, GPUBlock, GPUWarp, GPUThread, CPUThread, CPUVector, CPUSimd, CPUSimd. " "Possible output race strategies are: " "IgnoreRaces, NoRaces, Atomics, Temporary, ParallelReduction."); } @@ -279,6 +283,8 @@ static void printVersionInfo() { cout << "Built with Python support." << endl; if(TACO_FEATURE_CUDA) cout << "Built with CUDA support." << endl; + if(TACO_FEATURE_ISPC) + cout << "Built with ISPC support." << endl; cout << endl; cout << "Built on: " << TACO_BUILD_DATE << endl; cout << "CMake build type: " << TACO_BUILD_TYPE << endl; @@ -308,7 +314,10 @@ static void printCommandLine(ostream& os, int argc, char* argv[]) { } } -static bool setSchedulingCommands(vector> scheduleCommands, parser::Parser& parser, IndexStmt& stmt) { +static int setSchedulingCommands(vector> scheduleCommands, + parser::Parser& parser, IndexStmt& stmt, Assignment assignment) { + + std::cout << "setting scheduling commands\n"; auto findVar = [&stmt](string name) { ProvenanceGraph graph(stmt); for (auto v : graph.getAllIndexVars()) { @@ -321,9 +330,15 @@ static bool setSchedulingCommands(vector> scheduleCommands, parse abort(); // to silence a warning: control reaches end of non-void function }; - bool isGPU = false; + int isGPU = 0; + int isISPC = 0; for(vector scheduleCommand : scheduleCommands) { + std::cout << "running schedluing command: "; + for (auto &command : scheduleCommand) { + std::cout << command << " "; + } + std::cout << std::endl; string command = scheduleCommand[0]; scheduleCommand.erase(scheduleCommand.begin()); @@ -352,6 +367,16 @@ static bool setSchedulingCommands(vector> scheduleCommands, parse IndexVar fused(f); stmt = stmt.fuse(findVar(i), findVar(j), fused); + } else if (command == "loopfuse") { + taco_uassert(scheduleCommand.size() == 2) + << "'loopfuse' scheduling directive takes 2 parameters: fuse(b, 2)"; + std::string side = scheduleCommand[0]; + taco_uassert(side == "b" || side == "f") + << "first parameter must be either 'f' or 'b'"; + + int iters = std::stoi(scheduleCommand[1]); + + stmt = loopFusionOverFission(stmt, assignment, side, iters); } else if (command == "split") { taco_uassert(scheduleCommand.size() == 4) << "'split' scheduling directive takes 4 parameters: split(i, i1, i2, splitFactor)"; @@ -536,7 +561,15 @@ static bool setSchedulingCommands(vector> scheduleCommands, parse parallel_unit = ParallelUnit::CPUThread; } else if (unit == "CPUVector") { parallel_unit = ParallelUnit::CPUVector; - } else { + } else if (unit == "CPUSimd") { + isISPC = true; + parallel_unit = ParallelUnit::CPUSimd; + } + else if (unit == "CPUSpmd") { + parallel_unit = ParallelUnit::CPUSpmd; + isISPC = true; + } + else { taco_uerror << "Parallel hardware not defined."; goto end; } @@ -557,6 +590,8 @@ static bool setSchedulingCommands(vector> scheduleCommands, parse goto end; } + std::cout << "stmt before parallelizing the statement: " << stmt << endl; + std::cout << "ParallelUnit: " << ParallelUnit_NAMES[(int) parallel_unit] << ", outputRaceStrategy: " << OutputRaceStrategy_NAMES[(int) output_race_strategy] << std::endl; stmt = stmt.parallelize(findVar(i), parallel_unit, output_race_strategy); } else if (command == "assemble") { @@ -612,7 +647,13 @@ static bool setSchedulingCommands(vector> scheduleCommands, parse end:; } - return isGPU; + if (isGPU) { + return 1; + } + else if (isISPC) { + return 2; + } + return 0; } int main(int argc, char* argv[]) { @@ -641,6 +682,7 @@ int main(int argc, char* argv[]) { bool color = true; bool readKernels = false; bool cuda = false; + bool ispc = false; bool setSchedule = false; @@ -949,6 +991,10 @@ int main(int argc, char* argv[]) { else if ("-cuda" == argName) { cuda = true; } + else if ("-ispc" == argName) { + std::cout << "ispc true\n"; + ispc = true; + } else if ("-schedule" == argName) { vector descriptor = util::split(argValue, ","); if (descriptor.size() > 2 || descriptor.empty()) { @@ -1001,6 +1047,8 @@ int main(int argc, char* argv[]) { } } + std::cout << "cuda: " << cuda << ", ispc: " << ispc << std::endl; + // Print compute is the default if nothing else was asked for if (!printAssemble && !printEvaluate && !printIterationGraph && !writeCompute && !writeAssemble && !writeKernels && !readKernels && @@ -1009,9 +1057,11 @@ int main(int argc, char* argv[]) { } // pre-parse expression, to determine existence and order of loaded tensors + std::cout << "pre-parse expression, to determine existence and order of loaded tensors\n"; map loadedTensors; TensorBase temp_tensor; parser::Parser temp_parser(exprStr, formats, dataTypes, tensorsDimensions, loadedTensors, 42); + std::cout << exprStr << std::endl; try { temp_parser.parse(); temp_tensor = temp_parser.getResultTensor(); @@ -1112,33 +1162,61 @@ int main(int argc, char* argv[]) { taco_set_parallel_schedule(sched, chunkSize); taco_set_num_threads(nthreads); - IndexStmt stmt = - makeConcreteNotation(makeReductionNotation(tensor.getAssignment())); + Assignment assignment = tensor.getAssignment(); + std::cout << "tensor.getAssignment(): " << assignment << std::endl; + + IndexStmt stmt2 = makeReductionNotation(tensor.getAssignment()); + std::cout << "reducedNotation: " << stmt2 << std::endl; + // IndexStmt stmt = + // makeConcreteNotation(makeReductionNotation(tensor.getAssignment())); + IndexStmt stmt = makeConcreteNotation(stmt2); + std::cout << "concrete index statement: " << stmt << std::endl; stmt = reorderLoopsTopologically(stmt); + std::cout << "topologically reordered loops statement: " << stmt << std::endl; + if (setSchedule) { - cuda |= setSchedulingCommands(scheduleCommands, parser, stmt); + int val = setSchedulingCommands(scheduleCommands, parser, stmt, tensor.getAssignment()); + // stmt = loopFusionOverFission(stmt, tensor.getAssignment()); + cuda |= (val==1); + ispc |= (val==2); } else { + // stmt = loopFusionOverFission(stmt, tensor.getAssignment()); stmt = insertTemporaries(stmt); stmt = parallelizeOuterLoop(stmt); } + std::cout << "after setting the scheduling commands\n"; + std::cout << stmt << std::endl; if (cuda) { if (!CUDA_BUILT && benchmark) { return reportError("TACO must be built for CUDA (cmake -DCUDA=ON ..) to benchmark", 2); } set_CUDA_codegen_enabled(true); + set_ISPC_codegen_enabled(false); + } + else if (ispc) { + if (!ISPC_BUILT && benchmark) { + return reportError("TACO must be built for ISPC (cmake -DISPC=ON .. to benchmark", 2); + } + set_CUDA_codegen_enabled(false); + set_ISPC_codegen_enabled(true); } else { set_CUDA_codegen_enabled(false); + set_ISPC_codegen_enabled(false); } + std::cout << "running scalar promote\n" << std::endl; // stmt = scalarPromote(stmt); + std::cout << "\nafter scalar promote: \n" << stmt << std::endl << std::endl; + if (printConcrete) { cout << stmt << endl; } + // lower index statement to ir statement Kernel kernel; if (benchmark) { if (time) cout << endl; @@ -1221,9 +1299,15 @@ int main(int argc, char* argv[]) { } } else { + std::cout << "lowering stmt: " << stmt << std::endl; compute = lower(stmt, prefix+"compute", computeWithAssemble, true); assemble = lower(stmt, prefix+"assemble", true, false); evaluate = lower(stmt, prefix+"evaluate", true, true); + + std::cout << "\n\ncompute kernel\n------------\n" << compute << std::endl << std::endl; + // compute kernel is the most basic kernel after lowering phase + + std::cout << "\n\nevaluate kernel\n------------\n" << evaluate << std::endl << std::endl; } string packComment = @@ -1278,6 +1362,7 @@ int main(int argc, char* argv[]) { } bool hasPrinted = false; + std::shared_ptr codegen = ir::CodeGen::init_default(cout, ir::CodeGen::ImplementationGen); codegen->setColor(color); if (printAssemble) { @@ -1298,6 +1383,7 @@ int main(int argc, char* argv[]) { } if (compute.defined()) { + std::cout << "Code generation\n"; codegen->compile(compute, false); } else { @@ -1355,7 +1441,7 @@ int main(int argc, char* argv[]) { } IterationGraph iterationGraph; - if (printIterationGraph) { + if (printIterationGraph) { // print iteration graph iterationGraph = IterationGraph::make(tensor.getAssignment()); }