diff --git a/.gitignore b/.gitignore index 16389f34e..215b56e9a 100644 --- a/.gitignore +++ b/.gitignore @@ -12,3 +12,7 @@ CMakeCache.txt doc apps/tensor_times_vector/tensor_times_vector + +.cache +.vscode +compile_commands.json diff --git a/CMakeLists.txt b/CMakeLists.txt index a6a80d9d1..c9012ca2d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -10,11 +10,13 @@ project(taco LANGUAGES C CXX ) option(CUDA "Build for NVIDIA GPU (CUDA must be preinstalled)" OFF) +option(ISPC "Build for Intel ISPC Compiler (ISPC Compiler must be preinstalled)" OFF) option(PYTHON "Build TACO for python environment" OFF) -option(OPENMP "Build with OpenMP execution support" OFF) +option(OPENMP "Build with OpenMP execution support" ON) option(COVERAGE "Build with code coverage analysis" OFF) set(TACO_FEATURE_CUDA 0) -set(TACO_FEATURE_OPENMP 0) +set(TACO_FEATURE_ISPC 0) +set(TACO_FEATURE_OPENMP 1) set(TACO_FEATURE_PYTHON 0) if(CUDA) message("-- Searching for CUDA Installation") @@ -22,6 +24,11 @@ if(CUDA) add_definitions(-DCUDA_BUILT) set(TACO_FEATURE_CUDA 1) endif(CUDA) +if(ISPC) + message("-- Searching for ISPC Installation") + add_definitions(-DISPC_BUILT) + set(TACO_FEATURE_ISPC 1) +endif(ISPC) if(OPENMP) message("-- Will use OpenMP for parallel execution") add_definitions(-DUSE_OPENMP) @@ -88,6 +95,39 @@ if(OPENMP) set(C_CXX_FLAGS "-fopenmp ${C_CXX_FLAGS}") endif(OPENMP) +set(PAPI_DIR "/home/min/a/kadhitha/workspace/my_taco/papi/src/install/") + +find_path(PAPI_DIR + NAMES include/papi.h +) + +find_library(PAPI_LIBRARIES + # Pick the static library first for easier run-time linking. 
+ NAMES libpapi.a papi + HINTS ${PAPI_DIR}/lib ${HILTIDEPS}/lib +) + +find_path(PAPI_INCLUDE_DIRS + NAMES papi.h + HINTS ${PAPI_DIR}/include ${HILTIDEPS}/include +) + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(PAPI DEFAULT_MSG + PAPI_LIBRARIES + PAPI_INCLUDE_DIRS +) + +mark_as_advanced( + PAPI_PREFIX_DIRS + PAPI_LIBRARIES + PAPI_INCLUDE_DIRS +) + +include_directories(${PAPI_INCLUDE_DIRS}) + +# project (ValgrindExample) + if(COVERAGE) find_program(PATH_TO_GCOVR gcovr REQUIRED) # add coverage tooling to build flags @@ -97,7 +137,8 @@ if(COVERAGE) message("-- Code coverage analysis (gcovr) enabled") endif(COVERAGE) -set(C_CXX_FLAGS "${C_CXX_FLAGS}") +set(C_CXX_FLAGS "${C_CXX_FLAGS} -I/${PAPI_DIR}/include -L/${PAPI_DIR}/lib") +# set(C_CXX_FLAGS "${C_CXX_FLAGS}") set(CMAKE_C_FLAGS "${C_CXX_FLAGS}") set(CMAKE_CXX_FLAGS "${C_CXX_FLAGS} -std=c++14") @@ -110,6 +151,9 @@ set(TACO_INCLUDE_DIR ${TACO_PROJECT_DIR}/include) enable_testing() include_directories(${TACO_INCLUDE_DIR}) +# include_directories("/home/min/a/kadhitha/workspace/my_taco/valgrind") +# project (ValgrindExample) +# include (CTest) set(TACO_LIBRARY_DIR ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}) diff --git a/include/taco/codegen/module.h b/include/taco/codegen/module.h index 36eb34f1a..4db5fcdaf 100644 --- a/include/taco/codegen/module.h +++ b/include/taco/codegen/module.h @@ -17,7 +17,7 @@ class Module { public: /// Create a module for some target Module(Target target=getTargetFromEnvironment()) - : lib_handle(nullptr), moduleFromUserSource(false), target(target) { + : lib_handle(nullptr), so_lib_handle(nullptr), moduleFromUserSource(false), target(target) { setJITLibname(); setJITTmpdir(); } @@ -44,11 +44,16 @@ class Module { /// before calling. If there's no function of this name then a nullptr is /// returned. 
void* getFuncPtr(std::string name); + void* getFuncPtr(std::string& sofile, std::string name); /// Call a raw function in this module and return the result + int callFuncPackedRaw(std::string name, std::string& sofile, void** args); int callFuncPackedRaw(std::string name, void** args); /// Call a raw function in this module and return the result + int callFuncPackedRaw(std::string name, std::string& sofile, std::vector args) { + return callFuncPackedRaw(name, sofile, args.data()); + } int callFuncPackedRaw(std::string name, std::vector args) { return callFuncPackedRaw(name, args.data()); } @@ -57,6 +62,10 @@ class Module { int callFuncPacked(std::string name, void** args) { return callFuncPackedRaw("_shim_"+name, args); } + + int callFuncPacked(std::string name, std::string& sofile, void** args) { + return callFuncPackedRaw("_shim_"+name, sofile,args); + } /// Call a function using the taco_tensor_t interface and return the result int callFuncPacked(std::string name, std::vector args) { @@ -68,10 +77,12 @@ class Module { private: std::stringstream source; + std::stringstream additional_source; std::stringstream header; std::string libname; std::string tmpdir; void* lib_handle; + void* so_lib_handle; std::vector funcs; // true iff the module was created from user-provided source diff --git a/include/taco/cuda.h b/include/taco/cuda.h index aad6b5229..9c4a7aae9 100644 --- a/include/taco/cuda.h +++ b/include/taco/cuda.h @@ -9,7 +9,19 @@ #define CUDA_BUILT false #endif +#ifndef ISPC_BUILT + #define ISPC_BUILT false +#endif + namespace taco { + +/// Functions used by taco to interface with ISPC +bool should_use_ISPC_codegen(); +void set_ISPC_codegen_enabled(bool enabled); +bool is_ISPC_code_stream_enabled(); +void set_ISPC_code_stream_enabled(bool enabled); + + /// Functions used by taco to interface with CUDA (especially unified memory) /// Check if should use CUDA codegen bool should_use_CUDA_codegen(); diff --git a/include/taco/index_notation/transformations.h 
b/include/taco/index_notation/transformations.h index 7aa2579ad..4d6ec6830 100644 --- a/include/taco/index_notation/transformations.h +++ b/include/taco/index_notation/transformations.h @@ -223,6 +223,9 @@ IndexStmt parallelizeOuterLoop(IndexStmt stmt); */ IndexStmt reorderLoopsTopologically(IndexStmt stmt); +IndexStmt loopFusionOverFission(IndexStmt stmt, Assignment assignment, + std::string side, int iters); + /** * Performs scalar promotion so that reductions are done by accumulating into * scalar temporaries whenever possible. diff --git a/include/taco/ir/ir.h b/include/taco/ir/ir.h index f852f26b1..96dc7d034 100644 --- a/include/taco/ir/ir.h +++ b/include/taco/ir/ir.h @@ -591,7 +591,7 @@ struct Switch : public StmtNode { static const IRNodeType _type_info = IRNodeType::Switch; }; -enum class LoopKind {Serial, Static, Dynamic, Runtime, Vectorized, Static_Chunked}; +enum class LoopKind {Serial, Static, Dynamic, Runtime, Vectorized, Static_Chunked, Foreach, Mul_Thread, Init}; /** A for loop from start to end by increment. 
* A vectorized loop will require the increment to be 1 and the diff --git a/include/taco/ir/ir_printer.h b/include/taco/ir/ir_printer.h index 4e50764e9..c2c505bf5 100644 --- a/include/taco/ir/ir_printer.h +++ b/include/taco/ir/ir_printer.h @@ -16,6 +16,7 @@ class IRPrinter : public IRVisitorStrict { public: IRPrinter(std::ostream& stream); IRPrinter(std::ostream& stream, bool color, bool simplify); + IRPrinter(std::ostream& stream, std::ostream& stream2, bool color, bool simplify); virtual ~IRPrinter(); void setColor(bool color); @@ -72,6 +73,7 @@ class IRPrinter : public IRVisitorStrict { virtual void visit(const Break*); std::ostream &stream; + std::ostream &stream2; int indent; bool color; bool simplify; @@ -109,6 +111,7 @@ class IRPrinter : public IRVisitorStrict { void doIndent(); void printBinOp(Expr a, Expr b, std::string op, Precedence precedence); bool needsParentheses(Precedence precedence); + void sendToStream(std::stringstream &stream); std::string keywordString(std::string); std::string commentString(std::string); diff --git a/include/taco/ir_tags.h b/include/taco/ir_tags.h index 5858a13e3..6a74be173 100644 --- a/include/taco/ir_tags.h +++ b/include/taco/ir_tags.h @@ -9,7 +9,7 @@ namespace taco { /// ParallelUnit::GPUWarp can be optionally used to allow for GPU warp-level primitives /// ParallelUnit::GPUThread causes for every iteration to be executed on a separate GPU thread enum class ParallelUnit { - NotParallel, DefaultUnit, GPUBlock, GPUWarp, GPUThread, CPUThread, CPUVector, CPUThreadGroupReduction, GPUBlockReduction, GPUWarpReduction + NotParallel, DefaultUnit, GPUBlock, GPUWarp, GPUThread, CPUThread, CPUVector, CPUThreadGroupReduction, GPUBlockReduction, GPUWarpReduction, CPUSimd, CPUSpmd }; extern const char *ParallelUnit_NAMES[]; diff --git a/include/taco/lower/lowerer_impl_imperative.h b/include/taco/lower/lowerer_impl_imperative.h index 65f069fda..d743f5875 100644 --- a/include/taco/lower/lowerer_impl_imperative.h +++ 
b/include/taco/lower/lowerer_impl_imperative.h @@ -499,10 +499,13 @@ class LowererImplImperative : public LowererImpl { bool emitUnderivedGuards = true; + int loopDepth = 0; int inParallelLoopDepth = 0; std::map parallelUnitSizes; std::map parallelUnitIndexVars; + std::map forUnits; // + std::map whereTempsWithLoopDepth; /// Keep track of what IndexVars have already been defined std::set definedIndexVars; diff --git a/include/taco/taco_tensor_t.h b/include/taco/taco_tensor_t.h index 20d78bb51..f27acd9c7 100644 --- a/include/taco/taco_tensor_t.h +++ b/include/taco/taco_tensor_t.h @@ -6,6 +6,7 @@ #ifndef TACO_TENSOR_T_DEFINED #define TACO_TENSOR_T_DEFINED +#include #include typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; diff --git a/include/taco/tensor.h b/include/taco/tensor.h index b91782256..883718fb6 100644 --- a/include/taco/tensor.h +++ b/include/taco/tensor.h @@ -413,6 +413,8 @@ class TensorBase { /// Compile the tensor expression. void compile(); + void compute(std::ofstream& statfile); + void compute(std::ofstream& statfile, std::string& sofile); void compile(IndexStmt stmt, bool assembleWhileCompute=false); diff --git a/include/taco/util/strings.h b/include/taco/util/strings.h index 5dfb2f174..a3c3d863f 100644 --- a/include/taco/util/strings.h +++ b/include/taco/util/strings.h @@ -1,6 +1,7 @@ #ifndef TACO_UTIL_STRINGS_H #define TACO_UTIL_STRINGS_H +#include "taco/cuda.h" #include #include #include @@ -8,6 +9,8 @@ #include #include +#include "taco/type.h" + // To get the value of a compiler macro variable #define STRINGIFY(x) #x #define TO_STRING(x) STRINGIFY(x) @@ -15,6 +18,25 @@ namespace taco { namespace util { +// /// Turn anything except floating points that can be written to a stream +// /// into a string. 
+// template +// typename std::enable_if::value, std::string>::type +// toStringISPC(const T &val) { + +// std::stringstream sstream; +// if (val == Int32) { +// sstream << "int32"; +// } +// else if (val == Int64) { +// sstream << "int64"; +// } +// else { +// sstream << val; +// } +// return sstream.str(); +// } + /// Turn anything except floating points that can be written to a stream /// into a string. template diff --git a/include/taco/version.h.in b/include/taco/version.h.in index bc5559d7d..8ef507598 100644 --- a/include/taco/version.h.in +++ b/include/taco/version.h.in @@ -20,5 +20,6 @@ #define TACO_FEATURE_OPENMP @TACO_FEATURE_OPENMP@ #define TACO_FEATURE_PYTHON @TACO_FEATURE_PYTHON@ #define TACO_FEATURE_CUDA @TACO_FEATURE_CUDA@ +#define TACO_FEATURE_ISPC @TACO_FEATURE_ISPC@ #endif /* TACO_VERSION_H */ diff --git a/out/taco-uml/._taco.svg b/out/taco-uml/._taco.svg new file mode 100755 index 000000000..e88dbd51b Binary files /dev/null and b/out/taco-uml/._taco.svg differ diff --git a/out/taco-uml/taco.svg b/out/taco-uml/taco.svg new file mode 100644 index 000000000..57f7a18d1 --- /dev/null +++ b/out/taco-uml/taco.svg @@ -0,0 +1,878 @@ +IntrusivePtrT *ptrUncopyableIRNodevirtual void accept(IRVisitorStrict *v) const = 0virtual IRNodeType type_info() const = 0;BaseStmtNodeBaseExprNodeDatatype typeStmtNodevoid accept(IRVisitorStrict *v) constExprNodevoid accept(IRVisitorStrict *v) constIRHandlevoid accept(IRVisitorStrict *v) constExprStmtIRVisitorStrictvirtual void visit(const IRNode*) const = 0IRVisitorvirtual void visit(const IRNode*)IRRewriterExpr exprStmt stmtvirtual void visit(const ExprNode* op)virtual void visit(const StmtNode* op)Expr rewrite(Expr)Stmt rewrite(Stmt)IRPrinterstd::ostream &streamstd::ostream &stream2int indentbool colorbool simplifyenum PrecedencePrecedence parentPrecedence = BOTTOMNameGenerator varNameGeneratorscopedMap<Expr, std::String> varNamesvoid doIndent()void printBinOp(Expr a, Expr b, std::string op, Precedence precedence)void 
fewMoreMethods()virtual void visit(const ExprNode*)virtual void visit(const StmtNode*)setColor(bool color)print(Stmt)IRVerifierExpressionSimplifierRemoveRedundantStatementsRemoveRedundantLoopsRemoveDuplicateBodyCodeGenCodeGen_CCodeGen_CUDACodeGen_ISPCManageableIndexStmtNodevirtual void accept(IndexStmtVisitorStrict*) const = 0IndexExprNodevirtual void accept(IndexStmtVisitorStrict*) const = 0IndexStmtIndexExprIndexExprVisitorStrictvoid visit(const IndexStmt&)virtual void visit(const AccessNode*) = 0virtual void visit(const LiteralNode*) = 0virtual void visit(const NegNode*) = 0virtual void visit(const AddNode*) = 0virtual void visit(const SubNode*) = 0virtual void visit(const MulNode*) = 0virtual void visit(const DivNode*) = 0virtual void visit(const SqrtNode*) = 0virtual void visit(const CastNode*) = 0virtual void visit(const CallIntrinsicNode*) = 0virtual void visit(const ReductionNode*) = 0IndexStmtVisitorStrictvoid visit(const IndexStmt&)virtual void visit(const AssignmentNode*) = 0virtual void visit(const YieldNode*) = 0virtual void visit(const ForallNode*) = 0virtual void visit(const WhereNode*) = 0virtual void visit(const SequenceNode*) = 0virtual void visit(const AssembleNode*) = 0virtual void visit(const MultiNode*) = 0virtual void visit(const SuchThatNode*) = 0IndexNotationVisitorStrictIndexNotationPrintervoid print(const IndexExpr& expr)void print(const IndexStmt& expr)void visit(const AccessNode* node)void visit(const LiteralNode* node)void visit(const NegNode* node)void visit(const AddNode* node)void visit(const SubNode* node)void visit(const MulNode* node)void visit(const DivNode* node)void visit(const SqrtNode* node)void visit(const CastNode* node)void visit(const CallIntrinsicNode* node)void visit(const UnaryExprNode* node)void visit(const BinaryExprNode* node)void visit(const ReductionNode* node)void visit(const AssignmentNode* node)void visit(const YieldNode* node)void visit(const ForallNode* node)void visit(const WhereNode* node)void visit(const 
SequenceNode* node)void visit(const AssembleNode* node)void visit(const MultiNode* node)void visit(const SuchThatNode* node)IndexNotationVisitorvirtual void visit(const AccessNode* node)virtual void visit(const LiteralNode* node)virtual void visit(const NegNode* node)virtual void visit(const AddNode* node)virtual void visit(const SubNode* node)virtual void visit(const MulNode* node)virtual void visit(const DivNode* node)virtual void visit(const SqrtNode* node)virtual void visit(const CastNode* node)virtual void visit(const CallIntrinsicNode* node)virtual void visit(const UnaryExprNode* node)virtual void visit(const BinaryExprNode* node)virtual void visit(const ReductionNode* node)virtual void visit(const AssignmentNode* node)virtual void visit(const YieldNode* node)virtual void visit(const ForallNode* node)virtual void visit(const WhereNode* node)virtual void visit(const SequenceNode* node)virtual void visit(const AssembleNode* node)virtual void visit(const MultiNode* node)virtual void visit(const SuchThatNode* node)MatcherIndexExprRewriterStrictIndexExpr exprIndexExpr rewrite(IndexExpr)virtual void visit(const AccessNode* op) = 0virtual void visit(const LiteralNode* op) = 0virtual void visit(const NegNode* op) = 0virtual void visit(const SqrtNode* op) = 0virtual void visit(const AddNode* op) = 0virtual void visit(const SubNode* op) = 0virtual void visit(const MulNode* op) = 0virtual void visit(const DivNode* op) = 0virtual void visit(const CastNode* op) = 0virtual void visit(const CallIntrinsicNode* op) = 0virtual void visit(const ReductionNode* op) = 0IndexStmtRewriterStrictIndexStmt stmtIndexStmt rewrite(IndexStmt)virtual void visit(const AssignmentNode* op) = 0virtual void visit(const YieldNode* op) = 0virtual void visit(const ForallNode* op) = 0virtual void visit(const WhereNode* op) = 0virtual void visit(const SequenceNode* op) = 0virtual void visit(const AssembleNode* op) = 0virtual void visit(const MultiNode* op) = 0virtual void visit(const SuchThatNode* 
op) = 0IndexNotationRewriterStrictIndexNotationRewritervirtual void visit(const AccessNode* node)virtual void visit(const LiteralNode* node)virtual void visit(const NegNode* node)virtual void visit(const AddNode* node)virtual void visit(const SubNode* node)virtual void visit(const MulNode* node)virtual void visit(const DivNode* node)virtual void visit(const SqrtNode* node)virtual void visit(const CastNode* node)virtual void visit(const CallIntrinsicNode* node)virtual void visit(const UnaryExprNode* node)virtual void visit(const BinaryExprNode* node)virtual void visit(const ReductionNode* node)virtual void visit(const AssignmentNode* node)virtual void visit(const YieldNode* node)virtual void visit(const ForallNode* node)virtual void visit(const WhereNode* node)virtual void visit(const SequenceNode* node)virtual void visit(const AssembleNode* node)virtual void visit(const MultiNode* node)virtual void visit(const SuchThatNode* node)Lowererstd::shared_ptr<LowererImpl> impl;LowererImplclass Visitor;friend class Visitor;std::shared_ptr<Visitor> visitor;virtual ir::Stmt lower(IndexStmt stmt);virtual ir::Expr lower(IndexExpr expr);virtual ir::Expr lowerExpr(IndexExpr expr) = 0;virtual ir::Stmt lowerStmt(IndexStmt stmt) = 0;virtual ir::Stmt lower(IndexStmt stmt, std::string name,bool assemble, bool compute, bool pack, bool unpack) = 0;LowererImplImperativeclass Visitorfiend class Visitorstd::shared_ptr<Visitor> visitorbool assemblebool computevars a_bunch_of_other_fieldsvirtual ir::Stmt lowerExpr(IndexExpr expr);virtual ir::Stmt lowerStmt(IndexStmt stmt);ir::Stmt lower(IndexStmt stmt, std::string name,bool assemble, bool compute, bool pack, bool unpack)Stmt LowererImplImperative::lower(IndexStmt stmt) {return visitor->lower(stmt);}VisitorLowererImpl* implExpr exprStmt stmtvoid visit(const AssignmentNode* node)void visit(const YieldNode* node)void visit(const ForallNode* node)void visit(const WhereNode* node)void visit(const MultiNode* node)void visit(const SuchThatNode* 
node)void visit(const SequenceNode* node)void visit(const AssembleNode* node)void visit(const AccessNode* node)void visit(const LiteralNode* node)void visit(const NegNode* node)void visit(const AddNode* node)void visit(const SubNode* node)void visit(const MulNode* node)void visit(const DivNode* node)void visit(const SqrtNode* node)void visit(const CastNode* node)void visit(const CallIntrinsicNode* node)void visit(const ReductionNode* node)Visitor(LowererImplImperative* impl)Stmt lower(IndexStmt stmt)Expr lower(IndexExpr expr)Stmt lower(IndexStmt stmt) {this->stmt = Stmt();impl->accessibleIterators.scope();IndexStmtVisitorStrict::visit(stmt);impl->accessibleIterators.unscope();return this->stmt;}contains111111contains11contains11contains11contains11contains11 \ No newline at end of file diff --git a/src/codegen/codegen.cpp b/src/codegen/codegen.cpp index f0c09d98a..6ec54a2f8 100644 --- a/src/codegen/codegen.cpp +++ b/src/codegen/codegen.cpp @@ -2,6 +2,7 @@ #include "taco/cuda.h" #include "codegen_cuda.h" #include "codegen_c.h" +#include "codegen_ispc.h" #include #include @@ -26,6 +27,21 @@ shared_ptr CodeGen::init_default(std::ostream &dest, OutputKind outputK if (should_use_CUDA_codegen()) { return make_shared(dest, outputKind); } + else if (should_use_ISPC_codegen()) { + return make_shared(dest, outputKind); + } + else { + return make_shared(dest, outputKind); + } +} + +shared_ptr CodeGen::init_default(std::ostream &dest, std::ostream &dest2, OutputKind outputKind) { + if (should_use_CUDA_codegen()) { + return make_shared(dest, outputKind); + } + else if (should_use_ISPC_codegen()) { + return make_shared(dest, dest2, outputKind); + } else { return make_shared(dest, outputKind); } @@ -229,6 +245,49 @@ string CodeGen::printTensorProperty(string varname, const GetProperty* op, bool return ret.str(); } +string CodeGen::getUnpackedTensorArgument(string varname, const GetProperty* op, + bool is_output_prop) { + stringstream ret; + ret << ""; + + auto tensor = 
op->tensor.as(); + if (op->property == TensorProperty::Values) { + // for the values, it's in the last slot + ret << "uniform " << printType(tensor->type, false) << " " << varname << "[]"; + return ret.str(); + } else if (op->property == TensorProperty::ValuesSize) { + ret << "int32 " << varname; + return ret.str(); + } + + // for a Dense level, nnz is an int + // for a Fixed level, ptr is an int + // all others are int* + if (op->property == TensorProperty::Dimension) { + if (op->type == Int32) { + ret << "uniform int32 "; + } else if (op->type == Int64) { + ret << "uniform int64 "; + } else { + ret << "int "; + } + ret << varname; + + } else { + taco_iassert(op->property == TensorProperty::Indices); + if (op->type == Int32) { + ret << "uniform int32 "; + } else if (op->type == Int64) { + ret << "uniform int64 "; + } else { + ret << "uniform int "; + } + ret << varname << "[]"; + } + + return ret.str(); +} + string CodeGen::unpackTensorProperty(string varname, const GetProperty* op, bool is_output_prop) { stringstream ret; @@ -310,13 +369,9 @@ string CodeGen::pointTensorProperty(std::string varname) { return ret.str(); } -// helper to print declarations -string CodeGen::printDecls(map varMap, - vector inputs, vector outputs) { - stringstream ret; - unordered_set propsAlreadyGenerated; - - vector sortedProps; +void CodeGen::getSortedProps(map &varMap, + vector &sortedProps, vector &inputs, + vector &outputs) { for (auto const& p: varMap) { if (p.first.as()) @@ -355,6 +410,17 @@ string CodeGen::printDecls(map varMap, return a->index < b->index; }); +} + +// helper to print declarations +string CodeGen::printDecls(map varMap, + vector inputs, vector outputs) { + stringstream ret; + unordered_set propsAlreadyGenerated; + + vector sortedProps; + getSortedProps(varMap, sortedProps, inputs, outputs); + for (auto prop: sortedProps) { bool isOutputProp = (find(outputs.begin(), outputs.end(), prop->tensor) != outputs.end()); @@ -375,7 +441,6 @@ string 
CodeGen::printDecls(map varMap, return ret.str(); } - string CodeGen::printPack(map, string> outputProperties, vector outputs) { stringstream ret; diff --git a/src/codegen/codegen.h b/src/codegen/codegen.h index cc25c80d6..db891f995 100644 --- a/src/codegen/codegen.h +++ b/src/codegen/codegen.h @@ -16,9 +16,13 @@ class CodeGen : public IRPrinter { enum CodeGenType { C, CUDA }; CodeGen(std::ostream& stream, CodeGenType type) : IRPrinter(stream), codeGenType(type) {}; - CodeGen(std::ostream& stream, bool color, bool simplify, CodeGenType type) : IRPrinter(stream, color, simplify), codeGenType(type) {}; + CodeGen(std::ostream& stream, bool color, bool simplify, CodeGenType type) + : IRPrinter(stream, color, simplify), codeGenType(type) {}; + CodeGen(std::ostream& stream, std::ostream& stream2, bool color, bool simplify, CodeGenType type) + : IRPrinter(stream, stream2, color, simplify), codeGenType(type) {}; /// Initialize the default code generator static std::shared_ptr init_default(std::ostream &dest, OutputKind outputKind); + static std::shared_ptr init_default(std::ostream &dest, std::ostream &dest2, OutputKind outputKind); /// Compile a lowered function virtual void compile(Stmt stmt, bool isFirst=false) =0; @@ -26,6 +30,9 @@ class CodeGen : public IRPrinter { protected: static bool checkForAlloc(const Function *func); static int countYields(const Function *func); + void getSortedProps(std::map &varMap, + std::vector &sortedProps, std::vector &inputs, + std::vector &outputs); static std::string printCType(Datatype type, bool is_ptr); static std::string printCUDAType(Datatype type, bool is_ptr); @@ -52,6 +59,10 @@ class CodeGen : public IRPrinter { std::string printFuncName(const Function *func, std::map inputMap={}, std::map outputMap={}); + + std::string printTensorProperty(std::string varname, const GetProperty* op, bool is_ptr); + std::string getUnpackedTensorArgument(std::string varname, const GetProperty* op, + bool is_output_prop); void 
resetUniqueNameCounters(); std::string genUniqueName(std::string name); @@ -61,9 +72,8 @@ class CodeGen : public IRPrinter { private: virtual std::string restrictKeyword() const { return ""; } - std::string printTensorProperty(std::string varname, const GetProperty* op, bool is_ptr); std::string unpackTensorProperty(std::string varname, const GetProperty* op, - bool is_output_prop); + bool is_output_prop); std::string packTensorProperty(std::string varname, Expr tnsr, TensorProperty property, int mode, int index); std::string pointTensorProperty(std::string varname); diff --git a/src/codegen/codegen_c.cpp b/src/codegen/codegen_c.cpp index 2ade9d7f6..83da7aaab 100644 --- a/src/codegen/codegen_c.cpp +++ b/src/codegen/codegen_c.cpp @@ -34,6 +34,7 @@ const string cHeaders = "#include \n" "#include \n" "#include \n" + "#include \n" "#if _OPENMP\n" "#include \n" "#endif\n" @@ -240,7 +241,10 @@ class CodeGen_C::FindVars : public IRVisitor { }; CodeGen_C::CodeGen_C(std::ostream &dest, OutputKind outputKind, bool simplify) - : CodeGen(dest, false, simplify, C), out(dest), outputKind(outputKind) {} + : CodeGen(dest, false, simplify, C), out(dest), out2(dest), outputKind(outputKind) {} + +CodeGen_C::CodeGen_C(std::ostream &dest, std::ostream &dest2, OutputKind outputKind, bool simplify) + : CodeGen(dest, dest2, false, simplify, C), out(dest), out2(dest2), outputKind(outputKind) {} CodeGen_C::~CodeGen_C() {} @@ -299,14 +303,18 @@ void CodeGen_C::visit(const Function* func) { // Print variable declarations out << printDecls(varFinder.varDecls, func->inputs, func->outputs) << endl; + // out << "printf(\"declarations added\\n\");" << std::endl; if (emittingCoroutine) { out << printContextDeclAndInit(varMap, localVars, numYields, func->name) << endl; } + // out << "printf(\"declarations added2\\n\");" << std::endl; // output body print(func->body); + // out << "printf(\"function body added " << count++ << "\\n\"); // " << std::endl; + // output repack only if we allocated memory 
if (checkForAlloc(func)) @@ -403,6 +411,9 @@ static string getAtomicPragma() { // Docs for vectorization pragmas: // http://clang.llvm.org/docs/LanguageExtensions.html#extensions-for-loop-hint-optimizations void CodeGen_C::visit(const For* op) { + + // out << " printf(\"adding for loop " << count++ << "\\n\"); //" << std::endl; + switch (op->kind) { case LoopKind::Vectorized: doIndent(); @@ -452,6 +463,14 @@ void CodeGen_C::visit(const For* op) { } stream << ") {\n"; + // out << " printf(\"loop " << count++ << " : %d , dim: %d, %d\\n\","; + // op->var.accept(this); + // out << ", "; + // op->start.accept(this); + // out << ", "; + // op->end.accept(this); + // out << "); // " << count++ << std::endl; + op->contents.accept(this); doIndent(); stream << "}"; @@ -472,6 +491,7 @@ void CodeGen_C::visit(const While* op) { } void CodeGen_C::visit(const GetProperty* op) { + // std::cout << "GetProperty* " << op << std::endl; taco_iassert(varMap.count(op) > 0) << "Property " << Expr(op) << " of " << op->tensor << " not found in varMap"; out << varMap[op]; diff --git a/src/codegen/codegen_c.h b/src/codegen/codegen_c.h index 55c9d01a8..c8505a3bb 100644 --- a/src/codegen/codegen_c.h +++ b/src/codegen/codegen_c.h @@ -16,6 +16,7 @@ class CodeGen_C : public CodeGen { /// Initialize a code generator that generates code to an /// output stream. 
CodeGen_C(std::ostream &dest, OutputKind outputKind, bool simplify=true); + CodeGen_C(std::ostream &dest, std::ostream &dest2, OutputKind outputKind, bool simplify=true); ~CodeGen_C(); /// Compile a lowered function @@ -28,23 +29,25 @@ class CodeGen_C : public CodeGen { protected: using IRPrinter::visit; - void visit(const Function*); - void visit(const VarDecl*); - void visit(const Yield*); - void visit(const Var*); - void visit(const For*); - void visit(const While*); - void visit(const GetProperty*); - void visit(const Min*); - void visit(const Max*); - void visit(const Allocate*); - void visit(const Sqrt*); - void visit(const Store*); - void visit(const Assign*); + virtual void visit(const Function*); + virtual void visit(const VarDecl*); + virtual void visit(const Yield*); + virtual void visit(const Var*); + virtual void visit(const For*); + virtual void visit(const While*); + virtual void visit(const GetProperty*); + virtual void visit(const Min*); + virtual void visit(const Max*); + virtual void visit(const Allocate*); + virtual void visit(const Sqrt*); + virtual void visit(const Store*); + virtual void visit(const Assign*); std::map varMap; std::vector localVars; std::ostream &out; + std::ostream &out2; + int count = 0; OutputKind outputKind; diff --git a/src/codegen/codegen_cuda.cpp b/src/codegen/codegen_cuda.cpp index 77cf0cd88..14505f740 100644 --- a/src/codegen/codegen_cuda.cpp +++ b/src/codegen/codegen_cuda.cpp @@ -646,6 +646,7 @@ void CodeGen_CUDA::printDeviceFunctions(const Function* func) { // Collect device functions resetUniqueNameCounters(); deviceFunctionLoopDepth = 0; + // here they calculate the device FunctionCollecor DeviceFunctionCollector deviceFunctionCollector(func->inputs, func->outputs, this); func->body.accept(&deviceFunctionCollector); deviceFunctions = deviceFunctionCollector.blockFors; diff --git a/src/codegen/codegen_ispc.cpp b/src/codegen/codegen_ispc.cpp new file mode 100644 index 000000000..d4f428ccf --- /dev/null +++ 
b/src/codegen/codegen_ispc.cpp @@ -0,0 +1,1097 @@ +#include +#include +#include +#include +#include +#include + +#include "taco/cuda.h" +#include "taco/ir/ir_printer.h" +#include "taco/ir/ir_visitor.h" +#include "taco/ir/ir_rewriter.h" +#include "taco/ir/simplify.h" + +#include "codegen_c.h" +#include "codegen_ispc.h" +#include "taco/error.h" +#include "taco/util/strings.h" +#include "taco/util/collections.h" + +using namespace std; + +namespace taco { +namespace ir { + +// Some helper functions +namespace { + +// Include stdio.h for printf +// stdlib.h for malloc/realloc +// math.h for sqrt +// MIN preprocessor macro +// This *must* be kept in sync with taco_tensor_t.h +const string cHeaders = + "#ifndef TACO_C_HEADERS\n" + "#define TACO_C_HEADERS\n" + "#include \n" + "#include \n" + "#include \n" + "#include \n" + "#include \n" + "#include \n" + "#include \n" + "#if _OPENMP\n" + "#include \n" + "#endif\n" + "#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b))\n" + "#define TACO_MAX(_a,_b) ((_a) > (_b) ? 
(_a) : (_b))\n" + "#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a)\n" + "#ifndef TACO_TENSOR_T_DEFINED\n" + "#define TACO_TENSOR_T_DEFINED\n" + "typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t;\n" + "typedef struct {\n" + " int32_t order; // tensor order (number of modes)\n" + " int32_t* dimensions; // tensor dimensions\n" + " int32_t csize; // component size\n" + " int32_t* mode_ordering; // mode storage ordering\n" + " taco_mode_t* mode_types; // mode storage types\n" + " uint8_t*** indices; // tensor index data (per mode)\n" + " uint8_t* vals; // tensor values\n" + " int32_t vals_size; // values array size\n" + "} taco_tensor_t;\n" + "#endif\n" + "#if !_OPENMP\n" + "int omp_get_thread_num() { return 0; }\n" + "int omp_get_max_threads() { return 1; }\n" + "#endif\n" + "int cmp(const void *a, const void *b) {\n" + " return *((const int*)a) - *((const int*)b);\n" + "}\n" + "int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) {\n" + " if (array[arrayStart] >= target) {\n" + " return arrayStart;\n" + " }\n" + " int lowerBound = arrayStart; // always < target\n" + " int upperBound = arrayEnd; // always >= target\n" + " while (upperBound - lowerBound > 1) {\n" + " int mid = (upperBound + lowerBound) / 2;\n" + " int midValue = array[mid];\n" + " if (midValue < target) {\n" + " lowerBound = mid;\n" + " }\n" + " else if (midValue > target) {\n" + " upperBound = mid;\n" + " }\n" + " else {\n" + " return mid;\n" + " }\n" + " }\n" + " return upperBound;\n" + "}\n" + "int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) {\n" + " if (array[arrayEnd] <= target) {\n" + " return arrayEnd;\n" + " }\n" + " int lowerBound = arrayStart; // always <= target\n" + " int upperBound = arrayEnd; // always > target\n" + " while (upperBound - lowerBound > 1) {\n" + " int mid = (upperBound + lowerBound) / 2;\n" + " int midValue = array[mid];\n" + " if (midValue < target) {\n" + " lowerBound = mid;\n" + " }\n" + 
" else if (midValue > target) {\n" + " upperBound = mid;\n" + " }\n" + " else {\n" + " return mid;\n" + " }\n" + " }\n" + " return lowerBound;\n" + "}\n" + "taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize,\n" + " int32_t* dimensions, int32_t* mode_ordering,\n" + " taco_mode_t* mode_types) {\n" + " taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t));\n" + " t->order = order;\n" + " t->dimensions = (int32_t *) malloc(order * sizeof(int32_t));\n" + " t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t));\n" + " t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t));\n" + " t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***));\n" + " t->csize = csize;\n" + " for (int32_t i = 0; i < order; i++) {\n" + " t->dimensions[i] = dimensions[i];\n" + " t->mode_ordering[i] = mode_ordering[i];\n" + " t->mode_types[i] = mode_types[i];\n" + " switch (t->mode_types[i]) {\n" + " case taco_mode_dense:\n" + " t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **));\n" + " break;\n" + " case taco_mode_sparse:\n" + " t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **));\n" + " break;\n" + " }\n" + " }\n" + " return t;\n" + "}\n" + "void deinit_taco_tensor_t(taco_tensor_t* t) {\n" + " for (int i = 0; i < t->order; i++) {\n" + " free(t->indices[i]);\n" + " }\n" + " free(t->indices);\n" + " free(t->dimensions);\n" + " free(t->mode_ordering);\n" + " free(t->mode_types);\n" + " free(t);\n" + "}\n" + "#endif\n"; + +const string ispcHeaders = + "#define __TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b))\n" + "#define __TACO_MAX(_a,_b) ((_a) > (_b) ? 
(_a) : (_b))\n" + "#define __TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a)\n" + "int __cmp(const void *a, const void *b) {\n" + " return *((const int*)a) - *((const int*)b);\n" + "}\n" + "int __taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) {\n" + " if (array[arrayStart] >= target) {\n" + " return arrayStart;\n" + " }\n" + " int lowerBound = arrayStart; // always < target\n" + " int upperBound = arrayEnd; // always >= target\n" + " while (upperBound - lowerBound > 1) {\n" + " int mid = (upperBound + lowerBound) / 2;\n" + " int midValue = array[mid];\n" + " if (midValue < target) {\n" + " lowerBound = mid;\n" + " }\n" + " else if (midValue > target) {\n" + " upperBound = mid;\n" + " }\n" + " else {\n" + " return mid;\n" + " }\n" + " }\n" + " return upperBound;\n" + "}\n" + "int __taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) {\n" + " if (array[arrayEnd] <= target) {\n" + " return arrayEnd;\n" + " }\n" + " int lowerBound = arrayStart; // always <= target\n" + " int upperBound = arrayEnd; // always > target\n" + " while (upperBound - lowerBound > 1) {\n" + " int mid = (upperBound + lowerBound) / 2;\n" + " int midValue = array[mid];\n" + " if (midValue < target) {\n" + " lowerBound = mid;\n" + " }\n" + " else if (midValue > target) {\n" + " upperBound = mid;\n" + " }\n" + " else {\n" + " return mid;\n" + " }\n" + " }\n" + " return lowerBound;\n" + "}\n\n\n"; + +} // anonymous namespace + + + +// find variables for generating declarations +// generates a single var for each GetProperty +class CodeGen_ISPC::FindVars : public IRVisitor { +public: + map varMap; + + // the variables for which we need to add declarations + map varDecls; + + vector localVars; + + // this maps from tensor, property, mode, index to the unique var + map, string> canonicalPropertyVar; + + // this is for convenience, recording just the properties unpacked + // from the output tensor so we can re-save them at the end + map, string> 
outputProperties; + + // TODO: should replace this with an unordered set + vector outputTensors; + vector inputTensors; + + CodeGen_ISPC *codeGen; + + // copy inputs and outputs into the map + FindVars(vector inputs, vector outputs, CodeGen_ISPC *codeGen) + : codeGen(codeGen) { + for (auto v: inputs) { + auto var = v.as(); + taco_iassert(var) << "Inputs must be vars in codegen"; + taco_iassert(varMap.count(var)==0) << "Duplicate input found in codegen"; + inputTensors.push_back(v); + varMap[var] = var->name; + } + for (auto v: outputs) { + auto var = v.as(); + taco_iassert(var) << "Outputs must be vars in codegen"; + taco_iassert(varMap.count(var)==0) << "Duplicate output found in codegen"; + outputTensors.push_back(v); + varMap[var] = var->name; + } + } + +protected: + using IRVisitor::visit; + + virtual void visit(const Var *op) { + if (varMap.count(op) == 0) { + varMap[op] = op->is_ptr? op->name : codeGen->genUniqueName(op->name); + } + } + + virtual void visit(const VarDecl *op) { + if (!util::contains(localVars, op->var)) { + localVars.push_back(op->var); + } + op->var.accept(this); + op->rhs.accept(this); + } + + virtual void visit(const For *op) { + if (!util::contains(localVars, op->var)) { + localVars.push_back(op->var); + } + op->var.accept(this); + op->start.accept(this); + op->end.accept(this); + op->increment.accept(this); + op->contents.accept(this); + } + + virtual void visit(const GetProperty *op) { + if (!util::contains(inputTensors, op->tensor) && + !util::contains(outputTensors, op->tensor)) { + // Don't create header unpacking code for temporaries + return; + } + + if (varMap.count(op) == 0) { + auto key = + tuple(op->tensor,op->property, + (size_t)op->mode, + (size_t)op->index); + if (canonicalPropertyVar.count(key) > 0) { + varMap[op] = canonicalPropertyVar[key]; + } else { + auto unique_name = codeGen->genUniqueName(op->name); + canonicalPropertyVar[key] = unique_name; + varMap[op] = unique_name; + varDecls[op] = unique_name; + if 
(util::contains(outputTensors, op->tensor)) { + outputProperties[key] = unique_name; + } + } + } + } +}; + + +// Finds all for loops tagged with accelerator and adds statements to deviceFunctions +// Also tracks scope of when device function is called and +// tracks which variables must be passed to function. +class CodeGen_ISPC::FunctionCollector : public IRVisitor { +public: + vector threadFors; // contents is device function + vector initFors; // for loops to initialize statements + map scopeMap; + + // the variables to pass to each device function + vector>> functionParameters; + vector> currentParameters; // keep as vector so code generation is deterministic + set currentParameterSet; + + set variablesDeclaredInKernel; + + vector> threadIDVars; + vector> blockIDVars; + vector> warpIDVars; + vector numThreads; + vector numWarps; + + CodeGen_ISPC *codeGen; + // copy inputs and outputs into the map + FunctionCollector(vector inputs, vector outputs, CodeGen_ISPC *codeGen) : codeGen(codeGen) { + inDeviceFunction = false; + for (auto v: inputs) { + auto var = v.as(); + taco_iassert(var) << "Inputs must be vars in codegen"; + taco_iassert(scopeMap.count(var) == 0) << + "Duplicate input found in codegen"; + scopeMap[var] = var->name; + } + for (auto v: outputs) { + auto var = v.as(); + taco_iassert(var) << "Outputs must be vars in codegen"; + taco_iassert(scopeMap.count(var) == 0) << + "Duplicate output found in codegen"; + + scopeMap[var] = var->name; + } + } + +protected: + bool inDeviceFunction; + using IRVisitor::visit; + + virtual void visit(const For *op) { + if (op->parallel_unit == ParallelUnit::CPUSpmd) { + std::cout << "ParallelUnit::CPUSpmd directive found\n"; + + inDeviceFunction = false; + op->var.accept(this); + inDeviceFunction = true; + + threadFors.push_back(op); + std::cout << "scopeMap: [" << scopeMap[op->var] << "], varExpr: [" << op->var << "]\n"; + threadIDVars.push_back(pair(scopeMap[op->var], op->var)); + Expr blockSize = 
ir::simplify(ir::Div::make(ir::Sub::make(op->end, op->start), op->increment)); + numThreads.push_back(blockSize); + + } + else if (op->parallel_unit == ParallelUnit::CPUSimd) { + std::cout << "************************************************************************** CPUSimd For node\n"; + } + else if (op->kind == LoopKind::Init) { + std::cout << "************************************************************************* Init loop kind found\n"; + initFors.push_back(op); + } + else{ + op->var.accept(this); + } + op->start.accept(this); + op->end.accept(this); + op->increment.accept(this); + op->contents.accept(this); + } + + virtual void visit(const Var *op) { + if (scopeMap.count(op) == 0) { + string name = codeGen->genUniqueName(op->name); + if (!inDeviceFunction) { + scopeMap[op] = name; + } + } + else if (scopeMap.count(op) == 1 && inDeviceFunction && currentParameterSet.count(op) == 0 + && (threadIDVars.empty() || op != threadIDVars.back().second) + && !variablesDeclaredInKernel.count(op)) { + currentParameters.push_back(pair(scopeMap[op], op)); + currentParameterSet.insert(op); + } + } + + virtual void visit(const VarDecl *op) { + if (inDeviceFunction) { + variablesDeclaredInKernel.insert(op->var); + } + op->var.accept(this); + op->rhs.accept(this); + } + + virtual void visit(const GetProperty *op) { + if (scopeMap.count(op->tensor) == 0 && !inDeviceFunction) { + auto key = + tuple(op->tensor,op->property, + (size_t)op->mode, + (size_t)op->index); + auto unique_name = codeGen->genUniqueName(op->name); + scopeMap[op->tensor] = unique_name; + } + else if (scopeMap.count(op->tensor) == 1 && inDeviceFunction && currentParameterSet.count(op->tensor) == 0) { + currentParameters.push_back(pair(op->tensor.as()->name, op->tensor)); + currentParameterSet.insert(op->tensor); + } + } +}; + + +CodeGen_ISPC::CodeGen_ISPC(std::ostream &dest, OutputKind outputKind, bool simplify) + : CodeGen_C(dest, dest, outputKind, simplify) {} + +CodeGen_ISPC::CodeGen_ISPC(std::ostream 
&dest, std::ostream &dest2, OutputKind outputKind, bool simplify) + : CodeGen_C(dest, dest2, outputKind, simplify) {} + +CodeGen_ISPC::~CodeGen_ISPC() {} + +void CodeGen_ISPC::compile(Stmt stmt, bool isFirst) { + varMap = {}; + localVars = {}; + + if (isFirst) { + // output the headers + out << cHeaders; + + if (&out != &out2) { + out2 << ispcHeaders; + } + } + out << endl; + // generate code for the Stmt + std::cout << "Compiling the code\n"; + stmt.accept(this); +} + + + +string CodeGen_ISPC::printCallISPCFunc(const std::string& funcName, map varMap, + vector &sortedProps) { + std::stringstream ret; + ret << " "; + unordered_set propsAlreadyGenerated; + + ret << "__" << funcName << "("; + + + for (unsigned long i=0; i < sortedProps.size(); i++) { + ret << varMap[sortedProps[i]]; + if (i != sortedProps.size()-1) { + ret << ", "; + } + propsAlreadyGenerated.insert(varMap[sortedProps[i]]); + } + + ret << ");\n"; + return ret.str(); +} + +// varMap is already sorted <- make sure to pass the sorted varMap +void CodeGen_ISPC::printISPCFunc(const Function *func, map varMap, + vector &sortedProps) { + + FunctionCollector functionCollector(func->inputs, func->outputs, this); + func->body.accept(&functionCollector); + + vector inputs = func->inputs; + vector outputs = func->outputs; + unordered_set propsAlreadyGenerated; + + for (unsigned long i=0; i < sortedProps.size(); i++) { + auto prop = sortedProps[i]; + bool isOutputProp = (find(outputs.begin(), outputs.end(), + prop->tensor) != outputs.end()); + + auto var = prop->tensor.as(); + if (var->is_parameter) { + if (isOutputProp) { + funcVariables << " " << printTensorProperty(varMap[prop], prop, false) << ";" << endl; + } else { + break; + } + } else { + funcVariables << getUnpackedTensorArgument(varMap[prop], prop, isOutputProp); + } + propsAlreadyGenerated.insert(varMap[prop]); + + if (i!=sortedProps.size()-1) { + funcVariables << ", "; + } + if (i%2==0) { + funcVariables << "\n\t"; + } + } + + 
resetUniqueNameCounters(); + + // threadFors code generation + for (size_t i = 0; i < functionCollector.threadFors.size(); i++) { + + const For *threadloop = to(functionCollector.threadFors[i]); + taco_iassert(threadloop->parallel_unit == ParallelUnit::CPUSpmd); + Stmt function = threadloop->contents; + std::cout << "threadloop function: " << function << std::endl; + + out2 << "\nstatic task void __" << func->name << "__ ("; + out2 << funcVariables.str(); + out2 << "\n) {\n\n"; + + indent++; + // output body of the threadloop + taskCode = true; + print(threadloop); + indent--; + out2 << "}\n\n"; + + } + + taskCode = false; + out2 << "export void __" << func->name << " ("; + out2 << funcVariables.str(); + out2 << "\n) {\n\n"; + + indent++; + // output body + print(func->body); + indent--; + out2 << "}\n"; + +} + +void CodeGen_ISPC::sendToStream(std::stringstream &stream) { + if (is_ISPC_code_stream_enabled()) { + this->out2 << stream.str(); + } + else { + CodeGen_C::sendToStream(stream); + } +} + +void CodeGen_ISPC::visit(const Function* func) { + set_ISPC_code_stream_enabled(false); + + // if generating a header, protect the function declaration with a guard + if (func->name == "assemble") { + if (outputKind == HeaderGen) { + out << "#ifndef TACO_GENERATED_" << func->name << "\n"; + out << "#define TACO_GENERATED_" << func->name << "\n"; + } + + int numYields = countYields(func); + emittingCoroutine = (numYields > 0); + funcName = func->name; + labelCount = 0; + + resetUniqueNameCounters(); + FindVars inputVarFinder(func->inputs, {}, this); + func->body.accept(&inputVarFinder); + FindVars outputVarFinder({}, func->outputs, this); + func->body.accept(&outputVarFinder); + + // output function declaration + doIndent(); + out << printFuncName(func, inputVarFinder.varDecls, outputVarFinder.varDecls); + + // if we're just generating a header, this is all we need to do + if (outputKind == HeaderGen) { + out << ";\n"; + out << "#endif\n"; + return; + } + + out << " {\n"; + 
+ indent++; + + // find all the vars that are not inputs or outputs and declare them + resetUniqueNameCounters(); + FindVars varFinder(func->inputs, func->outputs, this); + func->body.accept(&varFinder); + varMap = varFinder.varMap; + localVars = varFinder.localVars; + + // Print variable declarations + out << printDecls(varFinder.varDecls, func->inputs, func->outputs) << endl; + + if (emittingCoroutine) { + out << printContextDeclAndInit(varMap, localVars, numYields, func->name) + << endl; + } + + // output body + print(func->body); + + // output repack only if we allocated memory + if (checkForAlloc(func)) + out << endl << printPack(varFinder.outputProperties, func->outputs); + + if (emittingCoroutine) { + out << printCoroutineFinish(numYields, funcName); + } + + doIndent(); + out << "return 0;\n"; + indent--; + + doIndent(); + out << "}\n"; + return; + + } + + + if (outputKind == HeaderGen) { + out << "#ifndef TACO_GENERATED_" << func->name << "\n"; + out << "#define TACO_GENERATED_" << func->name << "\n"; + } + + int numYields = countYields(func); + emittingCoroutine = (numYields > 0); + funcName = func->name; + labelCount = 0; + + resetUniqueNameCounters(); + FindVars inputVarFinder(func->inputs, {}, this); + func->body.accept(&inputVarFinder); + FindVars outputVarFinder({}, func->outputs, this); + func->body.accept(&outputVarFinder); + + // output function declaration + doIndent(); + out << printFuncName(func, inputVarFinder.varDecls, outputVarFinder.varDecls); + + // if we're just generating a header, this is all we need to do + if (outputKind == HeaderGen) { + out << ";\n"; + out << "#endif\n"; + return; + } + + out << " {\n"; + + indent++; + + // find all the vars that are not inputs or outputs and declare them + resetUniqueNameCounters(); + FindVars varFinder(func->inputs, func->outputs, this); + func->body.accept(&varFinder); + varMap = varFinder.varMap; + localVars = varFinder.localVars; + + // Print variable declarations + out << 
printDecls(varFinder.varDecls, func->inputs, func->outputs) << endl; + + sortedProps = {}; + vector inputs = func->inputs; + vector outputs = func->outputs; + getSortedProps(varFinder.varDecls, sortedProps, inputs, outputs); + out << printCallISPCFunc(func->name, varFinder.varDecls, sortedProps); + + if (emittingCoroutine) { + out << printContextDeclAndInit(varMap, localVars, numYields, func->name) + << endl; + } + + // output repack only if we allocated memory + if (checkForAlloc(func)) + out << endl << printPack(varFinder.outputProperties, func->outputs); + + if (emittingCoroutine) { + out << printCoroutineFinish(numYields, funcName); + } + + doIndent(); + out << "return 0;\n"; + indent--; + + doIndent(); + out << "}\n\n"; + + set_ISPC_code_stream_enabled(true); + printISPCFunc(func, varFinder.varDecls, sortedProps); + set_ISPC_code_stream_enabled(false); + +} + +void CodeGen_ISPC::visit(const VarDecl* op) { + // std::stringstream stream; + if (is_ISPC_code_stream_enabled()) { + if (emittingCoroutine) { + doIndent(); + op->var.accept(this); + parentPrecedence = Precedence::TOP; + stream2 << " = "; + op->rhs.accept(this); + stream2 << ";"; + stream2 << endl; + } else { + IRPrinter::visit(op); + } + } + else { + CodeGen_C::visit(op); + } + + // sendToStream(stream); +} + +void CodeGen_ISPC::visit(const Yield* op) { + printYield(op, localVars, varMap, labelCount, funcName); +} + +// For Vars, we replace their names with the generated name, +// since we match by reference (not name) +void CodeGen_ISPC::visit(const Var* op) { + if (is_ISPC_code_stream_enabled()) { + taco_iassert(varMap.count(op) > 0) << + "Var " << op->name << " not found in varMap"; + if (emittingCoroutine) { + // out << "TACO_DEREF("; + } + out2 << varMap[op]; + if (emittingCoroutine) { + // out << ")"; + } + } + else { + CodeGen_C::visit(op); + } +} + +static string genVectorizePragma(int width) { + stringstream ret; + ret << "#pragma clang loop interleave(enable) "; + if (!width) + ret << 
"vectorize(enable)"; + else + ret << "vectorize_width(" << width << ")"; + + return ret.str(); +} + +// static string getParallelizePragma(LoopKind kind) { +// stringstream ret; +// ret << "#pragma omp parallel for schedule"; +// switch (kind) { +// case LoopKind::Static: +// ret << "(static, 1)"; +// break; +// case LoopKind::Dynamic: +// ret << "(dynamic, 1)"; +// break; +// case LoopKind::Runtime: +// ret << "(runtime)"; +// break; +// case LoopKind::Static_Chunked: +// ret << "(static)"; +// break; +// default: +// break; +// } +// return ret.str(); +// } + +// static string getUnrollPragma(size_t unrollFactor) { +// return "#pragma unroll " + std::to_string(unrollFactor); +// } + +static string getAtomicPragma() { + return "#pragma omp atomic"; +} + +// The next two need to output the correct pragmas depending +// on the loop kind (Serial, Static, Dynamic, Vectorized) +// +// Docs for vectorization pragmas: +// http://clang.llvm.org/docs/LanguageExtensions.html#extensions-for-loop-hint-optimizations +void CodeGen_ISPC::visit(const For* op) { + if (!is_ISPC_code_stream_enabled()) { + CodeGen_C::visit(op); + return; + } + doIndent(); + + if (op->kind == LoopKind::Mul_Thread) { + if (!taskCode) { + out2 << "launch[4] " << printCallISPCFunc(funcName+"__", varMap, sortedProps) << "\n"; + return; + } + stream2 << "uniform unsigned int chunk_size = ("; + op->end.accept(this); + stream2 << " - "; + op->start.accept(this); + stream2 << ") / taskCount;\n"; + stream2 << " uniform unsigned int modulo = ("; + op->end.accept(this); + stream2 << " - "; + op->start.accept(this); + stream2 << ") % taskCount;\n"; + + stream2 << " uniform unsigned int start = "; + op->start.accept(this); + stream2 << " + chunk_size * taskIndex;\n"; + + stream2 << " if (taskIndex != 0) {\n"; + stream2 << " start += modulo;\n"; + stream2 << " }\n"; + + stream2 << " uniform unsigned int end = start + chunk_size;\n"; + stream2 << " if (taskIndex == 0) {\n"; + stream2 << " end += modulo;\n"; + 
stream2 << " }\n\n"; + + stream2 << keywordString(" for") << " ("; + if (!emittingCoroutine) { + if (op->var.type() == Int32) { + stream2 << "int32 "; + } + else if (op->var.type() == Int64) { + stream2 << "int64 "; + } + + } + op->var.accept(this); + stream2 << " = "; + stream2 << "start"; + // op->start.accept(this); + stream2 << keywordString("; "); + op->var.accept(this); + stream2 << " < "; + parentPrecedence = BOTTOM; + stream2 << "end"; + // op->end.accept(this); + stream2 << keywordString("; "); + op->var.accept(this); + + auto lit = op->increment.as(); + if (lit != nullptr && ((lit->type.isInt() && lit->equalsScalar(1)) || + (lit->type.isUInt() && lit->equalsScalar(1)))) { + stream2 << "++"; + } + else { + stream2 << " += "; + op->increment.accept(this); + } + + } + + else if (op->kind == LoopKind::Foreach) { + stream2 << keywordString("foreach") << " ("; + + op->var.accept(this); + stream2 << " = "; + op->start.accept(this); + stream2 << keywordString(" ... "); + op->end.accept(this); + + } else { + stream2 << keywordString("for") << " ("; + if (!emittingCoroutine) { + if (op->var.type() == Int32) { + stream2 << "int32 "; + } + else if (op->var.type() == Int64) { + stream2 << "int64 "; + } + + } + op->var.accept(this); + stream2 << " = "; + op->start.accept(this); + stream2 << keywordString("; "); + op->var.accept(this); + stream2 << " < "; + parentPrecedence = BOTTOM; + op->end.accept(this); + stream2 << keywordString("; "); + op->var.accept(this); + + auto lit = op->increment.as(); + if (lit != nullptr && ((lit->type.isInt() && lit->equalsScalar(1)) || + (lit->type.isUInt() && lit->equalsScalar(1)))) { + stream2 << "++"; + } + else { + stream2 << " += "; + op->increment.accept(this); + } + + } + + stream2 << ") {\n"; + op->contents.accept(this); + doIndent(); + stream2 << "}"; + stream2 << endl; + +} + +void CodeGen_ISPC::visit(const While* op) { + // it's not clear from documentation that clang will vectorize + // while loops + // however, we'll output 
the pragmas anyway + if (op->kind == LoopKind::Vectorized) { + doIndent(); + out << genVectorizePragma(op->vec_width); + out << "\n"; + } + + CodeGen_C::visit(op); +} + +void CodeGen_ISPC::visit(const GetProperty* op) { + taco_iassert(varMap.count(op) > 0) << + "Property " << Expr(op) << " of " << op->tensor << " not found in varMap"; + if (is_ISPC_code_stream_enabled()) { + out2 << varMap[op]; + } + else { + out << varMap[op]; + } + +} + +void CodeGen_ISPC::visit(const Min* op) { + if (op->operands.size() == 1) { + op->operands[0].accept(this); + return; + } + for (size_t i=0; ioperands.size()-1; i++) { + stream << "TACO_MIN("; + op->operands[i].accept(this); + stream << ","; + } + op->operands.back().accept(this); + for (size_t i=0; ioperands.size()-1; i++) { + stream << ")"; + } +} + +void CodeGen_ISPC::visit(const Max* op) { + if (op->operands.size() == 1) { + op->operands[0].accept(this); + return; + } + for (size_t i=0; ioperands.size()-1; i++) { + stream << "TACO_MAX("; + op->operands[i].accept(this); + stream << ","; + } + op->operands.back().accept(this); + for (size_t i=0; ioperands.size()-1; i++) { + stream << ")"; + } +} + +void CodeGen_ISPC::visit(const Allocate* op) { + + + if (is_ISPC_code_stream_enabled()) { + string elementType = printCType(op->var.type(), false); + doIndent(); + + op->var.accept(this); + stream2 << " = "; + // stream2 << " = ("; + // stream2 << elementType << "*"; + // stream2 << ")"; + if (op->is_realloc) { + stream2 << "realloc("; + op->var.accept(this); + stream2 << ", "; + } + else { + // If the allocation was requested to clear the allocated memory, + // use calloc instead of malloc. 
+ if (op->clear) { + stream2 << "calloc(1, "; + } else { + stream2 << "new "; + } + } + stream2 << elementType << "["; + parentPrecedence = MUL; + op->num_elements.accept(this); + parentPrecedence = TOP; + stream2 << "];"; + stream2 << endl; + + + } else { + CodeGen_C::visit(op); + + } + + +} + +void CodeGen_ISPC::visit(const Sqrt* op) { + taco_tassert(op->type.isFloat() && op->type.getNumBits() == 64) << + "Codegen doesn't currently support non-double sqrt"; + stream << "sqrt("; + op->a.accept(this); + stream << ")"; +} + +void CodeGen_ISPC::visit(const Assign* op) { + if (is_ISPC_code_stream_enabled()) { + doIndent(); + op->lhs.accept(this); + parentPrecedence = Precedence::TOP; + bool printed = false; + if (simplify) { + if (isa(op->rhs)) { + auto add = to(op->rhs); + if (add->a == op->lhs) { + const Literal* lit = add->b.as(); + if (lit != nullptr && ((lit->type.isInt() && lit->equalsScalar(1)) || + (lit->type.isUInt() && lit->equalsScalar(1)))) { + stream2 << "++"; + } + else { + if (op->use_atomics) { + stream2 << " += reduce_add("; + add->b.accept(this); + stream2 << ")"; + } + else { + stream2 << " += "; + add->b.accept(this); + } + } + printed = true; + } + } + else if (isa(op->rhs)) { + auto mul = to(op->rhs); + if (mul->a == op->lhs) { + stream2 << " *= "; + mul->b.accept(this); + printed = true; + } + } + else if (isa(op->rhs)) { + auto bitOr = to(op->rhs); + if (bitOr->a == op->lhs) { + stream2 << " |= "; + bitOr->b.accept(this); + printed = true; + } + } + } + if (!printed) { + stream2 << " = "; + op->rhs.accept(this); + } + + stream2 << ";"; + stream2 << endl; + + IRPrinter::visit(op); + } + else { + CodeGen_C::visit(op); + + } + + +} + +void CodeGen_ISPC::visit(const Store* op) { + if (is_ISPC_code_stream_enabled()) { + if (op->use_atomics) { + doIndent(); + stream2 << getAtomicPragma() << endl; + } + } + else { + if (op->use_atomics) { + doIndent(); + stream << getAtomicPragma() << endl; + } + } + IRPrinter::visit(op); +} + +} +} diff --git 
a/src/codegen/codegen_ispc.h b/src/codegen/codegen_ispc.h
new file mode 100644
index 000000000..62d2897ca
--- /dev/null
+++ b/src/codegen/codegen_ispc.h
@@ -0,0 +1,68 @@
+#ifndef TACO_BACKEND_ISPC_H
+#define TACO_BACKEND_ISPC_H
+#include // NOTE(review): the three system header names appear to be missing in this copy (angle-bracket text lost) - confirm against the original patch
+#include
+#include
+
+#include "taco/ir/ir.h"
+#include "taco/ir/ir_printer.h"
+#include "codegen_c.h"
+
+namespace taco {
+namespace ir {
+
+
+class CodeGen_ISPC : public CodeGen_C {
+public:
+  /// Construct a code generator. The one-stream form hands `dest` to CodeGen_C as both
+  /// of its streams; the two-stream form passes `dest2` as the second one (presumably `out2`, which receives the ISPC text).
+  CodeGen_ISPC(std::ostream &dest, OutputKind outputKind, bool simplify=true);
+  CodeGen_ISPC(std::ostream &dest, std::ostream &dest2, OutputKind outputKind, bool simplify=true);
+  ~CodeGen_ISPC();
+
+  /// Compile a lowered function. When isFirst is set the C header text is written first, and the ISPC header text too if the two streams differ.
+  void compile(Stmt stmt, bool isFirst=false);
+
+  /// Generate shims that unpack an array of pointers representing
+  /// a mix of taco_tensor_t* and scalars into a function call
+  static void generateShim(const Stmt& func, std::stringstream &stream); // NOTE(review): no definition is visible in this part of the change, and the call in module.cpp is commented out
+
+protected:
+  using CodeGen_C::visit; // keeps the CodeGen_C visit overloads that are not redeclared below from being hidden
+
+  void visit(const Function*); // "assemble" is emitted entirely as C; any other function becomes a C wrapper plus ISPC code via printISPCFunc
+  void visit(const VarDecl*);
+  void visit(const Yield*);
+  void visit(const Var*); // on the ISPC stream prints the name recorded in varMap; otherwise defers to CodeGen_C
+  void visit(const For*); // ISPC stream only: Mul_Thread, Foreach or plain for; otherwise defers to CodeGen_C
+  void visit(const While*);
+  void visit(const GetProperty*); // prints the varMap name to out2 or out depending on the active stream
+  void visit(const Min*);
+  void visit(const Max*);
+  void visit(const Allocate*);
+  void visit(const Sqrt*);
+  void visit(const Store*);
+  void visit(const Assign*);
+
+  Stmt simplifyFunctionBodies(Stmt stmt); // NOTE(review): no definition is visible in this part of the change - confirm it exists
+  std::string printCallISPCFunc(const std::string& funcName, std::map varMap,
+                                std::vector &sortedProps); // returns the call statement "__<funcName>(args);" with args taken from varMap in sortedProps order
+  void printISPCFunc(const Function *func, std::map varMap,
+                     std::vector &sortedProps); // writes one "static task" function per CPUSpmd loop plus the exported "__<name>" function to out2
+
+  bool taskCode = false; // set by printISPCFunc while a task body is printed; when false, visit(For) emits a launch[4] call for Mul_Thread loops
+
+  std::stringstream funcVariables; // parameter-list text built by printISPCFunc and reused for the task and export signatures; NOTE(review): never cleared in the visible code
+  std::vector sortedProps; // filled by getSortedProps in visit(Function); fixes the argument order of the ISPC call
+
+  class FindVars; // IRVisitor defined in the .cpp: maps vars and tensor properties to unique generated names
+  class FunctionCollector; // IRVisitor defined in the .cpp: collects CPUSpmd loops and Init loops
+
+private:
+  virtual std::string restrictKeyword() const { return "restrict"; }
+  void sendToStream(std::stringstream &stream); // writes to out2 while the ISPC code stream is enabled, else calls CodeGen_C::sendToStream
+};
+
+} // namespace ir
+} // namespace taco
+#endif
diff --git
a/src/codegen/module.cpp b/src/codegen/module.cpp index bd0f487b1..6f631d40e 100644 --- a/src/codegen/module.cpp +++ b/src/codegen/module.cpp @@ -4,6 +4,7 @@ #include #include #include +// #include #if USE_OPENMP #include #endif @@ -13,6 +14,7 @@ #include "taco/util/strings.h" #include "taco/util/env.h" #include "codegen/codegen_c.h" +#include "codegen/codegen_ispc.h" #include "codegen/codegen_cuda.h" #include "taco/cuda.h" @@ -42,6 +44,7 @@ void Module::addFunction(Stmt func) { void Module::compileToSource(string path, string prefix) { if (!moduleFromUserSource) { + std::cout << "module not from user source\n"; // create a codegen instance and add all the funcs bool didGenRuntime = false; @@ -50,11 +53,13 @@ void Module::compileToSource(string path, string prefix) { header.clear(); source.str(""); source.clear(); + additional_source.str(""); + additional_source.clear(); taco_tassert(target.arch == Target::C99) << "Only C99 codegen supported currently"; std::shared_ptr sourcegen = - CodeGen::init_default(source, CodeGen::ImplementationGen); + CodeGen::init_default(source, additional_source, CodeGen::ImplementationGen); std::shared_ptr headergen = CodeGen::init_default(header, CodeGen::HeaderGen); @@ -68,8 +73,17 @@ void Module::compileToSource(string path, string prefix) { ofstream source_file; string file_ending = should_use_CUDA_codegen() ? 
".cu" : ".c"; source_file.open(path+prefix+file_ending); + if (should_use_ISPC_codegen()) { + source_file << "#include \"" << path+prefix+"_ispc.h\"\n"; + } source_file << source.str(); source_file.close(); + + ofstream additional_source_file; + string file_ending2 = ".ispc"; + additional_source_file.open(path+prefix+file_ending2); + additional_source_file << additional_source.str(); + additional_source_file.close(); ofstream header_file; header_file.open(path+prefix+".h"); @@ -89,6 +103,9 @@ void writeShims(vector funcs, string path, string prefix) { if (should_use_CUDA_codegen()) { CodeGen_CUDA::generateShim(func, shims); } + // else if (should_use_ISPC_codegen()) { + // CodeGen_ISPC::generateShim(func, shims); + // } else { CodeGen_C::generateShim(func, shims); } @@ -98,6 +115,9 @@ void writeShims(vector funcs, string path, string prefix) { if (should_use_CUDA_codegen()) { shims_file.open(path+prefix+"_shims.cpp"); } + // else if (should_use_ISPC_codegen()) { + // shims_file.open(path+prefix+".c", ios::app); + // } else { shims_file.open(path+prefix+".c", ios::app); } @@ -109,6 +129,7 @@ void writeShims(vector funcs, string path, string prefix) { } // anonymous namespace string Module::compile() { + std::cout << "Module::compile\n"; string prefix = tmpdir+libname; string fullpath = prefix + ".so"; @@ -123,6 +144,13 @@ string Module::compile() { file_ending = ".cu"; shims_file = prefix + "_shims.cpp"; } + // else if (should_use_ISPC_codegen()) { + // cc = util::getFromEnv("TACO_ISPC", "ispc"); + // cflags = util::getFromEnv("TACO_ISPC_FLAGS", + // " --target=sse2-i32x4,sse4-i32x8,avx1-i32x8,avx2-i32x8,avx512knl-i32x16,avx512skx-i32x16 --pic -O3 --addressing=64 --arch=x86-64" + // ) + " "; + + // } else { cc = util::getFromEnv(target.compiler_env, target.compiler); cflags = util::getFromEnv("TACO_CFLAGS", @@ -137,17 +165,55 @@ string Module::compile() { string cmd = cc + " " + cflags + " " + prefix + file_ending + " " + shims_file + " " + "-o " + fullpath + " 
-lm"; + std::cout << "--------------------------------------------------------------------------------tmpdir: " << tmpdir << std::endl; + std::cout << "--------------------------------------------------------------------------------libname: " << libname << std::endl; + std::cout << "--------------------------------------------------------------------------------prefix: " << prefix << std::endl; + std::cout << "--------------------------------------------------------------------------------fullpath: " << fullpath << std::endl; + std::cout << "--------------------------------------------------------------------------------cmd: " << cmd << std::endl; // open the output file & write out the source compileToSource(tmpdir, libname); + // write out the shims writeShims(funcs, tmpdir, libname); + for (auto &statement : funcs) { + std::cout << "----- statement --------" << std::endl; + // std::cout << statement; + std::cout << std::endl; + } + std::cout << tmpdir << std::endl << libname << std::endl; - // now compile it - int err = system(cmd.data()); - taco_uassert(err == 0) << "Compilation command failed:\n" << cmd - << "\nreturned " << err; + if (should_use_ISPC_codegen()) { + string ispc = util::getFromEnv("TACO_ISPC", "ispc"); + string ispcflags = util::getFromEnv("TACO_ISPC_FLAGS", + " --target=sse2-i32x4,sse4-i32x8,avx1-i32x8,avx2-i32x8,avx512knl-i32x16,avx512skx-i32x16 --pic -O3 --addressing=64 --arch=x86-64" + ) + " "; + string cmd = ispc + " " + ispcflags + " -o " + prefix + ".ispc.o " + " --emit-obj " + prefix + ".ispc " + "-h " + prefix + "_ispc.h"; + + // now compile the ispc file to generate the object file and the ispc header file + std::cout << "--------------------------------------------------------------------------------cmd: " << cmd << std::endl; + int err = system(cmd.data()); + taco_uassert(err == 0) << "Compilation command failed:\n" << cmd + << "\nreturned " << err; + + string ispc_object_file = " " + prefix + ".ispc.o "; + string 
ispc_object_files_for_diff_targets = " " + prefix + ".ispc_* "; + cmd = cc + " " + cflags + " " + + prefix + file_ending + " " + ispc_object_file + ispc_object_files_for_diff_targets + shims_file + " " + + "-o " + fullpath + " -lm -lrt "; + + // now compile the c file linking the ispc object file. ispc header is added to the top of the c file + std::cout << "--------------------------------------------------------------------------------cmd: " << cmd << std::endl; + err = system(cmd.data()); + taco_uassert(err == 0) << "Compilation command failed:\n" << cmd + << "\nreturned " << err; + } else { + // now compile it + int err = system(cmd.data()); + taco_uassert(err == 0) << "Compilation command failed:\n" << cmd + << "\nreturned " << err; + } // use dlsym() to open the compiled library if (lib_handle) { @@ -168,10 +234,61 @@ string Module::getSource() { return source.str(); } +void* Module::getFuncPtr(std::string& sofile, std::string name) { + std::cout << "opening shared object 1\n"; + if (so_lib_handle) { + dlclose(so_lib_handle); + } + std::cout << "opening shared object 2\n"; + so_lib_handle = dlopen(sofile.data(), RTLD_NOW | RTLD_LOCAL); + std::cout << "opening shared object : " << sofile << std::endl; + return dlsym(so_lib_handle, name.data()); +} + void* Module::getFuncPtr(std::string name) { return dlsym(lib_handle, name.data()); } +int Module::callFuncPackedRaw(std::string name, std::string& sofile, void** args) { + typedef int (*fnptr_t)(void**); + static_assert(sizeof(void*) == sizeof(fnptr_t), + "Unable to cast dlsym() returned void pointer to function pointer"); + void* v_func_ptr = getFuncPtr(sofile, name); + fnptr_t func_ptr; + *reinterpret_cast(&func_ptr) = v_func_ptr; + +#if USE_OPENMP + omp_sched_t existingSched; + ParallelSchedule tacoSched; + int existingChunkSize, tacoChunkSize; + int existingNumThreads = omp_get_max_threads(); + omp_get_schedule(&existingSched, &existingChunkSize); + taco_get_parallel_schedule(&tacoSched, &tacoChunkSize); + 
switch (tacoSched) { + case ParallelSchedule::Static: + omp_set_schedule(omp_sched_static, tacoChunkSize); + break; + case ParallelSchedule::Dynamic: + omp_set_schedule(omp_sched_dynamic, tacoChunkSize); + break; + default: + break; + } + omp_set_num_threads(taco_get_num_threads()); +#endif + + std::cout << "calling the function\n"; + int ret = func_ptr(args); + std::cout << "function call completed\n"; + +#if USE_OPENMP + omp_set_schedule(existingSched, existingChunkSize); + omp_set_num_threads(existingNumThreads); +#endif + + return ret; +} + int Module::callFuncPackedRaw(std::string name, void** args) { typedef int (*fnptr_t)(void**); static_assert(sizeof(void*) == sizeof(fnptr_t), @@ -200,7 +317,13 @@ int Module::callFuncPackedRaw(std::string name, void** args) { omp_set_num_threads(taco_get_num_threads()); #endif + std::cout << "calling the function\n"; + // CALLGRIND_START_INSTRUMENTATION; + // CALLGRIND_TOGGLE_COLLECT; int ret = func_ptr(args); + // CALLGRIND_TOGGLE_COLLECT; + // CALLGRIND_STOP_INSTRUMENTATION; + std::cout << "function call completed\n"; #if USE_OPENMP omp_set_schedule(existingSched, existingChunkSize); diff --git a/src/cuda.cpp b/src/cuda.cpp index 059c60105..68e49fe98 100644 --- a/src/cuda.cpp +++ b/src/cuda.cpp @@ -7,6 +7,25 @@ using namespace std; namespace taco { + +static bool ISPC_codegen_enabled = ISPC_BUILT; +static bool ISPC_code_stream_enabled = false; +bool should_use_ISPC_codegen() { + return ISPC_codegen_enabled; +} + +bool is_ISPC_code_stream_enabled() { + return ISPC_code_stream_enabled; +} + +void set_ISPC_codegen_enabled(bool enabled) { + ISPC_codegen_enabled = enabled; +} + +void set_ISPC_code_stream_enabled(bool enabled) { + ISPC_code_stream_enabled = enabled; +} + /// Functions used by taco to interface with CUDA (especially unified memory) static bool CUDA_codegen_enabled = CUDA_BUILT; static bool CUDA_unified_memory_enabled = CUDA_BUILT; diff --git a/src/index_notation/index_notation.cpp 
b/src/index_notation/index_notation.cpp index 51fb8770c..2e26460c7 100644 --- a/src/index_notation/index_notation.cpp +++ b/src/index_notation/index_notation.cpp @@ -2438,6 +2438,7 @@ bool isConcreteNotation(IndexStmt stmt, std::string* reason) { return isConcrete; } +// make reduction notation Assignment makeReductionNotation(Assignment assignment) { IndexExpr expr = assignment.getRhs(); std::vector free = assignment.getLhs().getIndexVars(); @@ -2513,7 +2514,10 @@ IndexStmt makeReductionNotation(IndexStmt stmt) { return makeReductionNotation(to(stmt)); } +// make concrete notation IndexStmt makeConcreteNotation(IndexStmt stmt) { + // std::cout << "concrete notation original assignment: " << stmt << std::endl; + std::string reason; taco_iassert(isReductionNotation(stmt, &reason)) << "Not reduction notation: " << stmt << std::endl << reason; @@ -2521,6 +2525,7 @@ IndexStmt makeConcreteNotation(IndexStmt stmt) { // Free variables and reductions covering the whole rhs become top level loops vector freeVars = to(stmt).getFreeVars(); + std::cout << "free vars: " << freeVars << std::endl; struct RemoveTopLevelReductions : IndexNotationRewriter { using IndexNotationRewriter::visit; @@ -2535,12 +2540,17 @@ IndexStmt makeConcreteNotation(IndexStmt stmt) { topLevelReductions.push_back(reduction.getVar()); rhs = reduction.getExpr(); } + // std::cout << "top level reductions: " << topLevelReductions << std::endl; if (rhs != node->rhs) { - stmt = Assignment(node->lhs, rhs, Add()); + stmt = Assignment(node->lhs, rhs, Add()); // write with add + int idx = 0; for (auto& i : util::reverse(topLevelReductions)) { + std::cout << idx << ": " << stmt << std::endl; + idx++; stmt = forall(i, stmt); } + std::cout << idx << ": " << stmt << std::endl; } else { stmt = node; @@ -2548,11 +2558,18 @@ IndexStmt makeConcreteNotation(IndexStmt stmt) { } }; stmt = RemoveTopLevelReductions().rewrite(stmt); + // std::cout << "after remove top level reductions: " << stmt << std::endl; + // now we form 
the stmt in reverse order of freeVars + int idx = 0; for (auto& i : util::reverse(freeVars)) { + std::cout << idx << ": " << stmt << std::endl; stmt = forall(i, stmt); + idx++; } + std::cout << idx << ": " << stmt << std::endl; + std::cout << "replacing reductions with whereas statements\n"; // Replace other reductions with where and forall statements struct ReplaceReductionsWithWheres : IndexNotationRewriter { using IndexNotationRewriter::visit; diff --git a/src/index_notation/index_notation_printer.cpp b/src/index_notation/index_notation_printer.cpp index 0b41615ad..d7ee998ae 100644 --- a/src/index_notation/index_notation_printer.cpp +++ b/src/index_notation/index_notation_printer.cpp @@ -224,9 +224,9 @@ void IndexNotationPrinter::visit(const YieldNode* op) { void IndexNotationPrinter::visit(const ForallNode* op) { os << "forall(" << op->indexVar << ", "; op->stmt.accept(this); - if (op->parallel_unit != ParallelUnit::NotParallel) { + // if (op->parallel_unit != ParallelUnit::NotParallel) { os << ", " << ParallelUnit_NAMES[(int) op->parallel_unit] << ", " << OutputRaceStrategy_NAMES[(int) op->output_race_strategy]; - } + // } os << ")"; } diff --git a/src/index_notation/transformations.cpp b/src/index_notation/transformations.cpp index 47fc1dd55..c1d82a9fd 100644 --- a/src/index_notation/transformations.cpp +++ b/src/index_notation/transformations.cpp @@ -1,9 +1,16 @@ #include "taco/index_notation/transformations.h" +#include "lower/iteration_graph.h" +#include "lower/tensor_path.h" +#include "taco/cuda.h" #include "taco/index_notation/index_notation.h" +#include "taco/index_notation/index_notation_nodes_abstract.h" #include "taco/index_notation/index_notation_rewriter.h" #include "taco/index_notation/index_notation_nodes.h" +#include "taco/index_notation/index_notation_printer.h" #include "taco/error/error_messages.h" +#include "taco/index_notation/intrinsic.h" +#include "taco/type.h" #include "taco/util/collections.h" #include "taco/lower/iterator.h" #include 
"taco/lower/merge_lattice.h" @@ -305,6 +312,7 @@ IndexStmt Precompute::apply(IndexStmt stmt, std::string* reason) const { IndexExpr e = precompute.getExpr(); IndexVar iw = precompute.getiw(); + // these lines of code looks interesting when creating the producer consumer relationship IndexStmt consumer = forall(i, replace(s, {{e, ws(i)}})); IndexStmt producer = forall(iw, Assignment(ws(iw), replace(e, {{i,iw}}), assign.getOperator())); @@ -592,7 +600,10 @@ IndexStmt Parallelize::apply(IndexStmt stmt, std::string* reason) const { std::string reason = ""; IndexStmt rewriteParallel(IndexStmt stmt) { + std::cout << "1 rewriting IndexStmt to support parallelize schedule directive\n--------------------------------------------\n"; + // std::cout << stmt << std::endl; provGraph = ProvenanceGraph(stmt); + std::cout << "2 rewriting IndexStmt to support parallelize schedule directive\n--------------------------------------------\n"; const auto reductionVars = getReductionVars(stmt); reductionIndexVars.clear(); @@ -607,15 +618,22 @@ IndexStmt Parallelize::apply(IndexStmt stmt, std::string* reason) const { tensorVars = createIRTensorVars(stmt); assembledByUngroupedInsert.clear(); + std::cout << "3 rewriting IndexStmt to support parallelize schedule directive\n--------------------------------------------\n"; for (const auto& result : getAssembledByUngroupedInsertion(stmt)) { assembledByUngroupedInsert.push_back(tensorVars[result]); } + std::cout << "4 rewriting IndexStmt to support parallelize schedule directive\n--------------------------------------------\n"; + // std::cout << stmt << std::endl; return rewrite(stmt); } void visit(const ForallNode* node) { + std::cout << "transformations.cpp void visit(const ForallNode* node)\n"; + std::cout << "node: \n" << node << std::endl; Forall foralli(node); + std::cout << "foralli: \n" << foralli << std::endl; + std::cout << "before stmt update stmt: \n" << stmt << std::endl; IndexVar i = parallelize.geti(); 
definedIndexVars.insert(foralli.getIndexVar()); @@ -632,6 +650,7 @@ IndexStmt Parallelize::apply(IndexStmt stmt, std::string* reason) const { Iterators iterators(foralli, tensorVars); MergeLattice lattice = MergeLattice::make(foralli, iterators, provGraph, definedIndexVars); + std::cout << "iter: " << i << ", lattice: \n" << lattice << std::endl; // Precondition 2: No coiteration of modes (i.e., merge lattice has // only one iterator) @@ -660,6 +679,7 @@ IndexStmt Parallelize::apply(IndexStmt stmt, std::string* reason) const { MergeLattice underivedLattice = MergeLattice::make(underivedForall, iterators, provGraph, definedIndexVars); + std::cout << "iter: " << i << ", underivedLattice: \n" << lattice << std::endl; // Precondition 3: Every result iterator must have insert capability for (Iterator iterator : underivedLattice.results()) { @@ -721,6 +741,7 @@ IndexStmt Parallelize::apply(IndexStmt stmt, std::string* reason) const { // build consumer that writes from temporary to output, mark consumer as parallel reduction ParallelUnit reductionUnit = ParallelUnit::CPUThreadGroupReduction; if (should_use_CUDA_codegen()) { + std::cout << "should_use_CUDA_codegen() true\n"; if (parentParallelUnits.count(ParallelUnit::GPUWarp)) { reductionUnit = ParallelUnit::GPUWarpReduction; } @@ -728,6 +749,9 @@ IndexStmt Parallelize::apply(IndexStmt stmt, std::string* reason) const { reductionUnit = ParallelUnit::GPUBlockReduction; } } + else { + std::cout << "should_use_CUDA_codegen() false\n"; + } IndexStmt consumer = forall(i, Assignment(assignment->lhs, w(i), assignment->op), reductionUnit, OutputRaceStrategy::ParallelReduction); precomputed_stmt = where(consumer, producer); } @@ -746,8 +770,9 @@ IndexStmt Parallelize::apply(IndexStmt stmt, std::string* reason) const { return; } - + std::cout << "updated stmt: \n"; stmt = forall(i, foralli.getStmt(), parallelize.getParallelUnit(), parallelize.getOutputRaceStrategy(), foralli.getUnrollFactor()); + std::cout << stmt << std::endl; 
return; } @@ -1181,6 +1206,7 @@ std::ostream& operator<<(std::ostream& os, IndexStmt parallelizeOuterLoop(IndexStmt stmt) { // get outer ForAll + std::cout << "get outer ForAll ----------------- \n"; Forall forall; bool matched = false; match(stmt, @@ -1215,7 +1241,19 @@ IndexStmt parallelizeOuterLoop(IndexStmt stmt) { } return parallelized256; } + else if (should_use_ISPC_codegen()) { + std::cout << "outer loop parallelization for ISPC codegen\n"; + // IndexStmt parallelized = Parallelize(forall.getIndexVar(), ParallelUnit::CPUSpmd, OutputRaceStrategy::NoRaces).apply(stmt, &reason); + // if (parallelized == IndexStmt()) { + // // can't parallelize + // return stmt; + // } + // return parallelized; + + return stmt; + } else { + std::cout << "outer loop parallelization for CPU codgen index statement\n"; IndexStmt parallelized = Parallelize(forall.getIndexVar(), ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces).apply(stmt, &reason); if (parallelized == IndexStmt()) { // can't parallelize @@ -1274,6 +1312,7 @@ static vector topologicallySort(map> hardDeps, map> softDeps, vector originalOrder) { + std::cout << "originalOrder: " << std::endl; vector sortedVars; unsigned long countVars = originalOrder.size(); while (sortedVars.size() < countVars) { @@ -1295,6 +1334,9 @@ topologicallySort(map> hardDeps, } // No free var found there is a cycle + std::cout << "this is where the assert fails\n"; + std::cout << "freeVarPos: " << freeVarPos << std::endl; + std::cout << "limit: " << std::numeric_limits::max() << std::endl; taco_iassert(freeVarPos != std::numeric_limits::max()) << "Cycles in iteration graphs must be resolved, through transpose, " << "before the expression is passed to the topological sorting " @@ -1320,8 +1362,674 @@ topologicallySort(map> hardDeps, return sortedVars; } +bool checkFromBack(const TensorPath& resultTensorPath, + const vector& tensorPaths, + string& removedAccessNode, + vector& producerVars, + vector& consumerVars, + vector& 
modifiedResultIndexesAccessed, + vector& sortedAllIndexes) { + + std::cout << "check from back function execution\n"; + + const std::vector& resultIndexesVisited = resultTensorPath.getVariables(); + IndexVar lastVisitedIndexVar = resultIndexesVisited.back(); + + std::cout << "last visited index variable: " << lastVisitedIndexVar << std::endl; + + bool onlyLastTensorContainLastIndexOfOutput = true; + bool fissionFromBack = false; + + // check from the back + for (unsigned long i=0; i& indexesVisited = otherIndexPaths.getVariables(); + cout << "index paths: " << otherIndexPaths << endl; + + // if (i < tensorPaths.size()-1) { + // check if other tensors also contain last index of output tensor + for (auto index : indexesVisited) { + cout << "checking " << index << " " << lastVisitedIndexVar << endl; + if (index == lastVisitedIndexVar) { + onlyLastTensorContainLastIndexOfOutput = false; + } + } + // } + } + + if (onlyLastTensorContainLastIndexOfOutput) { // last accessed tensorVariable + const TensorPath& otherIndexPaths = tensorPaths.back(); + const vector& indexesVisited = otherIndexPaths.getVariables(); + cout << "index paths: " << otherIndexPaths << endl; + + cout << "index variable maybe removed from the back\n"; + auto lastTensorLastVisited = indexesVisited.back(); + cout << "last index last visited " << lastTensorLastVisited << endl; + + if (lastTensorLastVisited == lastVisitedIndexVar) { + cout << "we can diffuse from the back\n"; + fissionFromBack = true; + removedAccessNode = otherIndexPaths.getAccess().getTensorVar().getName(); + cout << "removed access node " << removedAccessNode << endl; + + // mark producer accessed index variables + for (auto indexVar : sortedAllIndexes) { + if (indexVar != lastVisitedIndexVar) { // add everything except the last accessed index + std::cout << "producer vars: " << indexVar << std::endl; + producerVars.push_back(indexVar); + } + } + + for (auto indexVar : sortedAllIndexes) { + if (indexVar != lastVisitedIndexVar) { + if ( 
+ find(resultIndexesVisited.begin(), resultIndexesVisited.end(), indexVar) + != resultIndexesVisited.end() || + find(indexesVisited.begin(), indexesVisited.end(), indexVar) + != indexesVisited.end() + ) { + modifiedResultIndexesAccessed.push_back(indexVar); + } + } + } + + // // get modified index for the intermediate calculated tensor expression + // for (unsigned long j=0; j& tensorPaths, + string& removedAccessNode, + vector& producerVars, + vector& consumerVars, + vector& modifiedResultIndexesAccessed, + vector& sortedAllIndexes) { + + std::cout << "check from front function execution\n"; + + const std::vector& resultIndexesVisited = resultTensorPath.getVariables(); + IndexVar firstVisitedIndexVar = resultIndexesVisited.front(); + + std::cout << "first fisited index variable: " << firstVisitedIndexVar << std::endl; + std::cout << "tensor path size: " << tensorPaths.size() << std::endl; + + bool onlyFirstTensorContainFirstIndexOfOutput = true; + bool fissionFromFront = false; + + // check from the front + for (long i=tensorPaths.size()-1; i>0; i--) { // change tensor paths to recursively use the functionality + std::cout << "i: " << i << std::endl; + const TensorPath& otherIndexPaths = tensorPaths.at(i); + const vector& indexesVisited = otherIndexPaths.getVariables(); + cout << "index paths: " << otherIndexPaths << endl; + + if (i != 0) { // check if other tensors also contain last index of output tensor + for (auto index : indexesVisited) { + cout << "checking " << index << " " << firstVisitedIndexVar << endl; + if (index == firstVisitedIndexVar) { + onlyFirstTensorContainFirstIndexOfOutput = false; + } + } + } + } + + + if (onlyFirstTensorContainFirstIndexOfOutput) { // last accessed tensorVariable + const TensorPath& otherIndexPaths = tensorPaths.front(); + const vector& indexesVisited = otherIndexPaths.getVariables(); + cout << "index paths: " << otherIndexPaths << endl; + + cout << "index variable maybe removed from the front\n"; + auto 
firstTensorFirstVisited = indexesVisited.front(); + cout << "first index first visited " << firstTensorFirstVisited << endl; + + if (firstTensorFirstVisited == firstVisitedIndexVar) { + cout << "we can diffuse from the front\n"; + fissionFromFront = true; + removedAccessNode = otherIndexPaths.getAccess().getTensorVar().getName(); + cout << "removed access node " << removedAccessNode << endl; + + // mark producer accessed index variables + for (auto indexVar : sortedAllIndexes) { + if (indexVar != firstVisitedIndexVar) { // add everything except the first accessed index + producerVars.emplace_back(indexVar); + } + } + + for (auto indexVar : sortedAllIndexes) { + if (indexVar != firstVisitedIndexVar) { + if ( + find(resultIndexesVisited.begin(), resultIndexesVisited.end(), indexVar) + != resultIndexesVisited.end() || + find(indexesVisited.begin(), indexesVisited.end(), indexVar) + != indexesVisited.end() + ) { + modifiedResultIndexesAccessed.push_back(indexVar); + } + } + } + + std::cout << "printing modifiedResultIndexesAccessed\n"; + for (auto& idx : modifiedResultIndexesAccessed) { + std::cout << "modifiedResultIndexesAccessed: " << idx << std::endl; + } + std::cout << "printed modifiedResultIndexesAccessed\n"; + + // get modified index for the intermediate calculated tensor expression + // for (unsigned long j=0; j forallParallelUnit; + map forallOutputRaceStrategy; + vector sortedIndexes; + Assignment innerBody; + + SortedIndexVars() {}; + + void visit(const ForallNode* node) { + Forall forallNode(node); + IndexVar i = forallNode.getIndexVar(); + std::cout << forallNode << std::endl; + + sortedIndexes.push_back(i); + forallParallelUnit[i] = forallNode.getParallelUnit(); + forallOutputRaceStrategy[i] = forallNode.getOutputRaceStrategy(); + + if (isa(forallNode.getStmt())) { + cout << "assignment node found: " << forallNode.getStmt() << endl;; + innerBody = to(forallNode.getStmt()); + return; // Only reorder first contiguous section of ForAlls + } + + 
IndexNotationVisitor::visit(node); + } + }; + + std::cout << "traversing through the index statement\n"; + SortedIndexVars sortedIndexVars; + stmt.accept(&sortedIndexVars); + std::cout << std::endl; + + struct IndexExprBuilder : public IndexNotationVisitor { + + using IndexNotationVisitor::visit; + vector accessLeftToRight; + map>> indexDimensionsMap; + + void visit(const AccessNode* node) { + Access accessNode(node); + std::cout << "access node: " << accessNode << std::endl; + accessLeftToRight.push_back(accessNode); + + TensorVar tensorVar = accessNode.getTensorVar(); + + for (unsigned long i=0; i < accessNode.getIndexVars().size(); i++) { + auto var = accessNode.getIndexVars()[i]; + + if (indexDimensionsMap.find(var) != indexDimensionsMap.end()) { + indexDimensionsMap[var].emplace_back( + pair(tensorVar.getType().getShape().getDimension(i), + tensorVar.getType())); + } + else { + indexDimensionsMap[var] = { + pair( + tensorVar.getType().getShape().getDimension(i), + tensorVar.getType()) + }; + } + } + + } + + }; + + IndexExpr rhsExpr = assignment.getRhs(); + Access lhsAccess = to(assignment.getLhs()); + std::cout << "right hand side expression: " << rhsExpr << std::endl; + IndexExprBuilder indexExprBuilder; + rhsExpr.accept(&indexExprBuilder); + TensorVar resultVar = lhsAccess.getTensorVar(); + + for (auto item : indexExprBuilder.indexDimensionsMap) { + auto indexVar = item.first; + cout << "var: " << indexVar << " "; + for (auto elem : item.second) { + cout << elem.first << " " << elem.second << " " ; + } + cout << endl; + } + + + // now I have the iteration graph + IterationGraph iterationGraph = IterationGraph::make(assignment); + std::cout << "/*******************************************/\n"; + std::cout << "/********** ITERATION GRAPH ****************/\n"; + std::cout << "/*******************************************/\n"; + std::cout << iterationGraph << std::endl; + + const TensorPath& resultTensorPath = iterationGraph.getResultTensorPath(); + const 
std::vector& tensorPaths = iterationGraph.getTensorPaths(); + + + string removedAccessNode; + vector producerVars; // producer accessed index variables + vector consumerVars; // consumer accessed index variables + vector fusedVars; + vector modifiedResultIndexesAccessed; + bool fissionFromBack = false; + if (side == "b") { + fissionFromBack = true; + } + + if (fissionFromBack) { + fissionFromBack = checkFromBack(resultTensorPath, tensorPaths, + removedAccessNode, producerVars, consumerVars, + modifiedResultIndexesAccessed, sortedIndexVars.sortedIndexes + ); + } + + bool fissionFromFront = false; + if (side == "f") { + fissionFromFront = true; + } + if (fissionFromBack == false && fissionFromFront) { + fissionFromFront = checkFromFront(resultTensorPath, tensorPaths, + removedAccessNode, producerVars, consumerVars, + modifiedResultIndexesAccessed, sortedIndexVars.sortedIndexes + ); + } + + if (!fissionFromBack && !fissionFromFront) { + cout << "fission operation cannot be performed from the back\n"; + return stmt; + } + + vector newAccessDims{}; + for (auto var : modifiedResultIndexesAccessed) { + auto item = indexExprBuilder.indexDimensionsMap[var]; + cout << "shared vars: " << var << endl; + newAccessDims.emplace_back(item[0].first); + } + TensorVar newAccessVar(resultVar.getName() + "_inner", + Type(resultVar.getType().getDataType(), newAccessDims)); + cout << "new inner assignment statement: " << modifiedResultIndexesAccessed << std::endl; + Access newResultAccess(newAccessVar, modifiedResultIndexesAccessed); + cout << "new access variable for iterative apply: " << newResultAccess << std::endl; + + if (fissionFromBack) { + std::cout << "fission from the back is possible\n"; + } + if (fissionFromFront) { + std::cout << "fission from the front is possible\n"; + } + + // // check from the front + // struct IndexExprSeparator : public IndexNotationVisitor { + + // using IndexNotationVisitor::visit; + // vector accessLeftToRight; + + // void visit(const MulNode* node) 
{ + // Mul mulNode(node); + // IndexExpr lhs = mulNode.getA(); + // IndexExpr rhs = mulNode.getB(); + // std::cout << "access node: " << accessNode << std::endl; + // accessLeftToRight.push_back(accessNode); + // } + + // }; + + + cout << "\n\nProducer accessed index variables\n"; + auto it = producerVars.begin(); + for (; it != producerVars.end(); it++) { + cout << *it << endl; + } + cout << "\n\nConsumer accessed index variables\n"; + it = consumerVars.begin(); + for (; it != consumerVars.end(); it++) { + cout << *it << endl; + } + cout << endl << endl; + + // check common vars that can be fused + for (auto var : sortedIndexVars.sortedIndexes) { + if (find(producerVars.begin(), producerVars.end(), var) != producerVars.end() && + find(consumerVars.begin(), consumerVars.end(), var) != consumerVars.end()) { + fusedVars.emplace_back(var); + } + else { + break; + } + } + + for (auto& fv : fusedVars) { + std::cout << "fusable vars: " << fv << std::endl; + } + + vector sharedVars; + for (auto var : sortedIndexVars.sortedIndexes) { + if (find(fusedVars.begin(), fusedVars.end(), var) == fusedVars.end() && + find(producerVars.begin(), producerVars.end(), var) != producerVars.end() && + find(consumerVars.begin(), consumerVars.end(), var) != consumerVars.end() + ) { + sharedVars.emplace_back(var); + } + } + + for (auto& sv : sharedVars) { + std::cout << "shared vars: " << sv << std::endl; + } + + vector sharedDims{}; + for (auto var : sharedVars) { + auto item = indexExprBuilder.indexDimensionsMap[var]; + cout << "shared vars: " << var << endl; + sharedDims.emplace_back(item[0].first); + } + + + // get removing tensorvars and workspace dimension + const Type& type = resultTensorPath.getAccess().getTensorVar().getType(); + const Format& format = resultTensorPath.getAccess().getTensorVar().getFormat(); + TensorVar intermediateTensor("ws", type, format); + cout << intermediateTensor << endl; + + // TensorVar A("A", Type(), taco::dense); + TensorVar tempVar("t" + 
resultVar.getName(), + Type(resultVar.getType().getDataType(), sharedDims)); + cout << "tensor order: " << tempVar.getOrder() << endl; + cout << "tensor format: " << tempVar.getFormat() << endl; + cout << "format order: " << tempVar.getFormat().getOrder() << endl; + + // TensorVar* a = new TensorVar("A", Type()); + // TensorVar ws("ws", Type(type(), {jdim}) ); + + // get removing indexExpr and the rest of the indexExpr + Access workspace(tempVar, sharedVars); + std::cout << "workspace access tensor: " << workspace << std::endl; + + + + // construct producer expression right hand side + cout << "generating consumer expression\n"; + IndexExpr producerExpr; + int num_muls = 0; + for (Access accessNode : indexExprBuilder.accessLeftToRight) { + std::cout << "accessNodes: " << accessNode << endl; + if (removedAccessNode != accessNode.getTensorVar().getName()) { + if (producerExpr == NULL) { + std::cout << "index expression is null"; + producerExpr = accessNode; + std::cout << "producerExpr: " << producerExpr << std::endl; + } else { + num_muls++; + producerExpr = producerExpr * accessNode; + std::cout << "producerExpr: " << producerExpr << std::endl; + } + } + } + std::cout << producerExpr << std::endl; + Assignment producerAssignment(newResultAccess, + producerExpr); + std::cout << "new inner assignment statement: " << producerAssignment << std::endl; + Assignment producerInnerBody(workspace, + producerExpr, + sortedIndexVars.innerBody.getOperator() + ); + std::cout << "producerInnerBody: " << producerInnerBody << std::endl; + + // construct consumer expression right hand side + IndexExpr consumerExpr; + if (fissionFromBack) { + consumerExpr = workspace; + } + cout << "generating consumer expression: " << consumerExpr << std::endl; + for (Access accessNode : indexExprBuilder.accessLeftToRight) { + TensorVar tv = accessNode.getTensorVar(); + std::cout << "accessNodes: " << accessNode << endl; + if (removedAccessNode == accessNode.getTensorVar().getName()) { + if 
(consumerExpr == NULL) { + std::cout << "index expression is null"; + consumerExpr = accessNode; + std::cout << "consumerExpr: " << consumerExpr << std::endl; + } else { + consumerExpr = consumerExpr * accessNode; + std::cout << "consumerExpr: " << consumerExpr << std::endl; + } + } + } + if (fissionFromFront) { + consumerExpr = consumerExpr * workspace; + } + Assignment consumerInnerBody(lhsAccess, + consumerExpr, + sortedIndexVars.innerBody.getOperator() + ); + + cout << "Producer inner body: " << producerInnerBody << endl; + cout << "Consumer inner body: " << consumerInnerBody << endl; + + // rewrite indexstmt + // Reorder Foralls use a rewriter in case new nodes introduced outside of Forall + struct ProducerConsumerRewriter : public IndexNotationRewriter { + using IndexNotationRewriter::visit; + + const vector& producerConsumerVars; + const vector& fusedVars; + IndexStmt innerBody; + const map forallParallelUnit; + const map forallOutputRaceStrategy; + + ProducerConsumerRewriter(const vector& producerConsumerVars, + const vector& fusedVars, IndexStmt innerBody, + const map forallParallelUnit, + const map forallOutputRaceStrategy) + : producerConsumerVars(producerConsumerVars), fusedVars(fusedVars), innerBody(innerBody), + forallParallelUnit(forallParallelUnit), forallOutputRaceStrategy(forallOutputRaceStrategy) { + } + + void visit(const ForallNode* node) { + Forall foralli(node); + IndexVar i = foralli.getIndexVar(); + cout << "going through var: " << i << endl; + + // first forall must be in collected variables + // taco_iassert(util::contains(producerVars, i)); + // std::cout << "\ninner body of the statement\n" << innerBody; + // // done in reverse order? 
+ // for (auto it = sortedVars.rbegin(); it != sortedVars.rend(); ++it) { + // stmt = forall(*it, stmt, forallParallelUnit.at(*it), forallOutputRaceStrategy.at(*it), foralli.getUnrollFactor()); + // } + stmt = rewrite(foralli.getStmt()); + cout << "after rewrite statement: " << stmt << endl; + + // omit the index variables in the fusedVar list + if (find(fusedVars.begin(), fusedVars.end(), i) == fusedVars.end() && + find(producerConsumerVars.begin(), producerConsumerVars.end(), i) != producerConsumerVars.end()) { + stmt = forall(i, stmt, forallParallelUnit.at(i), forallOutputRaceStrategy.at(i), foralli.getUnrollFactor()); + } + } + + void visit (const AssignmentNode* node) { + cout << "assignment node: " << node << endl; + stmt = innerBody; + cout << "producerStmt: " << innerBody << endl; + cout << "stmt: " << stmt << endl; + } + + }; + ProducerConsumerRewriter producerRewriter(producerVars, fusedVars, + producerInnerBody, + sortedIndexVars.forallParallelUnit, + sortedIndexVars.forallOutputRaceStrategy); + IndexStmt producerStmt = producerRewriter.rewrite(stmt); + std::cout << "\nAfter Producer rewriter\n"; + std::cout << producerStmt << std::endl; + if (num_muls > 1) { + producerStmt = loopFusionOverFission(producerStmt, producerInnerBody, + side, iters-1); + } + + + ProducerConsumerRewriter consumerRewriter(consumerVars, fusedVars, + consumerInnerBody, + sortedIndexVars.forallParallelUnit, + sortedIndexVars.forallOutputRaceStrategy); + IndexStmt consumerStmt = consumerRewriter.rewrite(stmt); + std::cout << "\nAfter Consumer rewriter\n"; + std::cout << consumerStmt << std::endl; + + + struct CombineProducerConsumerRewriter : public IndexNotationRewriter { + + const vector& fusedVars; + IndexStmt consumerStmt; + IndexStmt producerStmt; + const map forallParallelUnit; + const map forallOutputRaceStrategy; + + CombineProducerConsumerRewriter(const vector& fusedVars, + IndexStmt producerStmt, IndexStmt consumerStmt, + const map forallParallelUnit, + const map 
forallOutputRaceStrategy) + : fusedVars(fusedVars), consumerStmt(consumerStmt), producerStmt(producerStmt), + forallParallelUnit(forallParallelUnit), + forallOutputRaceStrategy(forallOutputRaceStrategy) {} + + using IndexNotationRewriter::visit; + + void visit(const ForallNode* node) { + Forall foralli(node); + IndexVar i = foralli.getIndexVar(); + cout << "going through var: " << i << endl; + + // omit the index variables in the fusedVar list + if (find(fusedVars.begin(), fusedVars.end(), i) != fusedVars.end()) { + cout << "fused var in stmt\n"; + stmt = rewrite(foralli.getStmt()); + cout << "rewritten stmt: " << stmt << endl; + stmt = forall(i, stmt, forallParallelUnit.at(i), forallOutputRaceStrategy.at(i), foralli.getUnrollFactor()); + } + else { + cout << "fused var not in stmt\n"; + cout << "producerStmt: " << producerStmt << endl; + cout << "consumerStmt: " << consumerStmt << endl; + stmt = where(consumerStmt, producerStmt); + cout << "where stmt: " << stmt << endl; + } + + cout << "after rewrite statement: " << stmt << endl; + } + + }; + + CombineProducerConsumerRewriter combineRewriter(fusedVars, + producerStmt, consumerStmt, + sortedIndexVars.forallParallelUnit, + sortedIndexVars.forallOutputRaceStrategy); + IndexStmt combinedStmt = combineRewriter.rewrite(stmt); + std::cout << "\nAfter Combine rewriter\n"; + std::cout << combinedStmt << std::endl; + + + return combinedStmt; + +} + IndexStmt reorderLoopsTopologically(IndexStmt stmt) { + std::cout << "executing reorderLoopsTopologically\n"; // Collect tensorLevelVars which stores the pairs of IndexVar and tensor // level that each tensor is accessed at struct DAGBuilder : public IndexNotationVisitor { @@ -1382,8 +2090,11 @@ IndexStmt reorderLoopsTopologically(IndexStmt stmt) { }; Iterators iterators(stmt); + std::cout << "DAG builder with iterators" << std::endl; DAGBuilder dagBuilder(iterators); stmt.accept(&dagBuilder); + std::cout << "After DAGBuilder\n"; + std::cout << stmt << std::endl; // Construct 
tensor dependencies (sorted list of IndexVars) from tensorLevelVars map>> tensorVarOrders; @@ -1391,6 +2102,7 @@ IndexStmt reorderLoopsTopologically(IndexStmt stmt) { tensorVarOrders[tensorLevelVar.first] = varOrderFromTensorLevels(tensorLevelVar.second); } + // hard dependencies const auto hardDeps = depsFromVarOrders(tensorVarOrders); struct CollectSoftDependencies : public IndexNotationVisitor { @@ -1412,12 +2124,17 @@ IndexStmt reorderLoopsTopologically(IndexStmt stmt) { } } }; + // soft dependencies CollectSoftDependencies collectSoftDeps; stmt.accept(&collectSoftDeps); + std::cout << "After CollectSoftDependencies\n"; + std::cout << stmt << std::endl; + // topological sort const auto sortedVars = topologicallySort(hardDeps, collectSoftDeps.softDeps, dagBuilder.indexVarOriginalOrder); + // rewrite indexstmt // Reorder Foralls use a rewriter in case new nodes introduced outside of Forall struct TopoReorderRewriter : public IndexNotationRewriter { using IndexNotationRewriter::visit; @@ -1440,7 +2157,9 @@ IndexStmt reorderLoopsTopologically(IndexStmt stmt) { // first forall must be in collected variables taco_iassert(util::contains(sortedVars, i)); + std::cout << "\ninner body of the statement\n" << innerBody; stmt = innerBody; + // done in reverse order? 
for (auto it = sortedVars.rbegin(); it != sortedVars.rend(); ++it) { stmt = forall(*it, stmt, forallParallelUnit.at(*it), forallOutputRaceStrategy.at(*it), foralli.getUnrollFactor()); } @@ -1450,7 +2169,11 @@ IndexStmt reorderLoopsTopologically(IndexStmt stmt) { }; TopoReorderRewriter rewriter(sortedVars, dagBuilder.innerBody, dagBuilder.forallParallelUnit, dagBuilder.forallOutputRaceStrategy); - return rewriter.rewrite(stmt); + IndexStmt stmtChanged = rewriter.rewrite(stmt); + std::cout << "After TopoReorderRewriter\n"; + std::cout << stmtChanged << std::endl; + + return stmtChanged; } IndexStmt scalarPromote(IndexStmt stmt, ProvenanceGraph provGraph, @@ -1478,6 +2201,7 @@ IndexStmt scalarPromote(IndexStmt stmt, ProvenanceGraph provGraph, void visit(const ForallNode* node) { Forall foralli(node); + std::cout << "scalar promote: " << foralli << std::endl; IndexVar i = foralli.getIndexVar(); // Don't allow hoisting out of forall's for GPU warp and block reduction diff --git a/src/ir/ir_printer.cpp b/src/ir/ir_printer.cpp index a1997a9b7..eddca3f29 100644 --- a/src/ir/ir_printer.cpp +++ b/src/ir/ir_printer.cpp @@ -1,6 +1,7 @@ #include #include +#include "taco/cuda.h" #include "taco/ir/ir.h" #include "taco/ir/ir_printer.h" #include "taco/ir/simplify.h" @@ -34,7 +35,11 @@ IRPrinter::IRPrinter(ostream &s) : IRPrinter(s, false, false) { } IRPrinter::IRPrinter(ostream &s, bool color, bool simplify) - : stream(s), indent(0), color(color), simplify(simplify) { + : stream(s), stream2(s), indent(0), color(color), simplify(simplify) { +} + +IRPrinter::IRPrinter(ostream &s, ostream &s2, bool color, bool simplify) + : stream(s), stream2(s2), indent(0), color(color), simplify(simplify) { } IRPrinter::~IRPrinter() { @@ -59,79 +64,169 @@ void IRPrinter::print(Stmt stmt) { } void IRPrinter::visit(const Literal* op) { - if (color) { - stream << blue ; - } - - switch (op->type.getKind()) { - case Datatype::Bool: - stream << op->getValue(); - break; - case Datatype::UInt8: - stream << 
static_cast(op->getValue()); - break; - case Datatype::UInt16: - stream << op->getValue(); - break; - case Datatype::UInt32: - stream << op->getValue(); - break; - case Datatype::UInt64: - stream << op->getValue(); - break; - case Datatype::UInt128: - taco_not_supported_yet; - break; - case Datatype::Int8: - stream << static_cast(op->getValue()); - break; - case Datatype::Int16: - stream << op->getValue(); - break; - case Datatype::Int32: - stream << op->getValue(); - break; - case Datatype::Int64: - stream << op->getValue(); - break; - case Datatype::Int128: - taco_not_supported_yet; - break; - case Datatype::Float32: - stream << ((op->getValue() != 0.0) - ? util::toString(op->getValue()) : "0.0"); - break; - case Datatype::Float64: - stream << ((op->getValue()!=0.0) - ? util::toString(op->getValue()) : "0.0"); - break; - case Datatype::Complex64: { - std::complex val = op->getValue>(); - stream << val.real() << " + I*" << val.imag(); - } - break; - case Datatype::Complex128: { - std::complex val = op->getValue>(); - stream << val.real() << " + I*" << val.imag(); + if (is_ISPC_code_stream_enabled()) { + if (color) { + stream2 << blue ; + } + + // It seems this is where all the types get printed in the final code generation. 
+ // Come up with a way to generate different values if stream2 is used to generate ispc code + switch (op->type.getKind()) { + case Datatype::Bool: + stream2 << op->getValue(); + break; + case Datatype::UInt8: + stream2 << static_cast(op->getValue()); + break; + case Datatype::UInt16: + stream2 << op->getValue(); + break; + case Datatype::UInt32: + stream2 << op->getValue(); + break; + case Datatype::UInt64: + stream2 << op->getValue(); + break; + case Datatype::UInt128: + taco_not_supported_yet; + break; + case Datatype::Int8: + stream2 << static_cast(op->getValue()); + break; + case Datatype::Int16: + stream2 << op->getValue(); + break; + case Datatype::Int32: + stream2 << op->getValue(); + break; + case Datatype::Int64: + stream2 << op->getValue(); + break; + case Datatype::Int128: + taco_not_supported_yet; + break; + case Datatype::Float32: + stream2 << ((op->getValue() != 0.0) + ? util::toString(op->getValue()) : "0.0"); + break; + case Datatype::Float64: + stream2 << ((op->getValue()!=0.0) + ? util::toString(op->getValue()) : "0.0"); + break; + case Datatype::Complex64: { + std::complex val = op->getValue>(); + stream2 << val.real() << " + I*" << val.imag(); + } + break; + case Datatype::Complex128: { + std::complex val = op->getValue>(); + stream2 << val.real() << " + I*" << val.imag(); + } + break; + case Datatype::Undefined: + taco_ierror << "Undefined type in IR"; + break; + } + + if (color) { + stream2 << nc; + } } - break; - case Datatype::Undefined: - taco_ierror << "Undefined type in IR"; - break; - } - if (color) { - stream << nc; + + + else { + + if (color) { + stream << blue ; + } + + // It seems this is where all the types get printed in the final code generation. 
+ // Come up with a way to generate different values if stream2 is used to generate ispc code + switch (op->type.getKind()) { + case Datatype::Bool: + stream << op->getValue(); + break; + case Datatype::UInt8: + stream << static_cast(op->getValue()); + break; + case Datatype::UInt16: + stream << op->getValue(); + break; + case Datatype::UInt32: + stream << op->getValue(); + break; + case Datatype::UInt64: + stream << op->getValue(); + break; + case Datatype::UInt128: + taco_not_supported_yet; + break; + case Datatype::Int8: + stream << static_cast(op->getValue()); + break; + case Datatype::Int16: + stream << op->getValue(); + break; + case Datatype::Int32: + stream << op->getValue(); + break; + case Datatype::Int64: + stream << op->getValue(); + break; + case Datatype::Int128: + taco_not_supported_yet; + break; + case Datatype::Float32: + stream << ((op->getValue() != 0.0) + ? util::toString(op->getValue()) : "0.0"); + break; + case Datatype::Float64: + stream << ((op->getValue()!=0.0) + ? 
util::toString(op->getValue()) : "0.0"); + break; + case Datatype::Complex64: { + std::complex val = op->getValue>(); + stream << val.real() << " + I*" << val.imag(); + } + break; + case Datatype::Complex128: { + std::complex val = op->getValue>(); + stream << val.real() << " + I*" << val.imag(); + } + break; + case Datatype::Undefined: + taco_ierror << "Undefined type in IR"; + break; + } + + if (color) { + stream << nc; + } + + } + } void IRPrinter::visit(const Var* op) { - if (varNames.contains(op)) { - stream << varNames.get(op); + if (is_ISPC_code_stream_enabled()) { + if (varNames.contains(op)) { + stream2 << varNames.get(op); + } + else { + stream2 << op->name; + } } else { - stream << op->name; + if (varNames.contains(op)) { + stream << varNames.get(op); + } + else { + stream << op->name; + } } + } void IRPrinter::visit(const Neg* op) { @@ -238,51 +333,101 @@ void IRPrinter::visit(const Cast* op) { } void IRPrinter::visit(const Call* op) { - stream << op->func << "("; - parentPrecedence = Precedence::CALL; - acceptJoin(this, stream, op->args, ", "); - stream << ")"; + if (!is_ISPC_code_stream_enabled()) { + stream << op->func << "("; + parentPrecedence = Precedence::CALL; + acceptJoin(this, stream, op->args, ", "); + stream << ")"; + } else { + // statically added function to the ispc file has __ in the front + stream2 << "__" << op->func << "("; + parentPrecedence = Precedence::CALL; + acceptJoin(this, stream2, op->args, ", "); + stream2 << ")"; + } } void IRPrinter::visit(const IfThenElse* op) { taco_iassert(op->cond.defined()); taco_iassert(op->then.defined()); doIndent(); - stream << keywordString("if "); - stream << "("; - parentPrecedence = Precedence::TOP; - op->cond.accept(this); - stream << ")"; + if (is_ISPC_code_stream_enabled()) { + stream2 << keywordString("if "); + stream2 << "("; + parentPrecedence = Precedence::TOP; + op->cond.accept(this); + stream2 << ")"; + + Stmt scopedStmt = Stmt(to(op->then)->scopedStmt); + if (isa(scopedStmt)) { + 
stream2 << " {" << endl; + op->then.accept(this); + doIndent(); + stream2 << "}"; + } + else if (isa(scopedStmt)) { + int tmp = indent; + indent = 0; + stream2 << " "; + scopedStmt.accept(this); + indent = tmp; + } + else { + stream2 << endl; + op->then.accept(this); + } - Stmt scopedStmt = Stmt(to(op->then)->scopedStmt); - if (isa(scopedStmt)) { - stream << " {" << endl; - op->then.accept(this); - doIndent(); - stream << "}"; - } - else if (isa(scopedStmt)) { - int tmp = indent; - indent = 0; - stream << " "; - scopedStmt.accept(this); - indent = tmp; + if (op->otherwise.defined()) { + stream2 << "\n"; + doIndent(); + stream2 << keywordString("else"); + stream2 << " {\n"; + op->otherwise.accept(this); + doIndent(); + stream2 << "}"; + } + stream2 << endl; } + + else { - stream << endl; - op->then.accept(this); - } + stream << keywordString("if "); + stream << "("; + parentPrecedence = Precedence::TOP; + op->cond.accept(this); + stream << ")"; - if (op->otherwise.defined()) { - stream << "\n"; - doIndent(); - stream << keywordString("else"); - stream << " {\n"; - op->otherwise.accept(this); - doIndent(); - stream << "}"; + Stmt scopedStmt = Stmt(to(op->then)->scopedStmt); + if (isa(scopedStmt)) { + stream << " {" << endl; + op->then.accept(this); + doIndent(); + stream << "}"; + } + else if (isa(scopedStmt)) { + int tmp = indent; + indent = 0; + stream << " "; + scopedStmt.accept(this); + indent = tmp; + } + else { + stream << endl; + op->then.accept(this); + } + + if (op->otherwise.defined()) { + stream << "\n"; + doIndent(); + stream << keywordString("else"); + stream << " {\n"; + op->otherwise.accept(this); + doIndent(); + stream << "}"; + } + stream << endl; } - stream << endl; + } void IRPrinter::visit(const Case* op) { @@ -345,12 +490,22 @@ void IRPrinter::visit(const Switch* op) { } void IRPrinter::visit(const Load* op) { - parentPrecedence = Precedence::LOAD; - op->arr.accept(this); - stream << "["; - parentPrecedence = Precedence::LOAD; - 
op->loc.accept(this); - stream << "]"; + if (is_ISPC_code_stream_enabled()) { + parentPrecedence = Precedence::LOAD; + op->arr.accept(this); + stream2 << "["; + parentPrecedence = Precedence::LOAD; + op->loc.accept(this); + stream2 << "]"; + } + else { + parentPrecedence = Precedence::LOAD; + op->arr.accept(this); + stream << "["; + parentPrecedence = Precedence::LOAD; + op->loc.accept(this); + stream << "]"; + } } void IRPrinter::visit(const Malloc* op) { @@ -367,66 +522,149 @@ void IRPrinter::visit(const Sizeof* op) { } void IRPrinter::visit(const Store* op) { - doIndent(); - op->arr.accept(this); - stream << "["; - parentPrecedence = Precedence::TOP; - op->loc.accept(this); - stream << "] = "; - parentPrecedence = Precedence::TOP; - op->data.accept(this); - stream << ";"; - stream << endl; + if (is_ISPC_code_stream_enabled()) { + doIndent(); + op->arr.accept(this); + stream2 << "["; + parentPrecedence = Precedence::TOP; + op->loc.accept(this); + stream2 << "] = "; + parentPrecedence = Precedence::TOP; + op->data.accept(this); + stream2 << ";"; + stream2 << endl; + } + else { + doIndent(); + op->arr.accept(this); + stream << "["; + parentPrecedence = Precedence::TOP; + op->loc.accept(this); + stream << "] = "; + parentPrecedence = Precedence::TOP; + op->data.accept(this); + stream << ";"; + stream << endl; + } + } void IRPrinter::visit(const For* op) { - doIndent(); - stream << keywordString("for") << " (" - << keywordString(util::toString(op->var.type())) << " "; - op->var.accept(this); - stream << " = "; - op->start.accept(this); - stream << keywordString("; "); - op->var.accept(this); - stream << " < "; - parentPrecedence = BOTTOM; - op->end.accept(this); - stream << keywordString("; "); - op->var.accept(this); + // std::cout << "This is IRPrinter::visit For op method\n"; + if (is_ISPC_code_stream_enabled()) { + doIndent(); + stream2 << keywordString("for") << " (" + << keywordString(util::toString(op->var.type())) << " "; + op->var.accept(this); + stream2 << 
" = "; + op->start.accept(this); + stream2 << keywordString("; "); + op->var.accept(this); + stream2 << " < "; + parentPrecedence = BOTTOM; + op->end.accept(this); + stream2 << keywordString("; "); + op->var.accept(this); + + auto lit = op->increment.as(); + if (lit != nullptr && ((lit->type.isInt() && lit->equalsScalar(1)) || + (lit->type.isUInt() && lit->equalsScalar(1)))) { + stream2 << "++"; + } + else { + stream2 << " += "; + op->increment.accept(this); + } + stream2 << ") {\n"; - auto lit = op->increment.as(); - if (lit != nullptr && ((lit->type.isInt() && lit->equalsScalar(1)) || - (lit->type.isUInt() && lit->equalsScalar(1)))) { - stream << "++"; + op->contents.accept(this); + doIndent(); + stream2 << "}"; + stream2 << endl; } + + else { - stream << " += "; - op->increment.accept(this); + doIndent(); + stream << keywordString("for") << " (" + << keywordString(util::toString(op->var.type())) << " "; + op->var.accept(this); + stream << " = "; + op->start.accept(this); + stream << keywordString("; "); + op->var.accept(this); + stream << " < "; + parentPrecedence = BOTTOM; + op->end.accept(this); + stream << keywordString("; "); + op->var.accept(this); + + auto lit = op->increment.as(); + if (lit != nullptr && ((lit->type.isInt() && lit->equalsScalar(1)) || + (lit->type.isUInt() && lit->equalsScalar(1)))) { + stream << "++"; + } + else { + stream << " += "; + op->increment.accept(this); + } + stream << ") {\n"; + + op->contents.accept(this); + doIndent(); + stream << "}"; + stream << endl; } - stream << ") {\n"; - op->contents.accept(this); - doIndent(); - stream << "}"; - stream << endl; +} + +void IRPrinter::sendToStream(std::stringstream &stream) { + if (is_ISPC_code_stream_enabled()) { + this->stream2 << stream.str(); + } + else { + this->stream << stream.str(); + } } void IRPrinter::visit(const While* op) { - doIndent(); - stream << keywordString("while "); - stream << "("; - parentPrecedence = Precedence::TOP; - op->cond.accept(this); - stream << ")"; - 
stream << " {\n"; - op->contents.accept(this); - doIndent(); - stream << "}"; - stream << endl; + // std::stringstream stream; + if (is_ISPC_code_stream_enabled()) { + doIndent(); + stream2 << keywordString("while "); + stream2 << "("; + parentPrecedence = Precedence::TOP; + op->cond.accept(this); + stream2 << ")"; + stream2 << " {\n"; + op->contents.accept(this); + doIndent(); + stream2 << "}"; + stream2 << endl; + } + else { + doIndent(); + stream << keywordString("while "); + stream << "("; + parentPrecedence = Precedence::TOP; + op->cond.accept(this); + stream << ")"; + stream << " {\n"; + op->contents.accept(this); + doIndent(); + stream << "}"; + stream << endl; + } + // sendToStream(stream); } void IRPrinter::visit(const Block* op) { - acceptJoin(this, stream, op->contents, ""); + if (is_ISPC_code_stream_enabled()) { + acceptJoin(this, stream2, op->contents, ""); + } + else { + acceptJoin(this, stream, op->contents, ""); + } } void IRPrinter::visit(const Scope* op) { @@ -438,85 +676,140 @@ void IRPrinter::visit(const Scope* op) { } void IRPrinter::visit(const Function* op) { - stream << keywordString("void ") << op->name; - stream << "("; - if (op->outputs.size() > 0) stream << "Tensor "; - acceptJoin(this, stream, op->outputs, ", Tensor "); - if (op->outputs.size() > 0 && op->inputs.size()) stream << ", "; - if (op->inputs.size() > 0) stream << "Tensor "; - acceptJoin(this, stream, op->inputs, ", Tensor "); - stream << ") {" << endl; + if (is_ISPC_code_stream_enabled()) { + stream2 << keywordString("void ") << op->name; + stream2 << "("; + if (op->outputs.size() > 0) stream2 << "Tensor "; + acceptJoin(this, stream2, op->outputs, ", Tensor "); + if (op->outputs.size() > 0 && op->inputs.size()) stream2 << ", "; + if (op->inputs.size() > 0) stream2 << "Tensor "; + acceptJoin(this, stream2, op->inputs, ", Tensor "); + stream2 << ") {" << endl; + + resetNameCounters(); + op->body.accept(this); - resetNameCounters(); - op->body.accept(this); + doIndent(); + 
stream2 << "}"; + } + else { + stream << keywordString("void ") << op->name; + stream << "("; + if (op->outputs.size() > 0) stream << "Tensor "; + acceptJoin(this, stream, op->outputs, ", Tensor "); + if (op->outputs.size() > 0 && op->inputs.size()) stream << ", "; + if (op->inputs.size() > 0) stream << "Tensor "; + acceptJoin(this, stream, op->inputs, ", Tensor "); + stream << ") {" << endl; + + resetNameCounters(); + op->body.accept(this); + + doIndent(); + stream << "}"; + } - doIndent(); - stream << "}"; } void IRPrinter::visit(const VarDecl* op) { - doIndent(); - stream << keywordString(util::toString(op->var.type())); - taco_iassert(isa(op->var)); - if (to(op->var)->is_ptr) { - stream << "* restrict"; - } - stream << " "; - string varName = varNameGenerator.getUniqueName(util::toString(op->var)); - varNames.insert({op->var, varName}); - op->var.accept(this); - parentPrecedence = Precedence::TOP; - stream << " = "; - op->rhs.accept(this); - stream << ";"; - stream << endl; + if (is_ISPC_code_stream_enabled()) { + doIndent(); + if (op->var.type() == Int32) { + stream2 << keywordString("int32"); + } + else if (op->var.type() == Int64) { + stream2 << keywordString("int64"); + } else { + stream2 << keywordString(util::toString(op->var.type())); + } + taco_iassert(isa(op->var)); + if (to(op->var)->is_ptr) { + stream2 << "* "; // removed restrict keyword from here + } + stream2 << " "; + string varName = varNameGenerator.getUniqueName(util::toString(op->var)); + varNames.insert({op->var, varName}); + op->var.accept(this); + parentPrecedence = Precedence::TOP; + stream2 << " = "; + op->rhs.accept(this); + stream2 << ";"; + stream2 << endl; + } + else { + doIndent(); + stream << keywordString(util::toString(op->var.type())); + taco_iassert(isa(op->var)); + if (to(op->var)->is_ptr) { + stream << "* restrict"; + } + stream << " "; + string varName = varNameGenerator.getUniqueName(util::toString(op->var)); + varNames.insert({op->var, varName}); + op->var.accept(this); + 
parentPrecedence = Precedence::TOP; + stream << " = "; + op->rhs.accept(this); + stream << ";"; + stream << endl; + } + } void IRPrinter::visit(const Assign* op) { - doIndent(); - op->lhs.accept(this); - parentPrecedence = Precedence::TOP; - bool printed = false; - if (simplify) { - if (isa(op->rhs)) { - auto add = to(op->rhs); - if (add->a == op->lhs) { - const Literal* lit = add->b.as(); - if (lit != nullptr && ((lit->type.isInt() && lit->equalsScalar(1)) || - (lit->type.isUInt() && lit->equalsScalar(1)))) { - stream << "++"; + if (is_ISPC_code_stream_enabled()) { + + } + + + + else { + doIndent(); + op->lhs.accept(this); + parentPrecedence = Precedence::TOP; + bool printed = false; + if (simplify) { + if (isa(op->rhs)) { + auto add = to(op->rhs); + if (add->a == op->lhs) { + const Literal* lit = add->b.as(); + if (lit != nullptr && ((lit->type.isInt() && lit->equalsScalar(1)) || + (lit->type.isUInt() && lit->equalsScalar(1)))) { + stream << "++"; + } + else { + stream << " += "; + add->b.accept(this); + } + printed = true; } - else { - stream << " += "; - add->b.accept(this); + } + else if (isa(op->rhs)) { + auto mul = to(op->rhs); + if (mul->a == op->lhs) { + stream << " *= "; + mul->b.accept(this); + printed = true; } - printed = true; } - } - else if (isa(op->rhs)) { - auto mul = to(op->rhs); - if (mul->a == op->lhs) { - stream << " *= "; - mul->b.accept(this); - printed = true; + else if (isa(op->rhs)) { + auto bitOr = to(op->rhs); + if (bitOr->a == op->lhs) { + stream << " |= "; + bitOr->b.accept(this); + printed = true; + } } } - else if (isa(op->rhs)) { - auto bitOr = to(op->rhs); - if (bitOr->a == op->lhs) { - stream << " |= "; - bitOr->b.accept(this); - printed = true; - } + if (!printed) { + stream << " = "; + op->rhs.accept(this); } - } - if (!printed) { - stream << " = "; - op->rhs.accept(this); + + stream << ";"; + stream << endl; } - stream << ";"; - stream << endl; } void IRPrinter::visit(const Yield* op) { @@ -544,12 +837,22 @@ void 
IRPrinter::visit(const Allocate* op) { } void IRPrinter::visit(const Free* op) { - doIndent(); - stream << "free("; - parentPrecedence = Precedence::TOP; - op->var.accept(this); - stream << ");"; - stream << endl; + if (is_ISPC_code_stream_enabled()) { + doIndent(); + stream2 << "delete[] "; + parentPrecedence = Precedence::TOP; + op->var.accept(this); + stream2 << ";"; + stream2 << endl; + } + else { + doIndent(); + stream << "free("; + parentPrecedence = Precedence::TOP; + op->var.accept(this); + stream << ");"; + stream << endl; + } } void IRPrinter::visit(const Comment* op) { @@ -559,17 +862,32 @@ void IRPrinter::visit(const Comment* op) { } void IRPrinter::visit(const BlankLine*) { - stream << endl; + if (is_ISPC_code_stream_enabled()) { + stream2 << endl; + } + else { + stream << endl; + } } void IRPrinter::visit(const Continue*) { doIndent(); - stream << "continue;" << endl; + if (!is_ISPC_code_stream_enabled()) { + stream << "continue;" << endl; + } + else { + stream2 << "continue;" << endl; + } } void IRPrinter::visit(const Break*) { doIndent(); - stream << "break;" << endl; + if (!is_ISPC_code_stream_enabled()) { + stream << "break;" << endl; + } + else { + stream2 << "break;" << endl; + } } void IRPrinter::visit(const Print* op) { @@ -585,7 +903,12 @@ void IRPrinter::visit(const Print* op) { } void IRPrinter::visit(const GetProperty* op) { - stream << op->name; + if (is_ISPC_code_stream_enabled()) { + stream2 << op->name; + } + else { + stream << op->name; + } } void IRPrinter::visit(const Sort* op) { @@ -643,23 +966,47 @@ void IRPrinter::resetNameCounters() { } void IRPrinter::doIndent() { - for (int i=0; ivar); Expr start = rewrite(op->start); Expr end = rewrite(op->end); diff --git a/src/ir_tags.cpp b/src/ir_tags.cpp index af3dbd775..e7365d6c2 100644 --- a/src/ir_tags.cpp +++ b/src/ir_tags.cpp @@ -2,7 +2,7 @@ namespace taco { -const char *ParallelUnit_NAMES[] = {"NotParallel", "DefaultUnit", "GPUBlock", "GPUWarp", "GPUThread", "CPUThread", 
"CPUVector", "CPUThreadGroupReduction", "GPUBlockReduction", "GPUWarpReduction"}; +const char *ParallelUnit_NAMES[] = {"NotParallel", "DefaultUnit", "GPUBlock", "GPUWarp", "GPUThread", "CPUThread", "CPUVector", "CPUThreadGroupReduction", "GPUBlockReduction", "GPUWarpReduction", "CPUSimd", "CPUSpmd"}; const char *OutputRaceStrategy_NAMES[] = {"IgnoreRaces", "NoRaces", "Atomics", "Temporary", "ParallelReduction"}; const char *BoundType_NAMES[] = {"MinExact", "MinConstraint", "MaxExact", "MaxConstraint"}; const char *AssembleStrategy_NAMES[] = {"Append", "Insert"}; diff --git a/src/lower/iteration_graph.cpp b/src/lower/iteration_graph.cpp index 77735a8d2..482d84aae 100644 --- a/src/lower/iteration_graph.cpp +++ b/src/lower/iteration_graph.cpp @@ -48,6 +48,8 @@ struct IterationGraph::Content { IterationGraph::IterationGraph() { } +// remember that iteration graph does not have an ordering +// I got the ordering from topologically reorder index Ryan wrote IterationGraph IterationGraph::make(Assignment assignment) { TensorVar tensor = assignment.getLhs().getTensorVar(); IndexExpr expr = assignment.getRhs(); @@ -64,8 +66,16 @@ IterationGraph IterationGraph::make(Assignment assignment) { oldToSplitVar.insert({indexVar, indexVar}); } + // access nodes of right hand side match(expr, function([&](const AccessNode* op) { + std::cout << "access node: " << op->tensorVar << " <- " << IndexExpr(op) << std::endl; + std::cout << "index var: "; + for (auto indexVar : op->indexVars) { + std::cout << indexVar << " "; + } + std::cout << std::endl; + auto type = op->tensorVar.getType(); taco_iassert((size_t)type.getShape().getOrder() == op->indexVars.size()) << "Tensor access " << IndexExpr(op) << " but tensor format only has " diff --git a/src/lower/iterator.cpp b/src/lower/iterator.cpp index 0f0c024c5..eb3d8ac3b 100644 --- a/src/lower/iterator.cpp +++ b/src/lower/iterator.cpp @@ -569,6 +569,9 @@ void Iterators::createAccessIterators(Access access, Format format, Expr tensorI 
ProvenanceGraph provGraph, const map &tensorVars) { TensorVar tensorConcrete = access.getTensorVar(); + cout << "tensor: " << tensorConcrete << " " ; + cout << "tensorConcrete order: " << tensorConcrete.getOrder(); + cout << ", format order: " << format.getOrder() << endl; taco_iassert(tensorConcrete.getOrder() == format.getOrder()) << tensorConcrete << ", Format" << format; Shape shape = tensorConcrete.getType().getShape(); diff --git a/src/lower/lowerer_impl_imperative.cpp b/src/lower/lowerer_impl_imperative.cpp index b4c9ea710..1355c80a1 100644 --- a/src/lower/lowerer_impl_imperative.cpp +++ b/src/lower/lowerer_impl_imperative.cpp @@ -1,4 +1,6 @@ #include +#include "taco/cuda.h" +#include "taco/ir_tags.h" #include "taco/lower/lowerer_impl_imperative.h" #include "taco/lower/lowerer_impl.h" @@ -26,6 +28,7 @@ class LowererImplImperative::Visitor : public IndexNotationVisitorStrict { public: Visitor(LowererImplImperative* impl) : impl(impl) {} Stmt lower(IndexStmt stmt) { + // std::cout << "lowering IndexStmt to ir:Stmt - IndexStmt: " << stmt << std::endl; this->stmt = Stmt(); impl->accessibleIterators.scope(); IndexStmtVisitorStrict::visit(stmt); @@ -135,6 +138,7 @@ static bool returnsTrue(IndexExpr expr) { } void visit(const CastNode* op) { + std::cout << "visiting cast node\n"; expr = rewrite(op->a); } @@ -200,6 +204,7 @@ static std::set hasSparseInserts(IndexStmt stmt, Iterators iterators, return ret; } + Stmt LowererImplImperative::lower(IndexStmt stmt, string name, bool assemble, bool compute, bool pack, bool unpack) @@ -414,6 +419,7 @@ LowererImplImperative::lower(IndexStmt stmt, string name, Stmt LowererImplImperative::lowerAssignment(Assignment assignment) { + // std::cout << "\n\n converting assignment IndexStmt============================================ Assignment\n"; taco_iassert(generateAssembleCode() || generateComputeCode()); Stmt computeStmt; @@ -421,7 +427,7 @@ Stmt LowererImplImperative::lowerAssignment(Assignment assignment) Expr var = 
getTensorVar(result); const bool needComputeAssign = util::contains(needCompute, result); - + // std::cout << "does assignment need compute assign: " << needComputeAssign << std::endl; Expr rhs; if (needComputeAssign) { rhs = lower(assignment.getRhs()); @@ -429,20 +435,51 @@ Stmt LowererImplImperative::lowerAssignment(Assignment assignment) // Assignment to scalar variables. if (isScalar(result.getType())) { + // std::cout << "assignment to scalar variables\n"; if (needComputeAssign) { + // std::cout << "compute assign\n"; if (!assignment.getOperator().defined()) { + // std::cout << "assignment operator is not defined\n"; + // std::cout << "var: " << var << ", rhs, : " << rhs << std::endl; computeStmt = Assign::make(var, rhs); } else { taco_iassert(isa(assignment.getOperator())); - bool useAtomics = markAssignsAtomicDepth > 0 && - !util::contains(whereTemps, result); + + // std::cout << "assignment depth -- loopDepth: " << loopDepth << std::endl; + // std::cout << "is markAssignsAtomicDepth > 0: " << (markAssignsAtomicDepth > 0) << std::endl; + // for (auto &tensors_ : whereTemps) { + // // std::cout << tensors_ << ", "; + // } + // std::cout << std::endl; + // std::cout << result << std::endl; + int tempVarInitLoopDepth = whereTempsWithLoopDepth.find(result)->second; + // std::cout << "tempInitLoopDepth: " << tempVarInitLoopDepth << std::endl; + + bool reduction = false; + std::map::iterator itr; + for (itr = forUnits.begin(); itr!=forUnits.end(); ++itr) { + if (itr->first<=loopDepth && itr->first>tempVarInitLoopDepth && itr->second == ParallelUnit::CPUSimd) { + reduction = true; + } + // std::cout << itr->first << "\t" << ParallelUnit_NAMES[(int) itr->second] << std::endl; + } + + // less than or equal to loopDepth but greater than temp variable initialized loop depth + bool useAtomics = markAssignsAtomicDepth > 0 && (!util::contains(whereTemps, result) || reduction); + // std::cout << "whereTemps and result: " << !util::contains(whereTemps, result) << std::endl; 
+ // std::cout << "assignment to scalar variables useAtomics: " << useAtomics << std::endl; computeStmt = compoundAssign(var, rhs, useAtomics, atomicParallelUnit); + // std::cout << "computeStatment: " << computeStmt << std::endl; } } + else { + // std::cout << "not compute assign\n"; + } } // Assignments to tensor variables (non-scalar). else { + // std::cout << "assignment to tensor variables\n"; Expr values = getValuesArray(result); Expr loc = generateValueLocExpr(assignment.getLhs()); @@ -476,6 +513,7 @@ Stmt LowererImplImperative::lowerAssignment(Assignment assignment) } if (needComputeAssign && values.defined()) { + // std::cout << "assign compute statement\n"; if (!assignment.getOperator().defined()) { computeStmt = Store::make(values, loc, rhs); } @@ -586,19 +624,39 @@ LowererImplImperative::splitAppenderAndInserters(const vector& results } +// important function +/* +* This is the for loop lowering part +*/ + Stmt LowererImplImperative::lowerForall(Forall forall) { + loopDepth++; + forUnits.insert(std::pair(loopDepth,forall.getParallelUnit())); + // std::cout << "doing lowerForall: " << forall << std::endl; bool hasExactBound = provGraph.hasExactBound(forall.getIndexVar()); bool forallNeedsUnderivedGuards = !hasExactBound && emitUnderivedGuards; + + + // std::cout << "printing temporary variables with their atomic depths\n"; + map::iterator itr; + for (itr = whereTempsWithLoopDepth.begin(); itr != whereTempsWithLoopDepth.end(); ++itr) { + // std::cout << itr->first << "\t" << itr->second << "\n"; + } + + if (!ignoreVectorize && forallNeedsUnderivedGuards && (forall.getParallelUnit() == ParallelUnit::CPUVector || forall.getUnrollFactor() > 0)) { + // std::cout << "calling lowerForallCloned(forall)\n"; return lowerForallCloned(forall); } + // std::cout << "inParallelLoopDepth: " << inParallelLoopDepth << "========================\n"; if (forall.getParallelUnit() != ParallelUnit::NotParallel) { inParallelLoopDepth++; } + // std::cout << "inParallelLoopDepth: 
" << inParallelLoopDepth << "========================\n"; // Recover any available parents that were not recoverable previously vector recoverySteps; @@ -786,19 +844,23 @@ Stmt LowererImplImperative::lowerForall(Forall forall) } if (!isWhereProducer && hasPosDescendant && underivedAncestors.size() > 1 && provGraph.isPosVariable(iterator.getIndexVar()) && posDescendant == forall.getIndexVar()) { + // std::cout << "calling lowerForallFusedPosition(forall\n"; loops = lowerForallFusedPosition(forall, iterator, locators, inserters, appenders, reducedAccesses, recoveryStmt); } else if (canAccelWithSparseIteration) { + // std::cout << "calling lowerForallDenseAcceleration(forall\n"; loops = lowerForallDenseAcceleration(forall, locators, inserters, appenders, reducedAccesses, recoveryStmt); } // Emit dimension coordinate iteration loop else if (iterator.isDimensionIterator()) { + // std::cout << "calling lowerForallDimension(forall\n"; loops = lowerForallDimension(forall, point.locators(), inserters, appenders, reducedAccesses, recoveryStmt); } // Emit position iteration loop else if (iterator.hasPosIter()) { + // std::cout << "calling lowerForallPosition(forall\n"; loops = lowerForallPosition(forall, iterator, locators, inserters, appenders, reducedAccesses, recoveryStmt); } @@ -816,6 +878,10 @@ Stmt LowererImplImperative::lowerForall(Forall forall) loops = lowerMergeLattice(lattice, underivedAncestors[0], forall.getStmt(), reducedAccesses); } + + // std::cout << "printing loops ----------------------------------------------------------------------------------------------\n"; + // std::cout << loops << std::endl; + // std::cout << "loops printed -----------------------------------------------------------------------------------------------\n"; // taco_iassert(loops.defined()); if (!generateComputeCode() && !hasStores(loops)) { @@ -832,6 +898,9 @@ Stmt LowererImplImperative::lowerForall(Forall forall) parallelUnitIndexVars.erase(forall.getParallelUnit()); 
parallelUnitSizes.erase(forall.getParallelUnit()); } + + forUnits.erase(loopDepth); + loopDepth--; return Block::blanks(preInitValues, temporaryValuesInitFree[0], loops, @@ -1136,13 +1205,22 @@ Stmt LowererImplImperative::lowerForallDimension(Forall forall, set reducedAccesses, ir::Stmt recoveryStmt) { + // std::cout << "1 Stmt LowererImplImperative::lowerForallDimension\n"; + // std::cout << "1 Stmt LowererImplImperative::lowerForallDimension markAssignsAtomicDepth: " << markAssignsAtomicDepth << std::endl; Expr coordinate = getCoordinateVar(forall.getIndexVar()); if (forall.getParallelUnit() != ParallelUnit::NotParallel && forall.getOutputRaceStrategy() == OutputRaceStrategy::Atomics) { markAssignsAtomicDepth++; + // std::cout << "1 Stmt LowererImplImperative::lowerForallDimension getParallelUnit() is Not NotParallel and outputRaceStrategy is Atomics\n"; + // std::cout << "markAssignsAtomicDepth: " << markAssignsAtomicDepth << std::endl; atomicParallelUnit = forall.getParallelUnit(); } + else { + // std::cout << "1 Stmt LowererImplImperative::lowerForallDimension getParallelUnit() is NotParallel or outputRaceStrategy is not Atomics\n"; + } + // std::cout << "original forall : " << forall << std::endl; + // std::cout << "inside IndexStmt: " << forall.getStmt() << std::endl; Stmt body = lowerForallBody(coordinate, forall.getStmt(), locators, inserters, appenders, reducedAccesses); @@ -1158,7 +1236,18 @@ Stmt LowererImplImperative::lowerForallDimension(Forall forall, std::vector bounds = provGraph.deriveIterBounds(forall.getIndexVar(), definedIndexVarsOrdered, underivedBounds, indexVarToExprMap, iterators); LoopKind kind = LoopKind::Serial; - if (forall.getParallelUnit() == ParallelUnit::CPUVector && !ignoreVectorize) { + if (should_use_ISPC_codegen()) { + // std::cout << "Foreach compatible loop\n"; + if (forall.getParallelUnit() == ParallelUnit::CPUSimd) { + kind = LoopKind::Foreach; + } + else if (forall.getParallelUnit() == ParallelUnit::CPUSpmd + && 
forall.getOutputRaceStrategy() != OutputRaceStrategy::ParallelReduction + ) { + kind = LoopKind::Mul_Thread; + } + } + else if (forall.getParallelUnit() == ParallelUnit::CPUVector && !ignoreVectorize) { kind = LoopKind::Vectorized; } else if (forall.getParallelUnit() != ParallelUnit::NotParallel @@ -1166,6 +1255,7 @@ Stmt LowererImplImperative::lowerForallDimension(Forall forall, kind = LoopKind::Runtime; } + // std::cout << "2 Stmt LowererImplImperative::lowerForallDimension\n"; return Block::blanks(For::make(coordinate, bounds[0], bounds[1], 1, body, kind, ignoreVectorize ? ParallelUnit::NotParallel : forall.getParallelUnit(), ignoreVectorize ? 0 : forall.getUnrollFactor()), @@ -1179,6 +1269,7 @@ Stmt LowererImplImperative::lowerForallDimension(Forall forall, set reducedAccesses, ir::Stmt recoveryStmt) { + // std::cout << "1 Stmt LowererImplImperative::lowerForallDenseAcceleration\n"; taco_iassert(locators.size() == 1) << "Optimizing a dense workspace is only supported when the consumer is the only RHS tensor"; taco_iassert(provGraph.isFullyDerived(forall.getIndexVar())) << "Sparsely accelerating a dense workspace only works with fully derived index vars"; taco_iassert(forall.getParallelUnit() == ParallelUnit::NotParallel) << "Sparsely accelerating a dense workspace only works within serial loops"; @@ -1204,6 +1295,8 @@ Stmt LowererImplImperative::lowerForallDimension(Forall forall, } Stmt declareVar = VarDecl::make(coordinate, Load::make(indexList, loopVar)); + // std::cout << "original forall : " << forall << std::endl; + // std::cout << "inside IndexStmt: " << forall.getStmt() << std::endl; Stmt body = lowerForallBody(coordinate, forall.getStmt(), locators, inserters, appenders, reducedAccesses); Stmt resetGuard = ir::Store::make(bitGuard, coordinate, ir::Literal::make(false), markAssignsAtomicDepth > 0, atomicParallelUnit); @@ -1216,7 +1309,12 @@ Stmt LowererImplImperative::lowerForallDimension(Forall forall, Stmt posAppend = 
generateAppendPositions(appenders); LoopKind kind = LoopKind::Serial; - if (forall.getParallelUnit() == ParallelUnit::CPUVector && !ignoreVectorize) { + if (should_use_ISPC_codegen()) { + if (forall.getParallelUnit() == ParallelUnit::CPUSimd) { + kind = LoopKind::Foreach; + } + } + else if (forall.getParallelUnit() == ParallelUnit::CPUVector && !ignoreVectorize) { kind = LoopKind::Vectorized; } else if (forall.getParallelUnit() != ParallelUnit::NotParallel @@ -1224,6 +1322,7 @@ Stmt LowererImplImperative::lowerForallDimension(Forall forall, kind = LoopKind::Runtime; } + // std::cout << "2 Stmt LowererImplImperative::lowerForallDenseAcceleration\n"; return Block::blanks(For::make(loopVar, 0, indexListSize, 1, body, kind, ignoreVectorize ? ParallelUnit::NotParallel : forall.getParallelUnit(), ignoreVectorize ? 0 : forall.getUnrollFactor()), @@ -1247,6 +1346,8 @@ Stmt LowererImplImperative::lowerForallPosition(Forall forall, Iterator iterator set reducedAccesses, ir::Stmt recoveryStmt) { + // std::cout << "1 Stmt LowererImplImperative::lowerForallPosition\n" << std::endl; + Expr coordinate = getCoordinateVar(forall.getIndexVar()); Stmt declareCoordinate = Stmt(); Stmt strideGuard = Stmt(); @@ -1278,6 +1379,11 @@ Stmt LowererImplImperative::lowerForallPosition(Forall forall, Iterator iterator markAssignsAtomicDepth++; } + // see we are inside a forall. 
ex: forall(i, forall(j, y(i) += A(i,j) * x(j))) + // when you call forall.getStmt it returns forall(j, y(i) += A(i,j) * x(j)) which is the + // IndexStmt inside the forall IndexStmt + // std::cout << "original forall : " << forall << std::endl; + // std::cout << "inside IndexStmt: " << forall.getStmt() << std::endl; Stmt body = lowerForallBody(coordinate, forall.getStmt(), locators, inserters, appenders, reducedAccesses); @@ -1339,6 +1445,7 @@ Stmt LowererImplImperative::lowerForallPosition(Forall forall, Iterator iterator kind = LoopKind::Runtime; } + // std::cout << "2 Stmt LowererImplImperative::lowerForallPosition\n" << std::endl; // Loop with preamble and postamble return Block::blanks( boundsCompute, @@ -1357,6 +1464,7 @@ Stmt LowererImplImperative::lowerForallFusedPosition(Forall forall, Iterator ite set reducedAccesses, ir::Stmt recoveryStmt) { + // std::cout << "1 Stmt LowererImplImperative::lowerForallFusedPosition\n" << std::endl; Expr coordinate = getCoordinateVar(forall.getIndexVar()); Stmt declareCoordinate = Stmt(); if (provGraph.isCoordVariable(forall.getIndexVar())) { @@ -1447,6 +1555,8 @@ Stmt LowererImplImperative::lowerForallFusedPosition(Forall forall, Iterator ite markAssignsAtomicDepth++; } + // std::cout << "original forall : " << forall << std::endl; + // std::cout << "inside IndexStmt: " << forall.getStmt() << std::endl; Stmt body = lowerForallBody(coordinate, forall.getStmt(), locators, inserters, appenders, reducedAccesses); @@ -1503,6 +1613,8 @@ Stmt LowererImplImperative::lowerForallFusedPosition(Forall forall, Iterator ite && forall.getOutputRaceStrategy() != OutputRaceStrategy::ParallelReduction && !ignoreVectorize) { kind = LoopKind::Runtime; } + + // std::cout << "2 Stmt LowererImplImperative::lowerForallFusedPosition\n" << std::endl; // Loop with preamble and postamble return Block::blanks(boundsCompute, Block::make(Block::make(searchForUnderivedStart), @@ -1603,6 +1715,7 @@ Stmt LowererImplImperative::lowerMergePoint(MergeLattice 
pointLattice, ir::Assign::make(indexSetIter.getCoordVar(), indexSetIter.getPosVar()) ); // Code to increment both iterator variables. + std::cout << "some casting stuff happening\n"; auto incr = ir::Block::make( compoundAssign(iter.getIteratorVar(), ir::Cast::make(Eq::make(iter.getCoordVar(), setMatch), iter.getIteratorVar().type())), compoundAssign(indexSetIter.getIteratorVar(), ir::Cast::make(Eq::make(indexSetIter.getCoordVar(), setMatch), indexSetIter.getIteratorVar().type())), @@ -1765,6 +1878,9 @@ Stmt LowererImplImperative::lowerForallBody(Expr coordinate, IndexStmt stmt, vector inserters, vector appenders, const set& reducedAccesses) { + + // std::cout << "lowering a forall body----------------------------------------------------\n"; + Stmt initVals = resizeAndInitValues(appenders, reducedAccesses); // Inserter positions @@ -1780,6 +1896,7 @@ Stmt LowererImplImperative::lowerForallBody(Expr coordinate, IndexStmt stmt, // Code of loop body statement Stmt body = lower(stmt); + // std::cout << "\nBefore: [" << stmt << "]\nAfter : [" << body << "]\n"; // Code to append coordinates Stmt appendCoords = appendCoordinate(appenders, coordinate); @@ -1797,10 +1914,12 @@ Expr LowererImplImperative::getTemporarySize(Where where) { TensorVar temporary = where.getTemporary(); Dimension temporarySize = temporary.getType().getShape().getDimension(0); Access temporaryAccess = getResultAccesses(where.getProducer()).first[0]; + std::cout << "temporaryAccess: " << temporaryAccess; std::vector indexVars = temporaryAccess.getIndexVars(); if(util::all(indexVars, [&](const IndexVar& var) { return provGraph.isUnderived(var);})) { // All index vars underived then use tensor properties to get tensor size + std::cout << "All index vars underived then use tensor properties to get tensor size\n"; taco_iassert(util::contains(dimensions, indexVars[0])) << "Missing " << indexVars[0]; ir::Expr size = dimensions.at(indexVars[0]); for(size_t i = 1; i < indexVars.size(); ++i) { @@ -1811,16 
+1930,19 @@ Expr LowererImplImperative::getTemporarySize(Where where) { } if (temporarySize.isFixed()) { + std::cout << "temporary is fixed\n" ; return ir::Literal::make(temporarySize.getSize()); } if (temporarySize.isIndexVarSized()) { + std::cout << "temporary is index var sized\n"; IndexVar var = temporarySize.getIndexVarSize(); vector bounds = provGraph.deriveIterBounds(var, definedIndexVarsOrdered, underivedBounds, indexVarToExprMap, iterators); return ir::Sub::make(bounds[1], bounds[0]); } + std::cout << "should this be an error\n"; taco_ierror; // TODO return Expr(); } @@ -1889,6 +2011,7 @@ vector LowererImplImperative::codeToInitializeDenseAcceleratorArrays(Where Expr p = Var::make("p" + temporary.getName(), Int()); Stmt guardZeroInit = Store::make(alreadySetArr, p, ir::Literal::zero(bitGuardType)); + // std::cout << "vector LowererImplImperative::codeToInitializeDenseAcceleratorArrays\n" << std::endl; Stmt zeroInitLoop = For::make(p, 0, bitGuardSize, 1, guardZeroInit, LoopKind::Serial); Stmt inits = Block::make(alreadySetDecl, indexListDecl, allocateAlreadySet, allocateIndexList, zeroInitLoop); return {inits, freeTemps}; @@ -2090,8 +2213,10 @@ vector LowererImplImperative::codeToInitializeTemporaryParallel(Where wher vector LowererImplImperative::codeToInitializeTemporary(Where where) { TensorVar temporary = where.getTemporary(); + cout << "temporary found: " << temporary << std::endl; const bool accelerateDense = canAccelerateDenseTemp(where).first; + cout << "accelerateDense: " << accelerateDense << std::endl; Stmt freeTemporary = Stmt(); Stmt initializeTemporary = Stmt(); @@ -2102,6 +2227,7 @@ vector LowererImplImperative::codeToInitializeTemporary(Where where) { initializeTemporary = Block::make(initializeTemporary, initTempSet); tempToBitGuard[temporary] = tempSet; } else { + cout << "higher order temporary found: " << temporary << std::endl; // TODO: Need to support keeping track of initialized elements for // temporaries that don't have sparse 
accelerator taco_iassert(!util::contains(guardedTemps, temporary) || accelerateDense); @@ -2119,19 +2245,32 @@ vector LowererImplImperative::codeToInitializeTemporary(Where where) { needComputeValues(where, temporary)) { values = ir::Var::make(temporary.getName(), temporary.getType().getDataType(), true, false); - taco_iassert(temporary.getType().getOrder() == 1) - << " Temporary order was " << temporary.getType().getOrder(); // TODO + std::cout << "values: " << values << std::endl; + std::cout << "dataType: " << values.type() << std::endl; + + // taco_iassert(temporary.getType().getOrder() == 1) + // << " Temporary order was " << temporary.getType().getOrder(); // TODO + Expr size = getTemporarySize(where); + std::cout << "temporarySize: " << size << std::endl; + // no decl needed for shared memory Stmt decl = Stmt(); if ((isa(where.getProducer()) && inParallelLoopDepth == 0) || !should_use_CUDA_codegen()) { decl = VarDecl::make(values, ir::Literal::make(0)); + std::cout << "decl statement: " << decl << std::endl; } Stmt allocate = Allocate::make(values, size); + std::cout << "allocate stmt: " << allocate << std::endl; freeTemporary = Block::make(freeTemporary, Free::make(values)); + std::cout << "free temp: " << freeTemporary << std::endl; initializeTemporary = Block::make(decl, initializeTemporary, allocate); + std::cout << "initializeTemporary: " << initializeTemporary << std::endl; + + // taco_iassert(temporary.getType().getOrder() == 1) + // << " Temporary order was " << temporary.getType().getOrder(); // TODO } /// Make a struct object that lowerAssignment and lowerAccess can read @@ -2144,6 +2283,7 @@ vector LowererImplImperative::codeToInitializeTemporary(Where where) { } Stmt LowererImplImperative::lowerWhere(Where where) { + // std::cout << "\n--------------------------------------- lowering where statement: " << where << "\n\n\n"; TensorVar temporary = where.getTemporary(); bool accelerateDenseWorkSpace, sortAccelerator; 
std::tie(accelerateDenseWorkSpace, sortAccelerator) = @@ -2180,6 +2320,7 @@ Stmt LowererImplImperative::lowerWhere(Where where) { }) ); + // std::cout << "\ninitiating lowering of where consumer: " << where.getConsumer() << std::endl; Stmt consumer = lower(where.getConsumer()); if (accelerateDenseWorkSpace && sortAccelerator) { // We need to sort the indices array @@ -2203,11 +2344,13 @@ Stmt LowererImplImperative::lowerWhere(Where where) { true, false); Expr size = getTemporarySize(where); Stmt zeroInit = Store::make(values, p, ir::Literal::zero(temporary.getType().getDataType())); + // std::cout << "Stmt LowererImplImperative::lowerWhere\n"; Stmt loopInit = For::make(p, 0, size, 1, zeroInit, LoopKind::Serial); initializeTemporary = Block::make(initializeTemporary, loopInit); } whereConsumers.push_back(consumer); + // std::cout << "\nwhere temporaries: " << where.getTemporary() << std::endl; whereTemps.push_back(where.getTemporary()); captureNextLocatePos = true; @@ -2218,6 +2361,9 @@ Stmt LowererImplImperative::lowerWhere(Where where) { restoreAtomicDepth = true; } + whereTempsWithLoopDepth.insert(std::pair(where.getTemporary(), loopDepth)); + + // std::cout << "\ninitiating lowering of where producer: " << where.getConsumer() << std::endl; Stmt producer = lower(where.getProducer()); if (accelerateDenseWorkSpace) { const Expr indexListSizeExpr = tempToIndexListSize.at(temporary); @@ -2225,6 +2371,8 @@ Stmt LowererImplImperative::lowerWhere(Where where) { initializeTemporary = Block::make(indexListSizeDecl, initializeTemporary); } + whereTempsWithLoopDepth.erase(where.getTemporary()); + if (restoreAtomicDepth) { markAssignsAtomicDepth++; } @@ -2334,6 +2482,7 @@ Stmt LowererImplImperative::lowerAssemble(Assemble assemble) { resultModeOrdering[iter.getMode().getLevel() - 1]); Expr pos = iter.getPosVar(); Stmt initPos = VarDecl::make(pos, iter.locate(locateCoords)[0]); + // std::cout << "Stmt LowererImplImperative::lowerAssemble\n"; insertEdgeLoop = 
For::make(coords.back(), 0, dim, 1, Block::make(initPos, insertEdgeLoop)); } else { @@ -2371,7 +2520,7 @@ Stmt LowererImplImperative::lowerAssemble(Assemble assemble) { initAssembleStmts.push_back(initValues); } } else if (zeroInit) { - initAssembleStmts.push_back(zeroInitValues(resultTensorVar, 0, prevSize)); + initAssembleStmts.push_back(zeroInitValues(resultTensorVar, 0, prevSize)); // init values } } Stmt initAssemble = Block::make(initAssembleStmts); @@ -2415,6 +2564,7 @@ Stmt LowererImplImperative::lowerMulti(Multi multi) { } Stmt LowererImplImperative::lowerSuchThat(SuchThat suchThat) { + // std::cout << "lowering such that statement\n"; Stmt stmt = lower(suchThat.getStmt()); return Block::make(stmt); } @@ -2528,6 +2678,7 @@ Expr LowererImplImperative::lowerSqrt(Sqrt sqrt) { Expr LowererImplImperative::lowerCast(Cast cast) { + std::cout << "casting: " << cast.getA() << ", dataType: " << cast.getDataType() << std::endl; return ir::Cast::make(lower(cast.getA()), cast.getDataType()); } @@ -2744,7 +2895,7 @@ Stmt LowererImplImperative::initResultArrays(vector writes, // iteration of all the iterators is not full. We can check this by seeing if we can recover a // full iterator from our set of iterators. Expr size = generateAssembleCode() ? getCapacityVar(tensor) : parentSize; - result.push_back(zeroInitValues(tensor, 0, size)); + result.push_back(zeroInitValues(tensor, 0, size)); // init values } } return result.empty() ? 
Stmt() : Block::blanks(result); @@ -2895,7 +3046,7 @@ Stmt LowererImplImperative::initResultArrays(IndexVar var, vector writes util::contains(reducedAccesses, write)) { // Zero-initialize values array if might not assign to every element // in values array during compute - result.push_back(zeroInitValues(tensor, resultParentPos, stride)); + result.push_back(zeroInitValues(tensor, resultParentPos, stride)); // init values } } } @@ -2942,6 +3093,7 @@ Stmt LowererImplImperative::resizeAndInitValues(const std::vector& app Stmt LowererImplImperative::zeroInitValues(Expr tensor, Expr begin, Expr size) { + // std::cout << "1 Stmt LowererImplImperative::zeroInitValues\n"; Expr lower = simplify(ir::Mul::make(begin, size)); Expr upper = simplify(ir::Mul::make(ir::Add::make(begin, 1), size)); Expr p = Var::make("p" + util::toString(tensor), Int()); @@ -2954,6 +3106,11 @@ Stmt LowererImplImperative::zeroInitValues(Expr tensor, Expr begin, Expr size) { return ir::VarDecl::make(ir::Var::make("status", Int()), ir::Call::make("cudaMemset", {values, ir::Literal::make(0, Int()), ir::Mul::make(ir::Sub::make(upper, lower), ir::Literal::make(values.type().getNumBytes()))}, Int())); } + // std::cout << "2 Stmt LowererImplImperative::zeroInitValues\n"; + // if generating ispc code, we will keep the LoopKind as Init so that we can initializa it if tasks are used + if (should_use_ISPC_codegen()) { + return For::make(p, lower, upper, 1, zeroInit, LoopKind::Init); + } return For::make(p, lower, upper, 1, zeroInit, parallel); } @@ -3235,6 +3392,7 @@ Stmt LowererImplImperative::codeToIncIteratorVars(Expr coordinate, IndexVar coor for (auto& iterator : levelIterators) { Expr ivar = iterator.getIteratorVar(); if (iterator.isUnique()) { + std::cout << "casting \n"; Expr increment = iterator.isFull() ? 
1 : ir::Cast::make(Eq::make(iterator.getCoordVar(), @@ -3505,6 +3663,7 @@ Expr LowererImplImperative::generateAssembleGuard(IndexExpr expr) { } void visit(const CastNode* node) { + std::cout << "lowering to cast node\n"; expr = lower(node->a); } diff --git a/src/lower/tensor_path.h b/src/lower/tensor_path.h index 4f5dc49af..da52fb782 100644 --- a/src/lower/tensor_path.h +++ b/src/lower/tensor_path.h @@ -2,6 +2,7 @@ #define TACO_TENSOR_PATH_H #include +#include #include #include "taco/util/comparable.h" @@ -47,14 +48,13 @@ class TensorPath : public util::Comparable { friend bool operator==(const TensorPath&, const TensorPath&); friend bool operator<(const TensorPath&, const TensorPath&); + friend std::ostream& operator<<(std::ostream&, const TensorPath&); private: struct Content; std::shared_ptr content; }; -std::ostream& operator<<(std::ostream&, const TensorPath&); - /// A step along a tensor path. class TensorPathStep : public util::Comparable { diff --git a/src/tensor.cpp b/src/tensor.cpp index fab437ff1..176856196 100644 --- a/src/tensor.cpp +++ b/src/tensor.cpp @@ -10,6 +10,7 @@ #include #include +#include "../test/util.h" #include "taco/cuda.h" #include "taco/format.h" #include "taco/taco_tensor_t.h" @@ -278,6 +279,7 @@ static size_t unpackTensorData(const taco_tensor_t& tensorData, /// Pack coordinates into a data structure given by the tensor format. 
void TensorBase::pack() { + std::cout << "TensorBase::Pack() method\n"; if (!needsPack()) { return; } @@ -346,6 +348,7 @@ void TensorBase::pack() { taco_iassert((content->coordinateBufferUsed % content->coordinateSize) == 0); const size_t numCoordinates = content->coordinateBufferUsed / content->coordinateSize; + std::cout << "call helperFuncs\n"; const auto helperFuncs = getHelperFunctions(getFormat(), getComponentType(), dimensions); @@ -619,10 +622,12 @@ void TensorBase::compile() { IndexStmt stmt = makeConcreteNotation(makeReductionNotation(assignment)); stmt = reorderLoopsTopologically(stmt); stmt = insertTemporaries(stmt); + std::cout << "calling parallelizeOuterLoop(stmt)\n"; stmt = parallelizeOuterLoop(stmt); compile(stmt, content->assembleWhileCompute); } void TensorBase::compile(taco::IndexStmt stmt, bool assembleWhileCompute) { + std::cout << "TensorBase::compile\n"; if (!needsCompile()) { return; } @@ -802,6 +807,63 @@ void TensorBase::assemble() { } } +void TensorBase::compute(std::ofstream& statfile, std::string& sofile) { + taco_uassert(!needsCompile()) << error::compute_without_compile; + // if (!needsCompute()) { + // return; + // } + setNeedsCompute(false); + // Sync operand tensors if needed. 
+ auto operands = getTensors(getAssignment().getRhs()); + for (auto& operand : operands) { + // std::cout << "operand: " << operand.second << std::endl; + operand.second.syncValues(); + operand.second.removeDependentTensor(*this); + } + + auto arguments = packArguments(*this); + + taco::util::TimeResults timevalue; + bool time = true; + TOOL_BENCHMARK_TIMER2(this->content->module->callFuncPacked("compute", sofile, arguments.data()), + "\nkernel execution time: ", timevalue); + // this->content->module->callFuncPacked("compute", arguments.data()); + + if (content->assembleWhileCompute) { + setNeedsAssemble(false); + taco_tensor_t* tensorData = ((taco_tensor_t*)arguments[0]); + content->valuesSize = unpackTensorData(*tensorData, *this); + } +} + +void TensorBase::compute(std::ofstream& statfile) { + taco_uassert(!needsCompile()) << error::compute_without_compile; + // if (!needsCompute()) { + // return; + // } + setNeedsCompute(false); + // Sync operand tensors if needed. + auto operands = getTensors(getAssignment().getRhs()); + for (auto& operand : operands) { + operand.second.syncValues(); + operand.second.removeDependentTensor(*this); + } + + auto arguments = packArguments(*this); + + taco::util::TimeResults timevalue; + bool time = true; + TOOL_BENCHMARK_TIMER2(this->content->module->callFuncPacked("compute", arguments.data()), + "\nkernel execution time: ", timevalue); + // this->content->module->callFuncPacked("compute", arguments.data()); + + if (content->assembleWhileCompute) { + setNeedsAssemble(false); + taco_tensor_t* tensorData = ((taco_tensor_t*)arguments[0]); + content->valuesSize = unpackTensorData(*tensorData, *this); + } +} + void TensorBase::compute() { taco_uassert(!needsCompile()) << error::compute_without_compile; if (!needsCompute()) { @@ -816,7 +878,9 @@ void TensorBase::compute() { } auto arguments = packArguments(*this); + std::cout << "running the compute function from the shared library\n"; this->content->module->callFuncPacked("compute", 
arguments.data()); + std::cout << "compute function executed\n"; if (content->assembleWhileCompute) { setNeedsAssemble(false); @@ -934,6 +998,7 @@ TensorBase::getHelperFunctions(const Format& format, Datatype ctype, }; const auto dims = util::map(dimensions, getDim); + set_ISPC_code_stream_enabled(false); if (format.getOrder() > 0) { const Format bufferFormat = COO(format.getOrder(), false, true, false, format.getModeOrdering()); @@ -951,6 +1016,7 @@ TensorBase::getHelperFunctions(const Format& format, Datatype ctype, } // Lower packing and iterator code. + std::cout << "1 Lower packing and iterator code\n"; helperModule->addFunction(lower(packStmt, "pack", true, true)); helperModule->addFunction(lower(iterateStmt, "iterate", false, true)); } else { @@ -964,12 +1030,14 @@ TensorBase::getHelperFunctions(const Format& format, Datatype ctype, IndexVar indexVar; IndexStmt assignment = (packedScalar() = bufferVector(indexVar)); IndexStmt packStmt= makeConcreteNotation(makeReductionNotation(assignment)); + std::cout << "2 Lower packing and iterator code\n"; helperModule->addFunction(lower(packStmt, "pack", true, true)); // Define and lower iterator code. 
IndexStmt iterateStmt = Yield({}, packedScalar()); helperModule->addFunction(lower(iterateStmt, "iterate", false, true)); } + std::cout << "Compiling the helperModule\n"; helperModule->compile(); helperFunctionsMutex.lock(); diff --git a/taco-uml.wsd b/taco-uml.wsd new file mode 100644 index 000000000..4b8e39802 --- /dev/null +++ b/taco-uml.wsd @@ -0,0 +1,411 @@ +@startuml taco +scale 1 + + +class IntrusivePtr { + +T *ptr +} +class Uncopyable {} + +class IRNode { + +virtual void accept(IRVisitorStrict *v) const = 0 + +virtual IRNodeType type_info() const = 0; +} + +class BaseStmtNode {} +class BaseExprNode { + +Datatype type +} + +class StmtNode { + +void accept(IRVisitorStrict *v) const +} +class ExprNode { + +void accept(IRVisitorStrict *v) const +} + +Uncopyable <|-- IRNode +IRNode <|-- BaseStmtNode +IRNode <|-- BaseExprNode +BaseStmtNode <|-- StmtNode +BaseExprNode <|-- ExprNode + +class IRHandle { + +void accept(IRVisitorStrict *v) const +} +class Expr {} +class Stmt {} + +IntrusivePtr <|-- IRHandle +IRHandle <|-- Expr +IRHandle <|-- Stmt + +IRHandle "1" *-- "1" IRNode : contains + + + +' this class is abstract but plantuml version does not support interface keyword +interface IRVisitorStrict { + +virtual void visit(const IRNode*) const = 0 +} + +/' +IRVisitor is not an interface or abstract because it +has not pure virtual methods +'/ +class IRVisitor { + +virtual void visit(const IRNode*) +} + +class IRRewriter { + ' protected fields and methods + #Expr expr + #Stmt stmt + + #virtual void visit(const ExprNode* op) + #virtual void visit(const StmtNode* op) + + ' public fields and methods + +Expr rewrite(Expr) + +Stmt rewrite(Stmt) +} +class IRPrinter { + #std::ostream &stream + #std::ostream &stream2 + #int indent + #bool color + #bool simplify + #enum Precedence + #Precedence parentPrecedence = BOTTOM + #NameGenerator varNameGenerator + #scopedMap varNames + + #void doIndent() + #void printBinOp(Expr a, Expr b, std::string op, Precedence precedence) + #void 
fewMoreMethods() + + #virtual void visit(const ExprNode*) + #virtual void visit(const StmtNode*) + + +setColor(bool color) + +print(Stmt) +} +class IRVerifier {} + +IRVisitorStrict <|-- IRVisitor +IRVisitorStrict <|-- IRPrinter +IRVisitorStrict <|-- IRRewriter +IRVisitor <|-- IRVerifier + +' Inheritance from IRRewriter +' simplifier for ir::Expr +class ExpressionSimplifier {} +IRRewriter <|-- ExpressionSimplifier + +' simplifiers for ir::Stmt +class RemoveRedundantStatements {} +class RemoveRedundantLoops {} +class RemoveDuplicateBody {} + +IRRewriter <|-- RemoveRedundantStatements +IRRewriter <|-- RemoveRedundantLoops +IRRewriter <|-- RemoveDuplicateBody + + +' Inheritance from IRPrinter +class CodeGen {} +class CodeGen_C {} +class CodeGen_CUDA {} +class CodeGen_ISPC { + -class FindVars +} + +class FindVars {} + +IRPrinter <|-- CodeGen +CodeGen <|-- CodeGen_C +CodeGen <|-- CodeGen_ISPC +CodeGen <|-- CodeGen_CUDA + +IRVisitor <|-- FindVars +CodeGen_ISPC +-- FindVars + +class Manageable {} +class IndexStmtNode { + -virtual void accept(IndexStmtVisitorStrict*) const = 0 +} +class IndexExprNode { + -virtual void accept(IndexStmtVisitorStrict*) const = 0 +} + + +Manageable <|-- IndexStmtNode +Uncopyable <|-- IndexStmtNode +Manageable <|-- IndexExprNode +Uncopyable <|-- IndexExprNode + +class IndexStmt {} +class IndexExpr {} + +IntrusivePtr <|-- IndexStmt +IndexStmt "1" *-- "1" IndexStmtNode +IntrusivePtr <|-- IndexExpr +IndexExpr "1" *-- "1" IndexExprNode + + +abstract class IndexExprVisitorStrict { + +void visit(const IndexStmt&) + +virtual void visit(const AccessNode*) = 0 + +virtual void visit(const LiteralNode*) = 0 + +virtual void visit(const NegNode*) = 0 + +virtual void visit(const AddNode*) = 0 + +virtual void visit(const SubNode*) = 0 + +virtual void visit(const MulNode*) = 0 + +virtual void visit(const DivNode*) = 0 + +virtual void visit(const SqrtNode*) = 0 + +virtual void visit(const CastNode*) = 0 + +virtual void visit(const CallIntrinsicNode*) = 0 + 
+virtual void visit(const ReductionNode*) = 0 +} +abstract class IndexStmtVisitorStrict { + +void visit(const IndexStmt&) + +virtual void visit(const AssignmentNode*) = 0 + +virtual void visit(const YieldNode*) = 0 + +virtual void visit(const ForallNode*) = 0 + +virtual void visit(const WhereNode*) = 0 + +virtual void visit(const SequenceNode*) = 0 + +virtual void visit(const AssembleNode*) = 0 + +virtual void visit(const MultiNode*) = 0 + +virtual void visit(const SuchThatNode*) = 0 +} + +abstract class IndexNotationVisitorStrict {} +class IndexNotationPrinter { + +void print(const IndexExpr& expr) + +void print(const IndexStmt& expr) + + ' Index Expressions visit() + +void visit(const AccessNode* node) + +void visit(const LiteralNode* node) + + void visit(const NegNode* node) + + void visit(const AddNode* node) + + void visit(const SubNode* node) + + void visit(const MulNode* node) + + void visit(const DivNode* node) + + void visit(const SqrtNode* node) + + void visit(const CastNode* node) + + void visit(const CallIntrinsicNode* node) + + void visit(const UnaryExprNode* node) + + void visit(const BinaryExprNode* node) + + void visit(const ReductionNode* node) + + ' Index Statement visit() + + void visit(const AssignmentNode* node) + + void visit(const YieldNode* node) + + void visit(const ForallNode* node) + + void visit(const WhereNode* node) + + void visit(const SequenceNode* node) + + void visit(const AssembleNode* node) + + void visit(const MultiNode* node) + + void visit(const SuchThatNode* node) +} +class IndexNotationVisitor { + ' Index Expressions visit() + +virtual void visit(const AccessNode* node) + +virtual void visit(const LiteralNode* node) + +virtual void visit(const NegNode* node) + +virtual void visit(const AddNode* node) + +virtual void visit(const SubNode* node) + +virtual void visit(const MulNode* node) + +virtual void visit(const DivNode* node) + +virtual void visit(const SqrtNode* node) + +virtual void visit(const CastNode* node) + +virtual 
void visit(const CallIntrinsicNode* node) + +virtual void visit(const UnaryExprNode* node) + +virtual void visit(const BinaryExprNode* node) + +virtual void visit(const ReductionNode* node) + + ' Index Statement visit() + +virtual void visit(const AssignmentNode* node) + +virtual void visit(const YieldNode* node) + +virtual void visit(const ForallNode* node) + +virtual void visit(const WhereNode* node) + +virtual void visit(const SequenceNode* node) + +virtual void visit(const AssembleNode* node) + +virtual void visit(const MultiNode* node) + +virtual void visit(const SuchThatNode* node) +} +class Matcher { + +} + +abstract class IndexExprRewriterStrict { + +IndexExpr rewrite(IndexExpr) + + #IndexExpr expr + + #virtual void visit(const AccessNode* op) = 0 + #virtual void visit(const LiteralNode* op) = 0 + #virtual void visit(const NegNode* op) = 0 + #virtual void visit(const SqrtNode* op) = 0 + #virtual void visit(const AddNode* op) = 0 + #virtual void visit(const SubNode* op) = 0 + #virtual void visit(const MulNode* op) = 0 + #virtual void visit(const DivNode* op) = 0 + #virtual void visit(const CastNode* op) = 0 + #virtual void visit(const CallIntrinsicNode* op) = 0 + #virtual void visit(const ReductionNode* op) = 0 +} +abstract class IndexStmtRewriterStrict { + +IndexStmt rewrite(IndexStmt) + + #IndexStmt stmt + + #virtual void visit(const AssignmentNode* op) = 0 + #virtual void visit(const YieldNode* op) = 0 + #virtual void visit(const ForallNode* op) = 0 + #virtual void visit(const WhereNode* op) = 0 + #virtual void visit(const SequenceNode* op) = 0 + #virtual void visit(const AssembleNode* op) = 0 + #virtual void visit(const MultiNode* op) = 0 + #virtual void visit(const SuchThatNode* op) = 0 +} +abstract class IndexNotationRewriterStrict {} +class IndexNotationRewriter { + ' Index Expressions visit() + +virtual void visit(const AccessNode* node) + +virtual void visit(const LiteralNode* node) + +virtual void visit(const NegNode* node) + +virtual void 
visit(const AddNode* node) + +virtual void visit(const SubNode* node) + +virtual void visit(const MulNode* node) + +virtual void visit(const DivNode* node) + +virtual void visit(const SqrtNode* node) + +virtual void visit(const CastNode* node) + +virtual void visit(const CallIntrinsicNode* node) + +virtual void visit(const UnaryExprNode* node) + +virtual void visit(const BinaryExprNode* node) + +virtual void visit(const ReductionNode* node) + + ' Index Statement visit() + +virtual void visit(const AssignmentNode* node) + +virtual void visit(const YieldNode* node) + +virtual void visit(const ForallNode* node) + +virtual void visit(const WhereNode* node) + +virtual void visit(const SequenceNode* node) + +virtual void visit(const AssembleNode* node) + +virtual void visit(const MultiNode* node) + +virtual void visit(const SuchThatNode* node) +} + + +IndexExprVisitorStrict <|-- IndexNotationVisitorStrict +IndexStmtVisitorStrict <|-- IndexNotationVisitorStrict +IndexNotationVisitorStrict <|-- IndexNotationVisitor +IndexNotationVisitorStrict <|-- IndexNotationPrinter +IndexNotationVisitor <|-- Matcher + +IndexExprVisitorStrict <|-- IndexExprRewriterStrict +IndexStmtVisitorStrict <|-- IndexStmtRewriterStrict +IndexExprRewriterStrict <|-- IndexNotationRewriterStrict +IndexStmtRewriterStrict <|-- IndexNotationRewriterStrict + +IndexNotationRewriterStrict <|-- IndexNotationRewriter + +' - private +' # protected +' ~ package private +' + public + +' {static} +' {abstract} virtual methods + +' lowering part -- conversion from IndexExpr and IndexStmt to ir::Expr and ir::Stmt +class Lowerer { + +std::shared_ptr impl; +} +abstract class LowererImpl { + ' protected fields and methods + #class Visitor; + #friend class Visitor; + #std::shared_ptr visitor; + + #virtual ir::Stmt lower(IndexStmt stmt); + #virtual ir::Expr lower(IndexExpr expr); + + #virtual ir::Expr lowerExpr(IndexExpr expr) = 0; + #virtual ir::Stmt lowerStmt(IndexStmt stmt) = 0; + + ' public fields and methods +
+virtual ir::Stmt lower(IndexStmt stmt, std::string name, + bool assemble, bool compute, bool pack, bool unpack) = 0; +} + +class LowererImplImperative { + ' private fields and methods + -class Visitor + -friend class Visitor + -std::shared_ptr visitor + -bool assemble + -bool compute + -vars a_bunch_of_other_fields + + ' protected fields and methods + #virtual ir::Expr lowerExpr(IndexExpr expr); + #virtual ir::Stmt lowerStmt(IndexStmt stmt); + + ' public fields and methods + +ir::Stmt lower(IndexStmt stmt, std::string name, + bool assemble, bool compute, bool pack, bool unpack) + +} +note bottom of LowererImplImperative : Stmt LowererImplImperative::lower(IndexStmt stmt) {\n return visitor->lower(stmt);\n} + +Uncopyable <|-- LowererImpl +Lowerer "1" *-- "1" LowererImpl : contains + + +' visitor that does the lowering +class Visitor { + ' private fields and methods + -LowererImpl* impl + -Expr expr + -Stmt stmt + + -void visit(const AssignmentNode* node) + -void visit(const YieldNode* node) + -void visit(const ForallNode* node) + -void visit(const WhereNode* node) + -void visit(const MultiNode* node) + -void visit(const SuchThatNode* node) + -void visit(const SequenceNode* node) + -void visit(const AssembleNode* node) + -void visit(const AccessNode* node) + -void visit(const LiteralNode* node) + -void visit(const NegNode* node) + -void visit(const AddNode* node) + -void visit(const SubNode* node) + -void visit(const MulNode* node) + -void visit(const DivNode* node) + -void visit(const SqrtNode* node) + -void visit(const CastNode* node) + -void visit(const CallIntrinsicNode* node) + -void visit(const ReductionNode* node) + + ' public fields and methods + +Visitor(LowererImplImperative* impl) + +Stmt lower(IndexStmt stmt) + +Expr lower(IndexExpr expr) +} + +note bottom of Visitor: Stmt lower(IndexStmt stmt) {\n this->stmt = Stmt();\n impl->accessibleIterators.scope();\n IndexStmtVisitorStrict::visit(stmt);\n impl->accessibleIterators.unscope();\n return this->stmt;\n}
+ +IndexNotationVisitorStrict <|-- Visitor +LowererImpl "1" +-- "1" Visitor : contains +Visitor "1" *-- "1" LowererImpl : contains + +LowererImpl <|-- LowererImplImperative +LowererImplImperative "1" +-- "1" Visitor : contains +Visitor "1" *-- "1" LowererImplImperative : contains + +@enduml \ No newline at end of file diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 02464ce26..f4d848de0 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -11,6 +11,7 @@ add_executable(taco-test ${TEST_SOURCES} ${TEST_HEADERS}) target_link_libraries(taco-test taco-gtest) target_link_libraries(taco-test pthread) target_link_libraries(taco-test taco) +target_link_libraries(taco-test papi) if(${CMAKE_VERSION} VERSION_LESS "3.9.0") add_test(NAME taco-test COMMAND taco-test) diff --git a/test/kernels/mttkrp_gemm/mttkrp_ryan.c b/test/kernels/mttkrp_gemm/mttkrp_ryan.c new file mode 100644 index 000000000..9d0536b8c --- /dev/null +++ b/test/kernels/mttkrp_gemm/mttkrp_ryan.c @@ -0,0 +1,177 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? 
(_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = 
(taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +int assemble(taco_tensor_t *A1845, taco_tensor_t *matmul_5_5_5, taco_tensor_t *A1475, taco_tensor_t *A1416) { + int A18451_dimension = (int)(A1845->dimensions[0]); + int A18452_dimension = (int)(A1845->dimensions[1]); + double* restrict A1845_vals = (double*)(A1845->vals); + + A1845_vals = (double*)malloc(sizeof(double) * (A18451_dimension * A18452_dimension)); + + A1845->vals = (uint8_t*)A1845_vals; + return 0; +} + +int compute(taco_tensor_t *A1845, taco_tensor_t *matmul_5_5_5, taco_tensor_t *A1475, taco_tensor_t *A1416) { + int A18451_dimension = (int)(A1845->dimensions[0]); + int A18452_dimension = (int)(A1845->dimensions[1]); + double* restrict A1845_vals = (double*)(A1845->vals); + int* restrict matmul_5_5_51_pos = (int*)(matmul_5_5_5->indices[0][0]); + int* restrict matmul_5_5_51_crd = (int*)(matmul_5_5_5->indices[0][1]); + int* restrict matmul_5_5_52_pos = (int*)(matmul_5_5_5->indices[1][0]); + int* restrict matmul_5_5_52_crd = (int*)(matmul_5_5_5->indices[1][1]); + int* restrict 
matmul_5_5_53_pos = (int*)(matmul_5_5_5->indices[2][0]); + int* restrict matmul_5_5_53_crd = (int*)(matmul_5_5_5->indices[2][1]); + double* restrict matmul_5_5_5_vals = (double*)(matmul_5_5_5->vals); + int A14751_dimension = (int)(A1475->dimensions[0]); + int A14752_dimension = (int)(A1475->dimensions[1]); + double* restrict A1475_vals = (double*)(A1475->vals); + int A14161_dimension = (int)(A1416->dimensions[0]); + int A14162_dimension = (int)(A1416->dimensions[1]); + double* restrict A1416_vals = (double*)(A1416->vals); + + #pragma omp parallel for schedule(static) + for (int32_t pA1845 = 0; pA1845 < (A18451_dimension * A18452_dimension); pA1845++) { + A1845_vals[pA1845] = 0.0; + } + + #pragma omp parallel for schedule(runtime) + for (int32_t i1542matmul_5_5_5 = matmul_5_5_51_pos[0]; i1542matmul_5_5_5 < matmul_5_5_51_pos[1]; i1542matmul_5_5_5++) { + int32_t i1542 = matmul_5_5_51_crd[i1542matmul_5_5_5]; + for (int32_t i1545 = 0; i1545 < A14162_dimension; i1545++) { + int32_t i1545A1845 = i1542 * A18452_dimension + i1545; + double ti1543A1845_val = 0.0; + for (int32_t i1543matmul_5_5_5 = matmul_5_5_52_pos[i1542matmul_5_5_5]; i1543matmul_5_5_5 < matmul_5_5_52_pos[(i1542matmul_5_5_5 + 1)]; i1543matmul_5_5_5++) { + int32_t i1543 = matmul_5_5_52_crd[i1543matmul_5_5_5]; + int32_t i1545A1416 = i1543 * A14162_dimension + i1545; + for (int32_t i1544matmul_5_5_5 = matmul_5_5_53_pos[i1543matmul_5_5_5]; i1544matmul_5_5_5 < matmul_5_5_53_pos[(i1543matmul_5_5_5 + 1)]; i1544matmul_5_5_5++) { + int32_t i1544 = matmul_5_5_53_crd[i1544matmul_5_5_5]; + int32_t i1545A1475 = i1544 * A14752_dimension + i1545; + ti1543A1845_val += (matmul_5_5_5_vals[i1544matmul_5_5_5] * A1475_vals[i1545A1475]) * A1416_vals[i1545A1416]; + } + } + A1845_vals[i1545A1845] = ti1543A1845_val; + } + } + return 0; +} +#include "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/mttkrp_gemm/taco_default.h" +int _shim_assemble(void** parameterPack) { + return assemble((taco_tensor_t*)(parameterPack[0]), 
(taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3])); +} +int _shim_compute(void** parameterPack) { + return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3])); +} diff --git a/test/kernels/mttkrp_gemm/mttkrp_ryan.h b/test/kernels/mttkrp_gemm/mttkrp_ryan.h new file mode 100644 index 000000000..3d0c06f50 --- /dev/null +++ b/test/kernels/mttkrp_gemm/mttkrp_ryan.h @@ -0,0 +1,125 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + 
else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +#ifndef TACO_GENERATED_assemble +#define TACO_GENERATED_assemble +int assemble(taco_tensor_t *A1845, taco_tensor_t *matmul_5_5_5, taco_tensor_t *A1475, taco_tensor_t *A1416); +#endif + +#ifndef TACO_GENERATED_compute +#define 
TACO_GENERATED_compute +int compute(taco_tensor_t *A1845, taco_tensor_t *matmul_5_5_5, taco_tensor_t *A1475, taco_tensor_t *A1416); +#endif diff --git a/test/kernels/mttkrp_gemm/taco_default.c b/test/kernels/mttkrp_gemm/taco_default.c new file mode 100644 index 000000000..edf8cdb16 --- /dev/null +++ b/test/kernels/mttkrp_gemm/taco_default.c @@ -0,0 +1,183 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, 
int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +int assemble(taco_tensor_t *A1538, taco_tensor_t *matmul_5_5_5, taco_tensor_t *A1475, taco_tensor_t *A1416, taco_tensor_t *A1479) { + int A15381_dimension = (int)(A1538->dimensions[0]); + int A15382_dimension = (int)(A1538->dimensions[1]); + double* restrict A1538_vals = (double*)(A1538->vals); + + A1538_vals = (double*)malloc(sizeof(double) * (A15381_dimension * A15382_dimension)); + + A1538->vals = 
(uint8_t*)A1538_vals; + return 0; +} + +int compute(taco_tensor_t *A1538, taco_tensor_t *matmul_5_5_5, taco_tensor_t *A1475, taco_tensor_t *A1416, taco_tensor_t *A1479) { + int A15381_dimension = (int)(A1538->dimensions[0]); + int A15382_dimension = (int)(A1538->dimensions[1]); + double* restrict A1538_vals = (double*)(A1538->vals); + int* restrict matmul_5_5_51_pos = (int*)(matmul_5_5_5->indices[0][0]); + int* restrict matmul_5_5_51_crd = (int*)(matmul_5_5_5->indices[0][1]); + int* restrict matmul_5_5_52_pos = (int*)(matmul_5_5_5->indices[1][0]); + int* restrict matmul_5_5_52_crd = (int*)(matmul_5_5_5->indices[1][1]); + int* restrict matmul_5_5_53_pos = (int*)(matmul_5_5_5->indices[2][0]); + int* restrict matmul_5_5_53_crd = (int*)(matmul_5_5_5->indices[2][1]); + double* restrict matmul_5_5_5_vals = (double*)(matmul_5_5_5->vals); + int A14751_dimension = (int)(A1475->dimensions[0]); + int A14752_dimension = (int)(A1475->dimensions[1]); + double* restrict A1475_vals = (double*)(A1475->vals); + int A14161_dimension = (int)(A1416->dimensions[0]); + int A14162_dimension = (int)(A1416->dimensions[1]); + double* restrict A1416_vals = (double*)(A1416->vals); + int A14791_dimension = (int)(A1479->dimensions[0]); + int A14792_dimension = (int)(A1479->dimensions[1]); + double* restrict A1479_vals = (double*)(A1479->vals); + + #pragma omp parallel for schedule(static) + for (int32_t pA1538 = 0; pA1538 < (A15381_dimension * A15382_dimension); pA1538++) { + A1538_vals[pA1538] = 0.0; + } + + #pragma omp parallel for schedule(runtime) + for (int32_t i1542matmul_5_5_5 = matmul_5_5_51_pos[0]; i1542matmul_5_5_5 < matmul_5_5_51_pos[1]; i1542matmul_5_5_5++) { + int32_t i1542 = matmul_5_5_51_crd[i1542matmul_5_5_5]; + for (int32_t i1546 = 0; i1546 < A14792_dimension; i1546++) { + int32_t i1546A1538 = i1542 * A15382_dimension + i1546; + double ti1543A1538_val = 0.0; + for (int32_t i1543matmul_5_5_5 = matmul_5_5_52_pos[i1542matmul_5_5_5]; i1543matmul_5_5_5 < 
matmul_5_5_52_pos[(i1542matmul_5_5_5 + 1)]; i1543matmul_5_5_5++) { + int32_t i1543 = matmul_5_5_52_crd[i1543matmul_5_5_5]; + for (int32_t i1544matmul_5_5_5 = matmul_5_5_53_pos[i1543matmul_5_5_5]; i1544matmul_5_5_5 < matmul_5_5_53_pos[(i1543matmul_5_5_5 + 1)]; i1544matmul_5_5_5++) { + int32_t i1544 = matmul_5_5_53_crd[i1544matmul_5_5_5]; + for (int32_t i1545 = 0; i1545 < A14791_dimension; i1545++) { + int32_t i1545A1475 = i1544 * A14752_dimension + i1545; + int32_t i1545A1416 = i1543 * A14162_dimension + i1545; + int32_t i1546A1479 = i1545 * A14792_dimension + i1546; + ti1543A1538_val += ((matmul_5_5_5_vals[i1544matmul_5_5_5] * A1475_vals[i1545A1475]) * A1416_vals[i1545A1416]) * A1479_vals[i1546A1479]; + } + } + } + A1538_vals[i1546A1538] = ti1543A1538_val; + } + } + return 0; +} +#include "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/mttkrp_gemm/taco_default.h" +int _shim_assemble(void** parameterPack) { + return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3]), (taco_tensor_t*)(parameterPack[4])); +} +int _shim_compute(void** parameterPack) { + return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3]), (taco_tensor_t*)(parameterPack[4])); +} diff --git a/test/kernels/mttkrp_gemm/taco_default.h b/test/kernels/mttkrp_gemm/taco_default.h new file mode 100644 index 000000000..54274569e --- /dev/null +++ b/test/kernels/mttkrp_gemm/taco_default.h @@ -0,0 +1,125 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? 
(_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = 
(taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +#ifndef TACO_GENERATED_assemble +#define TACO_GENERATED_assemble +int assemble(taco_tensor_t *A1538, taco_tensor_t *matmul_5_5_5, taco_tensor_t *A1475, taco_tensor_t *A1416, taco_tensor_t *A1479); +#endif + +#ifndef TACO_GENERATED_compute +#define TACO_GENERATED_compute +int compute(taco_tensor_t *A1538, taco_tensor_t *matmul_5_5_5, taco_tensor_t *A1475, taco_tensor_t *A1416, taco_tensor_t *A1479); +#endif diff --git a/test/kernels/sddmm_spmm/csr_dense_dense_sddmm.c b/test/kernels/sddmm_spmm/csr_dense_dense_sddmm.c new file mode 100644 index 000000000..a5e031e7a --- /dev/null +++ b/test/kernels/sddmm_spmm/csr_dense_dense_sddmm.c @@ -0,0 +1,199 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? 
(_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = 
(taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +int assemble(taco_tensor_t *A2531, taco_tensor_t *cage3, taco_tensor_t *A1392, taco_tensor_t *A1451) { + int* restrict A25312_pos = (int*)(A2531->indices[1][0]); + int* restrict A25312_crd = (int*)(A2531->indices[1][1]); + double* restrict A2531_vals = (double*)(A2531->vals); + int* restrict cage32_pos = (int*)(cage3->indices[1][0]); + int* restrict cage32_crd = (int*)(cage3->indices[1][1]); + int A13921_dimension = (int)(A1392->dimensions[0]); + + A25312_pos = (int32_t*)malloc(sizeof(int32_t) * 6); + A25312_pos[0] = 0; + for (int32_t pA25312 = 1; pA25312 < 6; pA25312++) { + A25312_pos[pA25312] = 0; + } + int32_t A25312_crd_size = 1048576; + A25312_crd = (int32_t*)malloc(sizeof(int32_t) * A25312_crd_size); + int32_t i1468A2531 = 0; + + for (int32_t i1467 = 0; i1467 < A13921_dimension; i1467++) { + int32_t pA25312_begin = i1468A2531; + + for (int32_t i1468cage3 = cage32_pos[i1467]; i1468cage3 < cage32_pos[(i1467 + 1)]; i1468cage3++) { + int32_t i1468 = cage32_crd[i1468cage3]; + if (A25312_crd_size <= 
i1468A2531) { + A25312_crd = (int32_t*)realloc(A25312_crd, sizeof(int32_t) * (A25312_crd_size * 2)); + A25312_crd_size *= 2; + } + A25312_crd[i1468A2531] = i1468; + i1468A2531++; + } + + A25312_pos[i1467 + 1] = i1468A2531 - pA25312_begin; + } + + int32_t csA25312 = 0; + for (int32_t pA253120 = 1; pA253120 < 6; pA253120++) { + csA25312 += A25312_pos[pA253120]; + A25312_pos[pA253120] = csA25312; + } + + A2531_vals = (double*)malloc(sizeof(double) * i1468A2531); + + A2531->indices[1][0] = (uint8_t*)(A25312_pos); + A2531->indices[1][1] = (uint8_t*)(A25312_crd); + A2531->vals = (uint8_t*)A2531_vals; + return 0; +} + +int compute(taco_tensor_t *A2531, taco_tensor_t *cage3, taco_tensor_t *A1392, taco_tensor_t *A1451) { + double* restrict A2531_vals = (double*)(A2531->vals); + int* restrict cage32_pos = (int*)(cage3->indices[1][0]); + int* restrict cage32_crd = (int*)(cage3->indices[1][1]); + double* restrict cage3_vals = (double*)(cage3->vals); + int A13921_dimension = (int)(A1392->dimensions[0]); + int A13922_dimension = (int)(A1392->dimensions[1]); + double* restrict A1392_vals = (double*)(A1392->vals); + int A14512_dimension = (int)(A1451->dimensions[1]); + double* restrict A1451_vals = (double*)(A1451->vals); + +// int32_t i1468A2531 = 0; + + #pragma omp parallel for schedule(runtime) + for (int32_t i1467 = 0; i1467 < A13921_dimension; i1467++) { + for (int32_t i1468cage3 = cage32_pos[i1467]; i1468cage3 < cage32_pos[(i1467 + 1)]; i1468cage3++) { + int32_t i1468 = cage32_crd[i1468cage3]; + double ti1469A2531_val = 0.0; + for (int32_t i1469 = 0; i1469 < A14512_dimension; i1469++) { + int32_t i1469A1392 = i1467 * A13922_dimension + i1469; + int32_t i1469A1451 = i1468 * A14512_dimension + i1469; + ti1469A2531_val += (cage3_vals[i1468cage3] * A1392_vals[i1469A1392]) * A1451_vals[i1469A1451]; + } + A2531_vals[i1468cage3] = ti1469A2531_val; + // i1468A2531++; + } + } + return 0; +} +#include 
"/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/csr_dense_dense_sddmm.h" +int _shim_assemble(void** parameterPack) { + return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3])); +} +int _shim_compute(void** parameterPack) { + return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3])); +} diff --git a/test/kernels/sddmm_spmm/csr_dense_dense_sddmm.h b/test/kernels/sddmm_spmm/csr_dense_dense_sddmm.h new file mode 100644 index 000000000..a9d6b760d --- /dev/null +++ b/test/kernels/sddmm_spmm/csr_dense_dense_sddmm.h @@ -0,0 +1,125 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? 
(_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = 
(taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +#ifndef TACO_GENERATED_assemble +#define TACO_GENERATED_assemble +int assemble(taco_tensor_t *A2531, taco_tensor_t *cage3, taco_tensor_t *A1392, taco_tensor_t *A1451); +#endif + +#ifndef TACO_GENERATED_compute +#define TACO_GENERATED_compute +int compute(taco_tensor_t *A2531, taco_tensor_t *cage3, taco_tensor_t *A1392, taco_tensor_t *A1451); +#endif diff --git a/test/kernels/sddmm_spmm/csr_dense_dense_sddmm.so b/test/kernels/sddmm_spmm/csr_dense_dense_sddmm.so new file mode 100755 index 000000000..c2c5ca30e Binary files /dev/null and b/test/kernels/sddmm_spmm/csr_dense_dense_sddmm.so differ diff --git a/test/kernels/sddmm_spmm/csr_dense_spmm.c b/test/kernels/sddmm_spmm/csr_dense_spmm.c new file mode 100644 index 000000000..7f710f6c1 --- /dev/null +++ b/test/kernels/sddmm_spmm/csr_dense_spmm.c @@ -0,0 +1,190 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < 
(_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* 
mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +int assemble(taco_tensor_t *A2535, taco_tensor_t *A2531, taco_tensor_t *A1455) { + int A25352_dimension = (int)(A2535->dimensions[1]); + double* restrict A2535_vals = (double*)(A2535->vals); + + A2535_vals = (double*)malloc(sizeof(double) * (5 * A25352_dimension)); + + A2535->vals = (uint8_t*)A2535_vals; + return 0; +} + +int compute(taco_tensor_t *C, taco_tensor_t *A, taco_tensor_t *B) { + int C1_dimension = (int)(C->dimensions[0]); + int C2_dimension = (int)(C->dimensions[1]); + double* restrict C_vals = (double*)(C->vals); + int A1_dimension = (int)(A->dimensions[0]); + int* restrict A2_pos = (int*)(A->indices[1][0]); + int* restrict A2_crd = (int*)(A->indices[1][1]); + double* restrict A_vals = (double*)(A->vals); + int B1_dimension = (int)(B->dimensions[0]); + int B2_dimension = (int)(B->dimensions[1]); + double* restrict B_vals = (double*)(B->vals); + + #pragma omp parallel for schedule(static) + for (int32_t pC = 0; pC < 
(C1_dimension * C2_dimension); pC++) { + C_vals[pC] = 0.0; + } + + #pragma omp parallel for schedule(dynamic, 1) + for (int32_t i0 = 0; i0 < ((A1_dimension + 15) / 16); i0++) { + for (int32_t i1 = 0; i1 < 16; i1++) { + int32_t i = i0 * 16 + i1; + if (i >= A1_dimension) + continue; + + for (int32_t jpos0 = A2_pos[i] / 4; jpos0 < ((A2_pos[(i + 1)] + 3) / 4); jpos0++) { + int32_t jposA = jpos0 * 4; + if (jpos0 * 4 < A2_pos[i] || (jpos0 * 4 + 4) + ((jpos0 * 4 + 4) - jpos0 * 4) >= A2_pos[(i + 1)]) { + for (int32_t k = 0; k < B2_dimension; k++) { + int32_t kC = i * C2_dimension + k; + for (int32_t jpos1 = 0; jpos1 < 4; jpos1++) { + int32_t jposA = jpos0 * 4 + jpos1; + if (jposA < A2_pos[i] || jposA >= A2_pos[(i + 1)]) + continue; + + int32_t j = A2_crd[jposA]; + int32_t kB = j * B2_dimension + k; + C_vals[kC] = C_vals[kC] + A_vals[jposA] * B_vals[kB]; + } + } + } + else { + #pragma clang loop interleave(enable) vectorize(enable) + for (int32_t k = 0; k < B2_dimension; k++) { + int32_t kC = i * C2_dimension + k; + for (int32_t jpos1 = 0; jpos1 < 4; jpos1++) { + int32_t jposA = jpos0 * 4 + jpos1; + int32_t j = A2_crd[jposA]; + int32_t kB = j * B2_dimension + k; + C_vals[kC] = C_vals[kC] + A_vals[jposA] * B_vals[kB]; + } + } + } + } + } + } + return 0; +} +#include "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/csr_dense_spmm.h" +int _shim_assemble(void** parameterPack) { + return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2])); +} +int _shim_compute(void** parameterPack) { + return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2])); +} diff --git a/test/kernels/sddmm_spmm/csr_dense_spmm.h b/test/kernels/sddmm_spmm/csr_dense_spmm.h new file mode 100644 index 000000000..cf0cf205c --- /dev/null +++ b/test/kernels/sddmm_spmm/csr_dense_spmm.h @@ -0,0 +1,125 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include 
+#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + 
} + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +#ifndef TACO_GENERATED_assemble +#define TACO_GENERATED_assemble +int assemble(taco_tensor_t *A2535, taco_tensor_t *A2531, taco_tensor_t *A1455); +#endif + +#ifndef TACO_GENERATED_compute +#define TACO_GENERATED_compute +int compute(taco_tensor_t *A2535, taco_tensor_t *A2531, taco_tensor_t *A1455); +#endif diff --git a/test/kernels/sddmm_spmm/csr_dense_spmm.so b/test/kernels/sddmm_spmm/csr_dense_spmm.so new file mode 100755 index 000000000..398362532 Binary files /dev/null and b/test/kernels/sddmm_spmm/csr_dense_spmm.so differ diff --git a/test/kernels/sddmm_spmm/fused_kernel.c b/test/kernels/sddmm_spmm/fused_kernel.c new file mode 100644 index 000000000..1572bce5a --- /dev/null +++ b/test/kernels/sddmm_spmm/fused_kernel.c @@ -0,0 +1,183 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include 
+#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return 
lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +int assemble(taco_tensor_t *A1459, taco_tensor_t *cage3, taco_tensor_t *A1392, taco_tensor_t *A1451, taco_tensor_t *A1455) { + int A14592_dimension = (int)(A1459->dimensions[1]); + double* restrict A1459_vals = (double*)(A1459->vals); + + A1459_vals = (double*)malloc(sizeof(double) * (5 * A14592_dimension)); + + A1459->vals = (uint8_t*)A1459_vals; + return 0; +} + +int compute(taco_tensor_t *A1459, taco_tensor_t *B, taco_tensor_t *A1392, taco_tensor_t *A1451, taco_tensor_t *A1455) { + int A14591_dimension = (int)(A1459->dimensions[0]); + int A14592_dimension = (int)(A1459->dimensions[1]); + double* restrict A1459_vals = (double*)(A1459->vals); + int B1_dimension = (int)(B->dimensions[0]); + int* restrict B2_pos = (int*)(B->indices[1][0]); + int* restrict B2_crd = (int*)(B->indices[1][1]); + double* restrict B_vals = 
(double*)(B->vals); + int A13921_dimension = (int)(A1392->dimensions[0]); + int A13922_dimension = (int)(A1392->dimensions[1]); + double* restrict A1392_vals = (double*)(A1392->vals); + int A14511_dimension = (int)(A1451->dimensions[0]); + int A14512_dimension = (int)(A1451->dimensions[1]); + double* restrict A1451_vals = (double*)(A1451->vals); + int A14551_dimension = (int)(A1455->dimensions[0]); + int A14552_dimension = (int)(A1455->dimensions[1]); + double* restrict A1455_vals = (double*)(A1455->vals); + + #pragma omp parallel for schedule(static) + for (int32_t pA1459 = 0; pA1459 < (A14591_dimension * A14592_dimension); pA1459++) { + A1459_vals[pA1459] = 0.0; + } + + + #pragma omp parallel for schedule(runtime) + for (int32_t i0 = 0; i0 < ((A13921_dimension + 15) / 16); i0++) { + + for (int32_t i1 = 0; i1 < 16; i1++) { + int32_t i1467 = i0 * 16 + i1; + if (i1467 >= A13921_dimension) + continue; + + for (int32_t i1468B = B2_pos[i1467]; i1468B < B2_pos[(i1467 + 1)]; i1468B++) { + int32_t i1468 = B2_crd[i1468B]; + double tA1459_val = 0.0; + for (int32_t i1469 = 0; i1469 < A14512_dimension; i1469++) { + int32_t i1469A1392 = i1467 * A13922_dimension + i1469; + int32_t i1469A1451 = i1468 * A14512_dimension + i1469; + tA1459_val += (B_vals[i1468B] * A1392_vals[i1469A1392]) * A1451_vals[i1469A1451]; + } + for (int32_t i1470 = 0; i1470 < A14552_dimension; i1470++) { + int32_t i1470A1459 = i1467 * A14592_dimension + i1470; + int32_t i1470A1455 = i1468 * A14552_dimension + i1470; + A1459_vals[i1470A1459] = A1459_vals[i1470A1459] + tA1459_val * A1455_vals[i1470A1455]; + } + } + } + } + return 0; +} +#include "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/fused_kernel.h" +int _shim_assemble(void** parameterPack) { + return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3]), (taco_tensor_t*)(parameterPack[4])); +} +int _shim_compute(void** 
parameterPack) { + return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3]), (taco_tensor_t*)(parameterPack[4])); +} diff --git a/test/kernels/sddmm_spmm/fused_kernel.h b/test/kernels/sddmm_spmm/fused_kernel.h new file mode 100644 index 000000000..e67e5a761 --- /dev/null +++ b/test/kernels/sddmm_spmm/fused_kernel.h @@ -0,0 +1,125 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; 
+} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +#ifndef TACO_GENERATED_assemble +#define TACO_GENERATED_assemble +int assemble(taco_tensor_t *A1459, taco_tensor_t *cage3, taco_tensor_t *A1392, taco_tensor_t *A1451, taco_tensor_t *A1455); +#endif + +#ifndef TACO_GENERATED_compute +#define TACO_GENERATED_compute +int compute(taco_tensor_t *A1459, taco_tensor_t *cage3, taco_tensor_t 
*A1392, taco_tensor_t *A1451, taco_tensor_t *A1455); +#endif diff --git a/test/kernels/sddmm_spmm/fused_kernel.so b/test/kernels/sddmm_spmm/fused_kernel.so new file mode 100755 index 000000000..10619e0ca Binary files /dev/null and b/test/kernels/sddmm_spmm/fused_kernel.so differ diff --git a/test/kernels/sddmm_spmm/sddmm_ryan.c b/test/kernels/sddmm_spmm/sddmm_ryan.c new file mode 100644 index 000000000..760fb5361 --- /dev/null +++ b/test/kernels/sddmm_spmm/sddmm_ryan.c @@ -0,0 +1,210 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; 
+ } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +int assemble(taco_tensor_t *A2531, taco_tensor_t *cage3, taco_tensor_t *A1392, taco_tensor_t *A1451) { + int* restrict A25312_pos = (int*)(A2531->indices[1][0]); + int* restrict A25312_crd = (int*)(A2531->indices[1][1]); + double* restrict A2531_vals = (double*)(A2531->vals); + 
int* restrict cage32_pos = (int*)(cage3->indices[1][0]); + int* restrict cage32_crd = (int*)(cage3->indices[1][1]); + int A13921_dimension = (int)(A1392->dimensions[0]); + + A25312_pos = (int32_t*)malloc(sizeof(int32_t) * 6); + A25312_pos[0] = 0; + for (int32_t pA25312 = 1; pA25312 < 6; pA25312++) { + A25312_pos[pA25312] = 0; + } + int32_t A25312_crd_size = 1048576; + A25312_crd = (int32_t*)malloc(sizeof(int32_t) * A25312_crd_size); + int32_t i1468A2531 = 0; + + for (int32_t i1467 = 0; i1467 < A13921_dimension; i1467++) { + int32_t pA25312_begin = i1468A2531; + + for (int32_t i1468cage3 = cage32_pos[i1467]; i1468cage3 < cage32_pos[(i1467 + 1)]; i1468cage3++) { + int32_t i1468 = cage32_crd[i1468cage3]; + if (A25312_crd_size <= i1468A2531) { + A25312_crd = (int32_t*)realloc(A25312_crd, sizeof(int32_t) * (A25312_crd_size * 2)); + A25312_crd_size *= 2; + } + A25312_crd[i1468A2531] = i1468; + i1468A2531++; + } + + A25312_pos[i1467 + 1] = i1468A2531 - pA25312_begin; + } + + int32_t csA25312 = 0; + for (int32_t pA253120 = 1; pA253120 < 6; pA253120++) { + csA25312 += A25312_pos[pA253120]; + A25312_pos[pA253120] = csA25312; + } + + A2531_vals = (double*)malloc(sizeof(double) * i1468A2531); + + A2531->indices[1][0] = (uint8_t*)(A25312_pos); + A2531->indices[1][1] = (uint8_t*)(A25312_crd); + A2531->vals = (uint8_t*)A2531_vals; + return 0; +} + +int compute(taco_tensor_t *A, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D) { + + int A1_dimension = (int)(A->dimensions[0]); + double* restrict A_vals = (double*)(A->vals); + int B1_dimension = (int)(B->dimensions[0]); + int* restrict B2_pos = (int*)(B->indices[1][0]); + int* restrict B2_crd = (int*)(B->indices[1][1]); + double* restrict B_vals = (double*)(B->vals); + int C1_dimension = (int)(C->dimensions[0]); + int C2_dimension = (int)(C->dimensions[1]); + double* restrict C_vals = (double*)(C->vals); + int D1_dimension = (int)(D->dimensions[0]); + int D2_dimension = (int)(D->dimensions[1]); + double* restrict D_vals = 
(double*)(D->vals); + + int32_t jA = 0; + + #pragma omp parallel for schedule(runtime) + for (int32_t i0 = 0; i0 < ((C1_dimension + 15) / 16); i0++) { + for (int32_t i1 = 0; i1 < 16; i1++) { + int32_t i = i0 * 16 + i1; + if (i >= C1_dimension) + continue; + + for (int32_t jB = B2_pos[i]; jB < B2_pos[(i + 1)]; jB++) { + int32_t j = B2_crd[jB]; + double tkA_val = 0.0; + for (int32_t k = 0; k < D2_dimension; k++) { + int32_t kC = i * C2_dimension + k; + int32_t kD = j * D2_dimension + k; + tkA_val += (B_vals[jB] * C_vals[kC]) * D_vals[kD]; + } + A_vals[jB] = tkA_val; + // jA++; + } + } + } + return 0; + +} +#include "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/sddmm_ryan.h" +int _shim_assemble(void** parameterPack) { + return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3])); +} +int _shim_compute(void** parameterPack) { + return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3])); +} diff --git a/test/kernels/sddmm_spmm/sddmm_ryan.h b/test/kernels/sddmm_spmm/sddmm_ryan.h new file mode 100644 index 000000000..f0f9e372a --- /dev/null +++ b/test/kernels/sddmm_spmm/sddmm_ryan.h @@ -0,0 +1,125 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? 
(_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = 
(taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +#ifndef TACO_GENERATED_assemble +#define TACO_GENERATED_assemble +int assemble(taco_tensor_t *A2531, taco_tensor_t *cage3, taco_tensor_t *A1392, taco_tensor_t *A1451); +#endif + +#ifndef TACO_GENERATED_compute +#define TACO_GENERATED_compute +int compute(taco_tensor_t *A, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D); +#endif diff --git a/test/kernels/sddmm_spmm/sddmm_ryan.so b/test/kernels/sddmm_spmm/sddmm_ryan.so new file mode 100755 index 000000000..c3deae084 Binary files /dev/null and b/test/kernels/sddmm_spmm/sddmm_ryan.so differ diff --git a/test/kernels/sddmm_spmm/taco_original.c b/test/kernels/sddmm_spmm/taco_original.c new file mode 100644 index 000000000..4f084ff5e --- /dev/null +++ b/test/kernels/sddmm_spmm/taco_original.c @@ -0,0 +1,166 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? 
(_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + 
taco_mode_t* mode_types) { + taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +int assemble(taco_tensor_t *A1463, taco_tensor_t *cage3, taco_tensor_t *A1392, taco_tensor_t *A1451, taco_tensor_t *A1455) { + int A14632_dimension = (int)(A1463->dimensions[1]); + double* restrict A1463_vals = (double*)(A1463->vals); + + A1463_vals = (double*)malloc(sizeof(double) * (5 * A14632_dimension)); + + A1463->vals = (uint8_t*)A1463_vals; + return 0; +} + +int compute(taco_tensor_t *A1463, taco_tensor_t *cage3, taco_tensor_t *A1392, taco_tensor_t *A1451, taco_tensor_t *A1455) { + int A14632_dimension = (int)(A1463->dimensions[1]); + double* restrict A1463_vals = (double*)(A1463->vals); + int* restrict cage32_pos = (int*)(cage3->indices[1][0]); + int* restrict cage32_crd = (int*)(cage3->indices[1][1]); + double* restrict cage3_vals = (double*)(cage3->vals); + int A13921_dimension = (int)(A1392->dimensions[0]); + int A13922_dimension = (int)(A1392->dimensions[1]); + double* restrict A1392_vals = (double*)(A1392->vals); + int A14512_dimension = 
(int)(A1451->dimensions[1]); + double* restrict A1451_vals = (double*)(A1451->vals); + int A14552_dimension = (int)(A1455->dimensions[1]); + double* restrict A1455_vals = (double*)(A1455->vals); + + #pragma omp parallel for schedule(runtime) + for (int32_t i1467 = 0; i1467 < A13921_dimension; i1467++) { + for (int32_t i1470 = 0; i1470 < A14552_dimension; i1470++) { + int32_t i1470A1463 = i1467 * A14632_dimension + i1470; + double ti1468A1463_val = 0.0; + for (int32_t i1468cage3 = cage32_pos[i1467]; i1468cage3 < cage32_pos[(i1467 + 1)]; i1468cage3++) { + int32_t i1468 = cage32_crd[i1468cage3]; + int32_t i1470A1455 = i1468 * A14552_dimension + i1470; + for (int32_t i1469 = 0; i1469 < A14512_dimension; i1469++) { + int32_t i1469A1392 = i1467 * A13922_dimension + i1469; + int32_t i1469A1451 = i1468 * A14512_dimension + i1469; + ti1468A1463_val += ((cage3_vals[i1468cage3] * A1392_vals[i1469A1392]) * A1451_vals[i1469A1451]) * A1455_vals[i1470A1455]; + } + } + A1463_vals[i1470A1463] = ti1468A1463_val; + } + } + return 0; +} +#include "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/taco_original.h" +int _shim_assemble(void** parameterPack) { + return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3]), (taco_tensor_t*)(parameterPack[4])); +} +int _shim_compute(void** parameterPack) { + return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3]), (taco_tensor_t*)(parameterPack[4])); +} diff --git a/test/kernels/sddmm_spmm/taco_original.h b/test/kernels/sddmm_spmm/taco_original.h new file mode 100644 index 000000000..71ce53402 --- /dev/null +++ b/test/kernels/sddmm_spmm/taco_original.h @@ -0,0 +1,125 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif 
+#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* 
dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +#ifndef TACO_GENERATED_assemble +#define TACO_GENERATED_assemble +int assemble(taco_tensor_t *A1463, taco_tensor_t *cage3, taco_tensor_t *A1392, taco_tensor_t *A1451, taco_tensor_t *A1455); +#endif + +#ifndef TACO_GENERATED_compute +#define TACO_GENERATED_compute +int compute(taco_tensor_t *A1463, taco_tensor_t *cage3, taco_tensor_t *A1392, taco_tensor_t *A1451, taco_tensor_t *A1455); +#endif diff --git a/test/kernels/sddmm_spmm/taco_original.so b/test/kernels/sddmm_spmm/taco_original.so new file mode 100755 index 000000000..f50931baa Binary files /dev/null and b/test/kernels/sddmm_spmm/taco_original.so differ diff --git a/test/kernels/spmm_gemm/gemm_default.c b/test/kernels/spmm_gemm/gemm_default.c new file mode 100644 index 000000000..605cc491f --- /dev/null +++ b/test/kernels/spmm_gemm/gemm_default.c @@ -0,0 +1,160 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include 
+#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} 
+taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +int assemble(taco_tensor_t *A2039, taco_tensor_t *A2035, taco_tensor_t *A1450) { + int A20391_dimension = (int)(A2039->dimensions[0]); + int A20392_dimension = (int)(A2039->dimensions[1]); + double* restrict A2039_vals = (double*)(A2039->vals); + + A2039_vals = (double*)malloc(sizeof(double) * (A20391_dimension * A20392_dimension)); + + A2039->vals = (uint8_t*)A2039_vals; + return 0; +} + +int compute(taco_tensor_t *A2039, taco_tensor_t *A2035, taco_tensor_t *A1450) { + int A20391_dimension = (int)(A2039->dimensions[0]); + int A20392_dimension = (int)(A2039->dimensions[1]); + double* restrict A2039_vals = (double*)(A2039->vals); + int A20351_dimension = (int)(A2035->dimensions[0]); + int A20352_dimension = (int)(A2035->dimensions[1]); + double* restrict A2035_vals = (double*)(A2035->vals); + int A14501_dimension = 
(int)(A1450->dimensions[0]); + int A14502_dimension = (int)(A1450->dimensions[1]); + double* restrict A1450_vals = (double*)(A1450->vals); + + #pragma omp parallel for schedule(runtime) + for (int32_t i1517 = 0; i1517 < A20351_dimension; i1517++) { + for (int32_t i1520 = 0; i1520 < A14502_dimension; i1520++) { + int32_t i1520A2039 = i1517 * A20392_dimension + i1520; + double ti1519A2039_val = 0.0; + for (int32_t i1519 = 0; i1519 < A14501_dimension; i1519++) { + int32_t i1519A2035 = i1517 * A20352_dimension + i1519; + int32_t i1520A1450 = i1519 * A14502_dimension + i1520; + ti1519A2039_val += A2035_vals[i1519A2035] * A1450_vals[i1520A1450]; + } + A2039_vals[i1520A2039] = ti1519A2039_val; + } + } + return 0; +} +#include "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/spmm_gemm/gemm_default.h" +int _shim_assemble(void** parameterPack) { + return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2])); +} +int _shim_compute(void** parameterPack) { + return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2])); +} diff --git a/test/kernels/spmm_gemm/gemm_default.h b/test/kernels/spmm_gemm/gemm_default.h new file mode 100644 index 000000000..769514531 --- /dev/null +++ b/test/kernels/spmm_gemm/gemm_default.h @@ -0,0 +1,125 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? 
(_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = 
(taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +#ifndef TACO_GENERATED_assemble +#define TACO_GENERATED_assemble +int assemble(taco_tensor_t *A2039, taco_tensor_t *A2035, taco_tensor_t *A1450); +#endif + +#ifndef TACO_GENERATED_compute +#define TACO_GENERATED_compute +int compute(taco_tensor_t *A2039, taco_tensor_t *A2035, taco_tensor_t *A1450); +#endif diff --git a/test/kernels/spmm_gemm/gemm_default.so b/test/kernels/spmm_gemm/gemm_default.so new file mode 100755 index 000000000..9de7a7933 Binary files /dev/null and b/test/kernels/spmm_gemm/gemm_default.so differ diff --git a/test/kernels/spmm_gemm/gemm_template.c b/test/kernels/spmm_gemm/gemm_template.c new file mode 100644 index 000000000..4a4e5faeb --- /dev/null +++ b/test/kernels/spmm_gemm/gemm_template.c @@ -0,0 +1,183 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? 
(_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = 
(taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +int assemble(taco_tensor_t *A2039, taco_tensor_t *A2035, taco_tensor_t *A1450) { + int A20391_dimension = (int)(A2039->dimensions[0]); + int A20392_dimension = (int)(A2039->dimensions[1]); + double* restrict A2039_vals = (double*)(A2039->vals); + + A2039_vals = (double*)malloc(sizeof(double) * (A20391_dimension * A20392_dimension)); + + A2039->vals = (uint8_t*)A2039_vals; + return 0; +} + +int compute(taco_tensor_t *A, taco_tensor_t *B, taco_tensor_t *C) { + int A1_dimension = (int)(A->dimensions[0]); + int A2_dimension = (int)(A->dimensions[1]); + double* restrict A_vals = (double*)(A->vals); + int B1_dimension = (int)(B->dimensions[0]); + int B2_dimension = (int)(B->dimensions[1]); + double* restrict B_vals = (double*)(B->vals); + int C1_dimension = (int)(C->dimensions[0]); + int C2_dimension = (int)(C->dimensions[1]); + double* restrict C_vals = (double*)(C->vals); + + #pragma omp parallel for schedule(static) + for (int32_t pA = 0; pA < (A1_dimension * A2_dimension); pA++) { + A_vals[pA] = 0.0; + 
} + + #pragma omp parallel for schedule(runtime) + for (int32_t i0 = 0; i0 < ((B1_dimension + 15) / 16); i0++) { + for (int32_t j0 = 0; j0 < ((C1_dimension + 15) / 16); j0++) { + for (int32_t k0 = 0; k0 < ((C2_dimension + 15) / 16); k0++) { + for (int32_t i1 = 0; i1 < 16; i1++) { + int32_t i = i0 * 16 + i1; + if (i >= B1_dimension) + continue; + + for (int32_t j1 = 0; j1 < 16; j1++) { + int32_t j = j0 * 16 + j1; + int32_t jB = i * B2_dimension + j; + int32_t jA = i * A2_dimension + j; + if (j >= C1_dimension) + continue; + + double tk1A_val = 0.0; + for (int32_t k1 = 0; k1 < 16; k1++) { + int32_t k = k0 * 16 + k1; + int32_t kC = j * C2_dimension + k; + if (k >= C2_dimension) + continue; + + tk1A_val += B_vals[jB] * C_vals[kC]; + } + A_vals[jA] = A_vals[jA] + tk1A_val; + } + } + } + } + } + return 0; +} +#include "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/spmm_gemm/gemm_template.h" +int _shim_assemble(void** parameterPack) { + return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2])); +} +int _shim_compute(void** parameterPack) { + return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2])); +} diff --git a/test/kernels/spmm_gemm/gemm_template.h b/test/kernels/spmm_gemm/gemm_template.h new file mode 100644 index 000000000..769514531 --- /dev/null +++ b/test/kernels/spmm_gemm/gemm_template.h @@ -0,0 +1,125 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? 
(_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = 
(taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +#ifndef TACO_GENERATED_assemble +#define TACO_GENERATED_assemble +int assemble(taco_tensor_t *A2039, taco_tensor_t *A2035, taco_tensor_t *A1450); +#endif + +#ifndef TACO_GENERATED_compute +#define TACO_GENERATED_compute +int compute(taco_tensor_t *A2039, taco_tensor_t *A2035, taco_tensor_t *A1450); +#endif diff --git a/test/kernels/spmm_gemm/gemm_template.so b/test/kernels/spmm_gemm/gemm_template.so new file mode 100755 index 000000000..2cfcd7ad3 Binary files /dev/null and b/test/kernels/spmm_gemm/gemm_template.so differ diff --git a/test/kernels/spmv_spmv/spmv_fused.c b/test/kernels/spmv_spmv/spmv_fused.c new file mode 100644 index 000000000..0964fb8e1 --- /dev/null +++ b/test/kernels/spmv_spmv/spmv_fused.c @@ -0,0 +1,178 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? 
(_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = 
(taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +int assemble(taco_tensor_t *A, taco_tensor_t *C, taco_tensor_t *v, taco_tensor_t *B) { + double* restrict A_vals = (double*)(A->vals); + + A_vals = (double*)malloc(sizeof(double) * 5); + + A->vals = (uint8_t*)A_vals; + return 0; +} + +int compute(taco_tensor_t *A, taco_tensor_t *C, taco_tensor_t *v, taco_tensor_t *B) { + printf("Adhitha1\n"); + + double* restrict A_vals = (double*)(A->vals); + int* restrict C2_pos = (int*)(C->indices[1][0]); + int* restrict C2_crd = (int*)(C->indices[1][1]); + double* restrict C_vals = (double*)(C->vals); + double* restrict v_vals = (double*)(v->vals); + printf("Adhitha2\n"); + int B1_dimension = (int)(B->dimensions[0]); + int C1_dimension = (int)(C->dimensions[0]); /* row count of C: bounds the C2_pos walk and sizes the tA scratch vector */ + printf("Adhitha3 %d, %d\n", B1_dimension, C1_dimension); + int* restrict B2_pos = (int*)(B->indices[1][0]); + printf("Adhitha4\n"); + int* restrict B2_crd = (int*)(B->indices[1][1]); + printf("Adhitha2\n"); + double* restrict B_vals = (double*)(B->vals); + + printf("Adhitha3\n"); + + double* restrict tA
= 0; + tA = (double*)malloc(sizeof(double) * C1_dimension); + for (int32_t ptA = 0; ptA < C1_dimension; ptA++) { + tA[ptA] = 0.0; + } + for (int32_t i1439 = 0; i1439 < C1_dimension; i1439++) { + double ti1440tA_val = 0.0; + for (int32_t i1440C = C2_pos[i1439]; i1440C < C2_pos[(i1439 + 1)]; i1440C++) { + int32_t i1440 = C2_crd[i1440C]; + ti1440tA_val += C_vals[i1440C] * v_vals[i1440]; + } + tA[i1439] = ti1440tA_val; + } + for (int32_t i1438 = 0; i1438 < B1_dimension; i1438++) { + double ti1439A_val = 0.0; + for (int32_t i1439B = B2_pos[i1438]; i1439B < B2_pos[(i1438 + 1)]; i1439B++) { + int32_t i1439 = B2_crd[i1439B]; + ti1439A_val += B_vals[i1439B] * tA[i1439]; + } + A_vals[i1438] = ti1439A_val; + } + free(tA); + + A->vals = (uint8_t*)A_vals; + return 0; +} +#include "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/spmv_spmv/spmv_fused.h" +int _shim_assemble(void** parameterPack) { + return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3])); +} +int _shim_compute(void** parameterPack) { + return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3])); +} diff --git a/test/kernels/spmv_spmv/spmv_fused.h b/test/kernels/spmv_spmv/spmv_fused.h new file mode 100644 index 000000000..bc78275ac --- /dev/null +++ b/test/kernels/spmv_spmv/spmv_fused.h @@ -0,0 +1,125 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? 
(_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = 
(taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +#ifndef TACO_GENERATED_assemble +#define TACO_GENERATED_assemble +int assemble(taco_tensor_t *A, taco_tensor_t *C, taco_tensor_t *v, taco_tensor_t *B); +#endif + +#ifndef TACO_GENERATED_compute +#define TACO_GENERATED_compute +int compute(taco_tensor_t *A, taco_tensor_t *C, taco_tensor_t *v, taco_tensor_t *B); +#endif diff --git a/test/kernels/spmv_spmv/spmv_fused.so b/test/kernels/spmv_spmv/spmv_fused.so new file mode 100755 index 000000000..5efd6a4d8 Binary files /dev/null and b/test/kernels/spmv_spmv/spmv_fused.so differ diff --git a/test/kernels/spmv_spmv/spmv_spmv_default.c b/test/kernels/spmv_spmv/spmv_spmv_default.c new file mode 100644 index 000000000..dfaa1c4b0 --- /dev/null +++ b/test/kernels/spmv_spmv/spmv_spmv_default.c @@ -0,0 +1,157 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? 
(_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = 
(taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +int assemble(taco_tensor_t *ref, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *v) { + double* restrict ref_vals = (double*)(ref->vals); + + ref_vals = (double*)malloc(sizeof(double) * 5); + + ref->vals = (uint8_t*)ref_vals; + return 0; +} + +int compute(taco_tensor_t *ref, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *v) { + double* restrict ref_vals = (double*)(ref->vals); + int B1_dimension = (int)(B->dimensions[0]); + int* restrict B2_pos = (int*)(B->indices[1][0]); + int* restrict B2_crd = (int*)(B->indices[1][1]); + double* restrict B_vals = (double*)(B->vals); + int* restrict C2_pos = (int*)(C->indices[1][0]); + int* restrict C2_crd = (int*)(C->indices[1][1]); + double* restrict C_vals = (double*)(C->vals); + double* restrict v_vals = (double*)(v->vals); + + #pragma omp parallel for schedule(runtime) + for (int32_t i1438 = 0; i1438 < B1_dimension; i1438++) { + double ti1439ref_val = 0.0; + for (int32_t i1439B = B2_pos[i1438]; i1439B < B2_pos[(i1438 + 1)]; i1439B++) { + int32_t i1439 = 
B2_crd[i1439B]; + for (int32_t i1440C = C2_pos[i1439]; i1440C < C2_pos[(i1439 + 1)]; i1440C++) { + int32_t i1440 = C2_crd[i1440C]; + ti1439ref_val += (B_vals[i1439B] * C_vals[i1440C]) * v_vals[i1440]; + } + } + ref_vals[i1438] = ti1439ref_val; + } + return 0; +} +#include "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/spmv_spmv/spmv_spmv_default.h" +int _shim_assemble(void** parameterPack) { + return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3])); +} +int _shim_compute(void** parameterPack) { + return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3])); +} diff --git a/test/kernels/spmv_spmv/spmv_spmv_default.h b/test/kernels/spmv_spmv/spmv_spmv_default.h new file mode 100644 index 000000000..b53193484 --- /dev/null +++ b/test/kernels/spmv_spmv/spmv_spmv_default.h @@ -0,0 +1,125 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? 
(_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = 
(taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +#ifndef TACO_GENERATED_assemble +#define TACO_GENERATED_assemble +int assemble(taco_tensor_t *ref, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *v); +#endif + +#ifndef TACO_GENERATED_compute +#define TACO_GENERATED_compute +int compute(taco_tensor_t *ref, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *v); +#endif diff --git a/test/kernels/ttm_ttm/fused copy.c b/test/kernels/ttm_ttm/fused copy.c new file mode 100644 index 000000000..5d40c8aa9 --- /dev/null +++ b/test/kernels/ttm_ttm/fused copy.c @@ -0,0 +1,248 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? 
(_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = 
(taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +int assemble(taco_tensor_t *A1532, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D) { + int A15321_dimension = (int)(A1532->dimensions[0]); + int A15323_dimension = (int)(A1532->dimensions[2]); + int* restrict A15322_pos = (int*)(A1532->indices[1][0]); + int* restrict A15322_crd = (int*)(A1532->indices[1][1]); + double* restrict A1532_vals = (double*)(A1532->vals); + int B1_dimension = (int)(B->dimensions[0]); + int* restrict B2_pos = (int*)(B->indices[1][0]); + int* restrict B2_crd = (int*)(B->indices[1][1]); + + A15322_pos = (int32_t*)malloc(sizeof(int32_t) * (A15321_dimension + 1)); + A15322_pos[0] = 0; + for (int32_t pA15322 = 1; pA15322 < (A15321_dimension + 1); pA15322++) { + A15322_pos[pA15322] = 0; + } + int32_t A15322_crd_size = 1048576; + A15322_crd = (int32_t*)malloc(sizeof(int32_t) * A15322_crd_size); + int32_t i1543A1532 = 0; + + for (int32_t i1547 = 0; i1547 < ((B1_dimension + 15) / 16); i1547++) { + for (int32_t i1548 = 0; i1548 < 16; i1548++) { + int32_t i1542 = i1547 * 16 + 
i1548; + if (i1542 >= B1_dimension) + continue; + + int32_t pA15322_begin = i1543A1532; + + for (int32_t i1543B = B2_pos[i1542]; i1543B < B2_pos[(i1542 + 1)]; i1543B++) { + int32_t i1543 = B2_crd[i1543B]; + if (A15322_crd_size <= i1543A1532) { + A15322_crd = (int32_t*)realloc(A15322_crd, sizeof(int32_t) * (A15322_crd_size * 2)); + A15322_crd_size *= 2; + } + A15322_crd[i1543A1532] = i1543; + i1543A1532++; + } + + A15322_pos[i1542 + 1] = i1543A1532 - pA15322_begin; + } + } + + int32_t csA15322 = 0; + for (int32_t pA153220 = 1; pA153220 < (A15321_dimension + 1); pA153220++) { + csA15322 += A15322_pos[pA153220]; + A15322_pos[pA153220] = csA15322; + } + + A1532_vals = (double*)malloc(sizeof(double) * (i1543A1532 * A15323_dimension)); + + A1532->indices[1][0] = (uint8_t*)(A15322_pos); + A1532->indices[1][1] = (uint8_t*)(A15322_crd); + A1532->vals = (uint8_t*)A1532_vals; + return 0; +} + +int compute(taco_tensor_t *A1532, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D) { + int A15321_dimension = (int)(A1532->dimensions[0]); + int A15323_dimension = (int)(A1532->dimensions[2]); + int* restrict A15322_pos = (int*)(A1532->indices[1][0]); + double* restrict A1532_vals = (double*)(A1532->vals); + int B1_dimension = (int)(B->dimensions[0]); + int* restrict B2_pos = (int*)(B->indices[1][0]); + int* restrict B2_crd = (int*)(B->indices[1][1]); + int* restrict B3_pos = (int*)(B->indices[2][0]); + int* restrict B3_crd = (int*)(B->indices[2][1]); + double* restrict B_vals = (double*)(B->vals); + int C1_dimension = (int)(C->dimensions[0]); + int C2_dimension = (int)(C->dimensions[1]); + double* restrict C_vals = (double*)(C->vals); + int D1_dimension = (int)(D->dimensions[0]); + int D2_dimension = (int)(D->dimensions[1]); + double* restrict D_vals = (double*)(D->vals); + +// int32_t i1543A1532 = 0; + + #pragma omp parallel for schedule(static) + for (int32_t pA1532 = 0; pA1532 < (A15322_pos[A15321_dimension] * A15323_dimension); pA1532++) { + A1532_vals[pA1532] = 0.0; + } + + 
double* restrict tA1532_all = 0; /* shared scratch: one D1_dimension-wide slice per OpenMP thread, freed after the parallel loop */ + tA1532_all = (double*)malloc(sizeof(double) * D1_dimension * omp_get_max_threads()); + + #pragma omp parallel for schedule(runtime) + for (int32_t i1547 = 0; i1547 < ((B1_dimension + 15) / 16); i1547++) { + for (int32_t i1548 = 0; i1548 < 16; i1548++) { + int32_t i1542 = i1547 * 16 + i1548; + if (i1542 >= B1_dimension) + continue; + + double* restrict tA1532 = 0; + tA1532 = &tA1532_all[D1_dimension*omp_get_thread_num()]; + // tA1532 = (double*)malloc(sizeof(double) * D1_dimension); + + for (int32_t i1543B = B2_pos[i1542]; i1543B < B2_pos[(i1542 + 1)]; i1543B++) { + for (int32_t ptA1532 = 0; ptA1532 < D1_dimension; ptA1532++) { + tA1532[ptA1532] = 0.0; + } + for (int32_t i1544B = B3_pos[i1543B]; i1544B < B3_pos[(i1543B + 1)]; i1544B++) { + int32_t i1544 = B3_crd[i1544B]; + for (int32_t i1545 = 0; i1545 < D1_dimension; i1545++) { + int32_t i1545C = i1544 * C2_dimension + i1545; + tA1532[i1545] = tA1532[i1545] + B_vals[i1544B] * C_vals[i1545C]; + } + } + for (int32_t i1545 = 0; i1545 < D1_dimension; i1545++) { + for (int32_t i1546 = 0; i1546 < D2_dimension; i1546++) { + int32_t i1546A1532 = i1543B * A15323_dimension + i1546; + int32_t i1546D = i1545 * D2_dimension + i1546; + A1532_vals[i1546A1532] = A1532_vals[i1546A1532] + tA1532[i1545] * D_vals[i1546D]; + } + } + // i1543A1532++; + } + + + } + + } + free(tA1532_all); + + A1532->indices[1][0] = (uint8_t*)(A15322_pos); + A1532->vals = (uint8_t*)A1532_vals; + return 0; +} +#include "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/ttm_ttm/fused.h" +int _shim_assemble(void** parameterPack) { + return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3])); +} +int _shim_compute(void** parameterPack) { + return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3])); +} diff --git
a/test/kernels/ttm_ttm/fused.c b/test/kernels/ttm_ttm/fused.c new file mode 100644 index 000000000..f490913cb --- /dev/null +++ b/test/kernels/ttm_ttm/fused.c @@ -0,0 +1,242 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while 
(upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +int assemble(taco_tensor_t *A1532, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D) { + int A15321_dimension = (int)(A1532->dimensions[0]); + int A15323_dimension = (int)(A1532->dimensions[2]); + int* restrict A15322_pos = (int*)(A1532->indices[1][0]); + int* restrict A15322_crd = (int*)(A1532->indices[1][1]); + double* restrict A1532_vals = (double*)(A1532->vals); + int B1_dimension = (int)(B->dimensions[0]); + int* restrict B2_pos = (int*)(B->indices[1][0]); + int* restrict B2_crd = (int*)(B->indices[1][1]); + + A15322_pos = (int32_t*)malloc(sizeof(int32_t) * 
(A15321_dimension + 1)); + A15322_pos[0] = 0; + for (int32_t pA15322 = 1; pA15322 < (A15321_dimension + 1); pA15322++) { + A15322_pos[pA15322] = 0; + } + int32_t A15322_crd_size = 1048576; + A15322_crd = (int32_t*)malloc(sizeof(int32_t) * A15322_crd_size); + int32_t i1543A1532 = 0; + + for (int32_t i1547 = 0; i1547 < ((B1_dimension + 15) / 16); i1547++) { + for (int32_t i1548 = 0; i1548 < 16; i1548++) { + int32_t i1542 = i1547 * 16 + i1548; + if (i1542 >= B1_dimension) + continue; + + int32_t pA15322_begin = i1543A1532; + + for (int32_t i1543B = B2_pos[i1542]; i1543B < B2_pos[(i1542 + 1)]; i1543B++) { + int32_t i1543 = B2_crd[i1543B]; + if (A15322_crd_size <= i1543A1532) { + A15322_crd = (int32_t*)realloc(A15322_crd, sizeof(int32_t) * (A15322_crd_size * 2)); + A15322_crd_size *= 2; + } + A15322_crd[i1543A1532] = i1543; + i1543A1532++; + } + + A15322_pos[i1542 + 1] = i1543A1532 - pA15322_begin; + } + } + + int32_t csA15322 = 0; + for (int32_t pA153220 = 1; pA153220 < (A15321_dimension + 1); pA153220++) { + csA15322 += A15322_pos[pA153220]; + A15322_pos[pA153220] = csA15322; + } + + A1532_vals = (double*)malloc(sizeof(double) * (i1543A1532 * A15323_dimension)); + + A1532->indices[1][0] = (uint8_t*)(A15322_pos); + A1532->indices[1][1] = (uint8_t*)(A15322_crd); + A1532->vals = (uint8_t*)A1532_vals; + return 0; +} + +int compute(taco_tensor_t *A1532, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D) { + int A15321_dimension = (int)(A1532->dimensions[0]); + int A15323_dimension = (int)(A1532->dimensions[2]); + int* restrict A15322_pos = (int*)(A1532->indices[1][0]); + double* restrict A1532_vals = (double*)(A1532->vals); + int B1_dimension = (int)(B->dimensions[0]); + int* restrict B2_pos = (int*)(B->indices[1][0]); + int* restrict B2_crd = (int*)(B->indices[1][1]); + int* restrict B3_pos = (int*)(B->indices[2][0]); + int* restrict B3_crd = (int*)(B->indices[2][1]); + double* restrict B_vals = (double*)(B->vals); + int C1_dimension = (int)(C->dimensions[0]); + int 
C2_dimension = (int)(C->dimensions[1]); + double* restrict C_vals = (double*)(C->vals); + int D1_dimension = (int)(D->dimensions[0]); + int D2_dimension = (int)(D->dimensions[1]); + double* restrict D_vals = (double*)(D->vals); + +// int32_t i1543A1532 = 0; + + #pragma omp parallel for schedule(static) + for (int32_t pA1532 = 0; pA1532 < (A15322_pos[A15321_dimension] * A15323_dimension); pA1532++) { + A1532_vals[pA1532] = 0.0; + } + + #pragma omp parallel for schedule(runtime) + for (int32_t i1547 = 0; i1547 < ((B1_dimension + 15) / 16); i1547++) { + for (int32_t i1548 = 0; i1548 < 16; i1548++) { + int32_t i1542 = i1547 * 16 + i1548; + if (i1542 >= B1_dimension) + continue; + + double* restrict tA1532 = 0; + tA1532 = (double*)malloc(sizeof(double) * D1_dimension); + + for (int32_t i1543B = B2_pos[i1542]; i1543B < B2_pos[(i1542 + 1)]; i1543B++) { + for (int32_t ptA1532 = 0; ptA1532 < D1_dimension; ptA1532++) { + tA1532[ptA1532] = 0.0; + } + for (int32_t i1544B = B3_pos[i1543B]; i1544B < B3_pos[(i1543B + 1)]; i1544B++) { + int32_t i1544 = B3_crd[i1544B]; + for (int32_t i1545 = 0; i1545 < D1_dimension; i1545++) { + int32_t i1545C = i1544 * C2_dimension + i1545; + tA1532[i1545] = tA1532[i1545] + B_vals[i1544B] * C_vals[i1545C]; + } + } + for (int32_t i1545 = 0; i1545 < D1_dimension; i1545++) { + for (int32_t i1546 = 0; i1546 < D2_dimension; i1546++) { + int32_t i1546A1532 = i1543B * A15323_dimension + i1546; + int32_t i1546D = i1545 * D2_dimension + i1546; + A1532_vals[i1546A1532] = A1532_vals[i1546A1532] + tA1532[i1545] * D_vals[i1546D]; + } + } + // i1543A1532++; + } + + free(tA1532); + } + } + + A1532->indices[1][0] = (uint8_t*)(A15322_pos); + A1532->vals = (uint8_t*)A1532_vals; + return 0; +} +#include "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/ttm_ttm/fused.h" +int _shim_assemble(void** parameterPack) { + return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), 
(taco_tensor_t*)(parameterPack[3])); +} +int _shim_compute(void** parameterPack) { + return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3])); +} diff --git a/test/kernels/ttm_ttm/fused.h b/test/kernels/ttm_ttm/fused.h new file mode 100644 index 000000000..d613c8f07 --- /dev/null +++ b/test/kernels/ttm_ttm/fused.h @@ -0,0 +1,125 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; 
+} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +#ifndef TACO_GENERATED_assemble +#define TACO_GENERATED_assemble +int assemble(taco_tensor_t *A1532, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D); +#endif + +#ifndef TACO_GENERATED_compute +#define TACO_GENERATED_compute +int compute(taco_tensor_t *A1532, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D); +#endif diff 
--git a/test/kernels/ttm_ttm/fused.so b/test/kernels/ttm_ttm/fused.so new file mode 100755 index 000000000..69c65a1dc Binary files /dev/null and b/test/kernels/ttm_ttm/fused.so differ diff --git a/test/kernels/ttm_ttm/gemm.c b/test/kernels/ttm_ttm/gemm.c new file mode 100644 index 000000000..ee2b24e99 --- /dev/null +++ b/test/kernels/ttm_ttm/gemm.c @@ -0,0 +1,181 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int 
arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +int assemble(taco_tensor_t *A2886, taco_tensor_t *C, taco_tensor_t *D) { + int A28861_dimension = (int)(A2886->dimensions[0]); + int A28862_dimension = (int)(A2886->dimensions[1]); + double* restrict A2886_vals = (double*)(A2886->vals); + + A2886_vals = (double*)malloc(sizeof(double) * (A28861_dimension * A28862_dimension)); + + A2886->vals = (uint8_t*)A2886_vals; + return 0; +} + +int 
compute(taco_tensor_t *A2886, taco_tensor_t *C, taco_tensor_t *D) { + int A28861_dimension = (int)(A2886->dimensions[0]); + int A28862_dimension = (int)(A2886->dimensions[1]); + double* restrict A2886_vals = (double*)(A2886->vals); + int C1_dimension = (int)(C->dimensions[0]); + int C2_dimension = (int)(C->dimensions[1]); + double* restrict C_vals = (double*)(C->vals); + int D1_dimension = (int)(D->dimensions[0]); + int D2_dimension = (int)(D->dimensions[1]); + double* restrict D_vals = (double*)(D->vals); + + #pragma omp parallel for schedule(static) + for (int32_t pA2886 = 0; pA2886 < (A28861_dimension * A28862_dimension); pA2886++) { + A2886_vals[pA2886] = 0.0; + } + + #pragma omp parallel for schedule(runtime) + for (int32_t i1551 = 0; i1551 < ((C1_dimension + 31) / 32); i1551++) { + for (int32_t i1553 = 0; i1553 < ((D1_dimension + 31) / 32); i1553++) { + for (int32_t i1555 = 0; i1555 < ((D2_dimension + 31) / 32); i1555++) { + for (int32_t i1552 = 0; i1552 < 32; i1552++) { + int32_t i1544 = i1551 * 32 + i1552; + if (i1544 >= C1_dimension) + continue; + + for (int32_t i1554 = 0; i1554 < 32; i1554++) { + int32_t i1545 = i1553 * 32 + i1554; + int32_t i1545C = i1544 * C2_dimension + i1545; + if (i1545 >= D1_dimension) + continue; + + for (int32_t i1556 = 0; i1556 < 32; i1556++) { + int32_t i1546 = i1555 * 32 + i1556; + int32_t i1546D = i1545 * D2_dimension + i1546; + int32_t i1546A2886 = i1544 * A28862_dimension + i1546; + if (i1546 >= D2_dimension) + continue; + + A2886_vals[i1546A2886] = A2886_vals[i1546A2886] + C_vals[i1545C] * D_vals[i1546D]; + } + } + } + } + } + } + return 0; +} +#include "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/ttm_ttm/gemm.h" +int _shim_assemble(void** parameterPack) { + return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2])); +} +int _shim_compute(void** parameterPack) { + return compute((taco_tensor_t*)(parameterPack[0]), 
(taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2])); +} diff --git a/test/kernels/ttm_ttm/gemm.h b/test/kernels/ttm_ttm/gemm.h new file mode 100644 index 000000000..20cd2db53 --- /dev/null +++ b/test/kernels/ttm_ttm/gemm.h @@ -0,0 +1,125 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = 
arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +#ifndef TACO_GENERATED_assemble +#define TACO_GENERATED_assemble +int assemble(taco_tensor_t *A2886, taco_tensor_t *C, taco_tensor_t *D); +#endif + +#ifndef TACO_GENERATED_compute +#define TACO_GENERATED_compute +int compute(taco_tensor_t *A2886, taco_tensor_t *C, taco_tensor_t *D); +#endif diff --git a/test/kernels/ttm_ttm/ttm1_1.c b/test/kernels/ttm_ttm/ttm1_1.c new file mode 100644 index 000000000..e016491a2 --- /dev/null +++ b/test/kernels/ttm_ttm/ttm1_1.c @@ -0,0 +1,219 @@ +#ifndef 
TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + 
upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +int assemble(taco_tensor_t *A2398, taco_tensor_t *B, taco_tensor_t *C) { + int A23981_dimension = (int)(A2398->dimensions[0]); + int A23983_dimension = (int)(A2398->dimensions[2]); + int* restrict A23982_pos = (int*)(A2398->indices[1][0]); + int* restrict A23982_crd = (int*)(A2398->indices[1][1]); + double* restrict A2398_vals = (double*)(A2398->vals); + int B1_dimension = (int)(B->dimensions[0]); + int* restrict B2_pos = (int*)(B->indices[1][0]); + int* restrict B2_crd = (int*)(B->indices[1][1]); + + A23982_pos = (int32_t*)malloc(sizeof(int32_t) * (A23981_dimension + 1)); + A23982_pos[0] = 0; + for (int32_t pA23982 = 1; pA23982 < (A23981_dimension + 1); pA23982++) { + A23982_pos[pA23982] = 0; + } + int32_t A23982_crd_size = 1048576; + A23982_crd = 
(int32_t*)malloc(sizeof(int32_t) * A23982_crd_size); + int32_t i1543A2398 = 0; + + for (int32_t i1547 = 0; i1547 < ((B1_dimension + 15) / 16); i1547++) { + for (int32_t i1548 = 0; i1548 < 16; i1548++) { + int32_t i1542 = i1547 * 16 + i1548; + if (i1542 >= B1_dimension) + continue; + + int32_t pA23982_begin = i1543A2398; + + for (int32_t i1543B = B2_pos[i1542]; i1543B < B2_pos[(i1542 + 1)]; i1543B++) { + int32_t i1543 = B2_crd[i1543B]; + if (A23982_crd_size <= i1543A2398) { + A23982_crd = (int32_t*)realloc(A23982_crd, sizeof(int32_t) * (A23982_crd_size * 2)); + A23982_crd_size *= 2; + } + A23982_crd[i1543A2398] = i1543; + i1543A2398++; + } + + A23982_pos[i1542 + 1] = i1543A2398 - pA23982_begin; + } + } + + int32_t csA23982 = 0; + for (int32_t pA239820 = 1; pA239820 < (A23981_dimension + 1); pA239820++) { + csA23982 += A23982_pos[pA239820]; + A23982_pos[pA239820] = csA23982; + } + + A2398_vals = (double*)malloc(sizeof(double) * (i1543A2398 * A23983_dimension)); + + A2398->indices[1][0] = (uint8_t*)(A23982_pos); + A2398->indices[1][1] = (uint8_t*)(A23982_crd); + A2398->vals = (uint8_t*)A2398_vals; + return 0; +} + +int compute(taco_tensor_t *A2398, taco_tensor_t *B, taco_tensor_t *C) { + int A23981_dimension = (int)(A2398->dimensions[0]); + int A23983_dimension = (int)(A2398->dimensions[2]); + double* restrict A2398_vals = (double*)(A2398->vals); + int B1_dimension = (int)(B->dimensions[0]); + int* restrict B2_pos = (int*)(B->indices[1][0]); + int* restrict B2_crd = (int*)(B->indices[1][1]); + int* restrict B3_pos = (int*)(B->indices[2][0]); + int* restrict B3_crd = (int*)(B->indices[2][1]); + double* restrict B_vals = (double*)(B->vals); + int C1_dimension = (int)(C->dimensions[0]); + int C2_dimension = (int)(C->dimensions[1]); + double* restrict C_vals = (double*)(C->vals); + + // int32_t i1543A2398 = 0; + + #pragma omp parallel for schedule(runtime) + for (int32_t i1547 = 0; i1547 < ((B1_dimension + 15) / 16); i1547++) { + for (int32_t i1548 = 0; i1548 < 16; 
i1548++) { + int32_t i1542 = i1547 * 16 + i1548; + if (i1542 >= B1_dimension) + continue; + + for (int32_t i1543B = B2_pos[i1542]; i1543B < B2_pos[(i1542 + 1)]; i1543B++) { + for (int32_t i1545 = 0; i1545 < C2_dimension; i1545++) { + // int32_t i1545A2398 = i1543A2398 * A23983_dimension + i1545; + int32_t i1545A2398 = i1543B * A23983_dimension + i1545; + double ti1544A2398_val = 0.0; + for (int32_t i1544B = B3_pos[i1543B]; i1544B < B3_pos[(i1543B + 1)]; i1544B++) { + int32_t i1544 = B3_crd[i1544B]; + int32_t i1545C = i1544 * C2_dimension + i1545; + ti1544A2398_val += B_vals[i1544B] * C_vals[i1545C]; + } + A2398_vals[i1545A2398] = ti1544A2398_val; + } + // i1543A2398++; + } + } + } + return 0; +} +#include "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/ttm_ttm/ttm1_1.h" +int _shim_assemble(void** parameterPack) { + return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2])); +} +int _shim_compute(void** parameterPack) { + return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2])); +} diff --git a/test/kernels/ttm_ttm/ttm1_1.h b/test/kernels/ttm_ttm/ttm1_1.h new file mode 100644 index 000000000..4c631f227 --- /dev/null +++ b/test/kernels/ttm_ttm/ttm1_1.h @@ -0,0 +1,125 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? 
(_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = 
(taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +#ifndef TACO_GENERATED_assemble +#define TACO_GENERATED_assemble +int assemble(taco_tensor_t *A2398, taco_tensor_t *B, taco_tensor_t *C); +#endif + +#ifndef TACO_GENERATED_compute +#define TACO_GENERATED_compute +int compute(taco_tensor_t *A2398, taco_tensor_t *B, taco_tensor_t *C); +#endif diff --git a/test/kernels/ttm_ttm/ttm1_1.so b/test/kernels/ttm_ttm/ttm1_1.so new file mode 100755 index 000000000..911c44fa1 Binary files /dev/null and b/test/kernels/ttm_ttm/ttm1_1.so differ diff --git a/test/kernels/ttm_ttm/ttm1_2.c b/test/kernels/ttm_ttm/ttm1_2.c new file mode 100644 index 000000000..b04e23a54 --- /dev/null +++ b/test/kernels/ttm_ttm/ttm1_2.c @@ -0,0 +1,219 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? 
(_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = 
(taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +int assemble(taco_tensor_t *A3056, taco_tensor_t *B, taco_tensor_t *A2886) { + int A30561_dimension = (int)(A3056->dimensions[0]); + int A30563_dimension = (int)(A3056->dimensions[2]); + int* restrict A30562_pos = (int*)(A3056->indices[1][0]); + int* restrict A30562_crd = (int*)(A3056->indices[1][1]); + double* restrict A3056_vals = (double*)(A3056->vals); + int B1_dimension = (int)(B->dimensions[0]); + int* restrict B2_pos = (int*)(B->indices[1][0]); + int* restrict B2_crd = (int*)(B->indices[1][1]); + + A30562_pos = (int32_t*)malloc(sizeof(int32_t) * (A30561_dimension + 1)); + A30562_pos[0] = 0; + for (int32_t pA30562 = 1; pA30562 < (A30561_dimension + 1); pA30562++) { + A30562_pos[pA30562] = 0; + } + int32_t A30562_crd_size = 1048576; + A30562_crd = (int32_t*)malloc(sizeof(int32_t) * A30562_crd_size); + int32_t i1543A3056 = 0; + + for (int32_t i1547 = 0; i1547 < ((B1_dimension + 15) / 16); i1547++) { + for (int32_t i1548 = 0; i1548 < 16; i1548++) { + int32_t i1542 = i1547 * 16 + i1548; + if (i1542 
>= B1_dimension) + continue; + + int32_t pA30562_begin = i1543A3056; + + for (int32_t i1543B = B2_pos[i1542]; i1543B < B2_pos[(i1542 + 1)]; i1543B++) { + int32_t i1543 = B2_crd[i1543B]; + if (A30562_crd_size <= i1543A3056) { + A30562_crd = (int32_t*)realloc(A30562_crd, sizeof(int32_t) * (A30562_crd_size * 2)); + A30562_crd_size *= 2; + } + A30562_crd[i1543A3056] = i1543; + i1543A3056++; + } + + A30562_pos[i1542 + 1] = i1543A3056 - pA30562_begin; + } + } + + int32_t csA30562 = 0; + for (int32_t pA305620 = 1; pA305620 < (A30561_dimension + 1); pA305620++) { + csA30562 += A30562_pos[pA305620]; + A30562_pos[pA305620] = csA30562; + } + + A3056_vals = (double*)malloc(sizeof(double) * (i1543A3056 * A30563_dimension)); + + A3056->indices[1][0] = (uint8_t*)(A30562_pos); + A3056->indices[1][1] = (uint8_t*)(A30562_crd); + A3056->vals = (uint8_t*)A3056_vals; + return 0; +} + +int compute(taco_tensor_t *A3056, taco_tensor_t *B, taco_tensor_t *A2886) { + int A30561_dimension = (int)(A3056->dimensions[0]); + int A30563_dimension = (int)(A3056->dimensions[2]); + double* restrict A3056_vals = (double*)(A3056->vals); + int B1_dimension = (int)(B->dimensions[0]); + int* restrict B2_pos = (int*)(B->indices[1][0]); + int* restrict B2_crd = (int*)(B->indices[1][1]); + int* restrict B3_pos = (int*)(B->indices[2][0]); + int* restrict B3_crd = (int*)(B->indices[2][1]); + double* restrict B_vals = (double*)(B->vals); + int A28861_dimension = (int)(A2886->dimensions[0]); + int A28862_dimension = (int)(A2886->dimensions[1]); + double* restrict A2886_vals = (double*)(A2886->vals); + + // int32_t i1543A3056 = 0; + + #pragma omp parallel for schedule(runtime) + for (int32_t i1547 = 0; i1547 < ((B1_dimension + 15) / 16); i1547++) { + for (int32_t i1548 = 0; i1548 < 16; i1548++) { + int32_t i1542 = i1547 * 16 + i1548; + if (i1542 >= B1_dimension) + continue; + + for (int32_t i1543B = B2_pos[i1542]; i1543B < B2_pos[(i1542 + 1)]; i1543B++) { + for (int32_t i1546 = 0; i1546 < A28862_dimension; 
i1546++) { + // int32_t i1546A3056 = i1543A3056 * A30563_dimension + i1546; + int32_t i1546A3056 = i1543B * A30563_dimension + i1546; + double ti1544A3056_val = 0.0; + for (int32_t i1544B = B3_pos[i1543B]; i1544B < B3_pos[(i1543B + 1)]; i1544B++) { + int32_t i1544 = B3_crd[i1544B]; + int32_t i1546A2886 = i1544 * A28862_dimension + i1546; + ti1544A3056_val += B_vals[i1544B] * A2886_vals[i1546A2886]; + } + A3056_vals[i1546A3056] = ti1544A3056_val; + } + // i1543A3056++; + } + } + } + return 0; +} +#include "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/ttm_ttm/ttm1_2.h" +int _shim_assemble(void** parameterPack) { + return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2])); +} +int _shim_compute(void** parameterPack) { + return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2])); +} diff --git a/test/kernels/ttm_ttm/ttm1_2.h b/test/kernels/ttm_ttm/ttm1_2.h new file mode 100644 index 000000000..86ebdb633 --- /dev/null +++ b/test/kernels/ttm_ttm/ttm1_2.h @@ -0,0 +1,125 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? 
(_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = 
(taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +#ifndef TACO_GENERATED_assemble +#define TACO_GENERATED_assemble +int assemble(taco_tensor_t *A3056, taco_tensor_t *B, taco_tensor_t *A2886); +#endif + +#ifndef TACO_GENERATED_compute +#define TACO_GENERATED_compute +int compute(taco_tensor_t *A3056, taco_tensor_t *B, taco_tensor_t *A2886); +#endif diff --git a/test/kernels/ttm_ttm/ttm1_2.so b/test/kernels/ttm_ttm/ttm1_2.so new file mode 100755 index 000000000..c698ec991 Binary files /dev/null and b/test/kernels/ttm_ttm/ttm1_2.so differ diff --git a/test/kernels/ttm_ttm/ttm2.c b/test/kernels/ttm_ttm/ttm2.c new file mode 100644 index 000000000..e98f44e35 --- /dev/null +++ b/test/kernels/ttm_ttm/ttm2.c @@ -0,0 +1,218 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? 
(_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = 
(taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +int assemble(taco_tensor_t *A2593, taco_tensor_t *A2398, taco_tensor_t *D) { + int A25931_dimension = (int)(A2593->dimensions[0]); + int A25933_dimension = (int)(A2593->dimensions[2]); + int* restrict A25932_pos = (int*)(A2593->indices[1][0]); + int* restrict A25932_crd = (int*)(A2593->indices[1][1]); + double* restrict A2593_vals = (double*)(A2593->vals); + int A23981_dimension = (int)(A2398->dimensions[0]); + int* restrict A23982_pos = (int*)(A2398->indices[1][0]); + int* restrict A23982_crd = (int*)(A2398->indices[1][1]); + + A25932_pos = (int32_t*)malloc(sizeof(int32_t) * (A25931_dimension + 1)); + A25932_pos[0] = 0; + for (int32_t pA25932 = 1; pA25932 < (A25931_dimension + 1); pA25932++) { + A25932_pos[pA25932] = 0; + } + int32_t A25932_crd_size = 1048576; + A25932_crd = (int32_t*)malloc(sizeof(int32_t) * A25932_crd_size); + int32_t i1543A2593 = 0; + + for (int32_t i1547 = 0; i1547 < ((A23981_dimension + 15) / 16); i1547++) { + for (int32_t i1548 = 0; i1548 < 16; i1548++) { + int32_t i1542 = 
i1547 * 16 + i1548; + if (i1542 >= A23981_dimension) + continue; + + int32_t pA25932_begin = i1543A2593; + + for (int32_t i1543A2398 = A23982_pos[i1542]; i1543A2398 < A23982_pos[(i1542 + 1)]; i1543A2398++) { + int32_t i1543 = A23982_crd[i1543A2398]; + if (A25932_crd_size <= i1543A2593) { + A25932_crd = (int32_t*)realloc(A25932_crd, sizeof(int32_t) * (A25932_crd_size * 2)); + A25932_crd_size *= 2; + } + A25932_crd[i1543A2593] = i1543; + i1543A2593++; + } + + A25932_pos[i1542 + 1] = i1543A2593 - pA25932_begin; + } + } + + int32_t csA25932 = 0; + for (int32_t pA259320 = 1; pA259320 < (A25931_dimension + 1); pA259320++) { + csA25932 += A25932_pos[pA259320]; + A25932_pos[pA259320] = csA25932; + } + + A2593_vals = (double*)malloc(sizeof(double) * (i1543A2593 * A25933_dimension)); + + A2593->indices[1][0] = (uint8_t*)(A25932_pos); + A2593->indices[1][1] = (uint8_t*)(A25932_crd); + A2593->vals = (uint8_t*)A2593_vals; + return 0; +} + +int compute(taco_tensor_t *A2593, taco_tensor_t *A2398, taco_tensor_t *D) { + int A25931_dimension = (int)(A2593->dimensions[0]); + int A25933_dimension = (int)(A2593->dimensions[2]); + double* restrict A2593_vals = (double*)(A2593->vals); + int A23981_dimension = (int)(A2398->dimensions[0]); + int A23983_dimension = (int)(A2398->dimensions[2]); + int* restrict A23982_pos = (int*)(A2398->indices[1][0]); + int* restrict A23982_crd = (int*)(A2398->indices[1][1]); + double* restrict A2398_vals = (double*)(A2398->vals); + int D1_dimension = (int)(D->dimensions[0]); + int D2_dimension = (int)(D->dimensions[1]); + double* restrict D_vals = (double*)(D->vals); + +// int32_t i1543A2593 = 0; + + #pragma omp parallel for schedule(runtime) + for (int32_t i1547 = 0; i1547 < ((A23981_dimension + 15) / 16); i1547++) { + for (int32_t i1548 = 0; i1548 < 16; i1548++) { + int32_t i1542 = i1547 * 16 + i1548; + if (i1542 >= A23981_dimension) + continue; + + for (int32_t i1543A2398 = A23982_pos[i1542]; i1543A2398 < A23982_pos[(i1542 + 1)]; i1543A2398++) { + for 
(int32_t i1546 = 0; i1546 < D2_dimension; i1546++) { + // int32_t i1546A2593 = i1543A2593 * A25933_dimension + i1546; + int32_t i1546A2593 = i1543A2398 * A25933_dimension + i1546; + double ti1545A2593_val = 0.0; + for (int32_t i1545 = 0; i1545 < D1_dimension; i1545++) { + int32_t i1545A2398 = i1543A2398 * A23983_dimension + i1545; + int32_t i1546D = i1545 * D2_dimension + i1546; + ti1545A2593_val += A2398_vals[i1545A2398] * D_vals[i1546D]; + } + A2593_vals[i1546A2593] = ti1545A2593_val; + } + // i1543A2593++; + } + } + } + return 0; +} +#include "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/ttm_ttm/ttm2.h" +int _shim_assemble(void** parameterPack) { + return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2])); +} +int _shim_compute(void** parameterPack) { + return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2])); +} diff --git a/test/kernels/ttm_ttm/ttm2.h b/test/kernels/ttm_ttm/ttm2.h new file mode 100644 index 000000000..40f1400d1 --- /dev/null +++ b/test/kernels/ttm_ttm/ttm2.h @@ -0,0 +1,125 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? 
(_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = 
(taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +#ifndef TACO_GENERATED_assemble +#define TACO_GENERATED_assemble +int assemble(taco_tensor_t *A2593, taco_tensor_t *A2398, taco_tensor_t *D); +#endif + +#ifndef TACO_GENERATED_compute +#define TACO_GENERATED_compute +int compute(taco_tensor_t *A2593, taco_tensor_t *A2398, taco_tensor_t *D); +#endif diff --git a/test/kernels/ttm_ttm/ttm2.so b/test/kernels/ttm_ttm/ttm2.so new file mode 100755 index 000000000..16a3d2542 Binary files /dev/null and b/test/kernels/ttm_ttm/ttm2.so differ diff --git a/test/kernels/ttm_ttm/ttm_original copy 2.c b/test/kernels/ttm_ttm/ttm_original copy 2.c new file mode 100644 index 000000000..cb21b209f --- /dev/null +++ b/test/kernels/ttm_ttm/ttm_original copy 2.c @@ -0,0 +1,242 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? 
(_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = 
(taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +int assemble(taco_tensor_t *A1537, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D) { + int A15371_dimension = (int)(A1537->dimensions[0]); + int A15373_dimension = (int)(A1537->dimensions[2]); + int* restrict A15372_pos = (int*)(A1537->indices[1][0]); + int* restrict A15372_crd = (int*)(A1537->indices[1][1]); + double* restrict A1537_vals = (double*)(A1537->vals); + int B1_dimension = (int)(B->dimensions[0]); + int* restrict B2_pos = (int*)(B->indices[1][0]); + int* restrict B2_crd = (int*)(B->indices[1][1]); + + A15372_pos = (int32_t*)malloc(sizeof(int32_t) * (A15371_dimension + 1)); + A15372_pos[0] = 0; + for (int32_t pA15372 = 1; pA15372 < (A15371_dimension + 1); pA15372++) { + A15372_pos[pA15372] = 0; + } + int32_t A15372_crd_size = 1048576; + A15372_crd = (int32_t*)malloc(sizeof(int32_t) * A15372_crd_size); + int32_t i1543A1537 = 0; + + for (int32_t i1547 = 0; i1547 < ((B1_dimension + 15) / 16); i1547++) { + for (int32_t i1548 = 0; i1548 < 16; i1548++) { + int32_t i1542 = i1547 * 16 + 
i1548; + if (i1542 >= B1_dimension) + continue; + + int32_t pA15372_begin = i1543A1537; + + for (int32_t i1543B = B2_pos[i1542]; i1543B < B2_pos[(i1542 + 1)]; i1543B++) { + int32_t i1543 = B2_crd[i1543B]; + if (A15372_crd_size <= i1543A1537) { + A15372_crd = (int32_t*)realloc(A15372_crd, sizeof(int32_t) * (A15372_crd_size * 2)); + A15372_crd_size *= 2; + } + A15372_crd[i1543A1537] = i1543; + i1543A1537++; + } + + A15372_pos[i1542 + 1] = i1543A1537 - pA15372_begin; + } + } + + int32_t csA15372 = 0; + for (int32_t pA153720 = 1; pA153720 < (A15371_dimension + 1); pA153720++) { + csA15372 += A15372_pos[pA153720]; + A15372_pos[pA153720] = csA15372; + } + + A1537_vals = (double*)malloc(sizeof(double) * (i1543A1537 * A15373_dimension)); + + A1537->indices[1][0] = (uint8_t*)(A15372_pos); + A1537->indices[1][1] = (uint8_t*)(A15372_crd); + A1537->vals = (uint8_t*)A1537_vals; + return 0; +} + +int compute(taco_tensor_t *A1537, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D) { + int A15371_dimension = (int)(A1537->dimensions[0]); + int A15373_dimension = (int)(A1537->dimensions[2]); + int* restrict A15372_pos = (int*)(A1537->indices[1][0]); + double* restrict A1537_vals = (double*)(A1537->vals); + int B1_dimension = (int)(B->dimensions[0]); + int* restrict B2_pos = (int*)(B->indices[1][0]); + int* restrict B2_crd = (int*)(B->indices[1][1]); + int* restrict B3_pos = (int*)(B->indices[2][0]); + int* restrict B3_crd = (int*)(B->indices[2][1]); + double* restrict B_vals = (double*)(B->vals); + int C1_dimension = (int)(C->dimensions[0]); + int C2_dimension = (int)(C->dimensions[1]); + double* restrict C_vals = (double*)(C->vals); + int D1_dimension = (int)(D->dimensions[0]); + int D2_dimension = (int)(D->dimensions[1]); + double* restrict D_vals = (double*)(D->vals); + + // int32_t i1543A1537 = 0; + + #pragma omp parallel for schedule(static) + for (int32_t pA1537 = 0; pA1537 < (A15372_pos[A15371_dimension] * A15373_dimension); pA1537++) { + A1537_vals[pA1537] = 0.0; + } + + 
#pragma omp parallel for schedule(runtime) + for (int32_t i1547 = 0; i1547 < ((B1_dimension + 15) / 16); i1547++) { + for (int32_t i1548 = 0; i1548 < 16; i1548++) { + int32_t i1542 = i1547 * 16 + i1548; + if (i1542 >= B1_dimension) + continue; + + for (int32_t i1543B = B2_pos[i1542]; i1543B < B2_pos[(i1542 + 1)]; i1543B++) { + for (int32_t i1544B = B3_pos[i1543B]; i1544B < B3_pos[(i1543B + 1)]; i1544B++) { + int32_t i1544 = B3_crd[i1544B]; + for (int32_t i1553 = 0; i1553 < ((D1_dimension + 31) / 32); i1553++) { + for (int32_t i1555 = 0; i1555 < ((D2_dimension + 31) / 32); i1555++) { + for (int32_t i1554 = 0; i1554 < 32; i1554++) { + int32_t i1545 = i1553 * 32 + i1554; + int32_t i1545C = i1544 * C2_dimension + i1545; + if (i1545 >= D1_dimension) + continue; + + for (int32_t i1556 = 0; i1556 < 32; i1556++) { + int32_t i1546 = i1555 * 32 + i1556; + // int32_t i1546A1537 = i1543A1537 * A15373_dimension + i1546; + int32_t i1546A1537 = i1544B * A15373_dimension + i1546; + int32_t i1546D = i1545 * D2_dimension + i1546; + if (i1546 >= D2_dimension) + continue; + + A1537_vals[i1546A1537] = A1537_vals[i1546A1537] + (B_vals[i1544B] * C_vals[i1545C]) * D_vals[i1546D]; + } + } + } + } + } + + // i1543A1537++; + } + } + } + return 0; +} +#include "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/ttm_ttm/ttm_original.h" +int _shim_assemble(void** parameterPack) { + return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3])); +} +int _shim_compute(void** parameterPack) { + return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3])); +} diff --git a/test/kernels/ttm_ttm/ttm_original copy.c b/test/kernels/ttm_ttm/ttm_original copy.c new file mode 100644 index 000000000..2db396c0a --- /dev/null +++ b/test/kernels/ttm_ttm/ttm_original copy.c @@ -0,0 +1,225 @@ +#ifndef 
TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + 
upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +int assemble(taco_tensor_t *A1537, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D) { + int A15371_dimension = (int)(A1537->dimensions[0]); + int A15373_dimension = (int)(A1537->dimensions[2]); + int* restrict A15372_pos = (int*)(A1537->indices[1][0]); + int* restrict A15372_crd = (int*)(A1537->indices[1][1]); + double* restrict A1537_vals = (double*)(A1537->vals); + int B1_dimension = (int)(B->dimensions[0]); + int* restrict B2_pos = (int*)(B->indices[1][0]); + int* restrict B2_crd = (int*)(B->indices[1][1]); + + A15372_pos = (int32_t*)malloc(sizeof(int32_t) * (A15371_dimension + 1)); + A15372_pos[0] = 0; + for (int32_t pA15372 = 1; pA15372 < (A15371_dimension + 1); pA15372++) { + A15372_pos[pA15372] = 0; + } + int32_t A15372_crd_size = 1048576; + A15372_crd = 
(int32_t*)malloc(sizeof(int32_t) * A15372_crd_size); + int32_t i1543A1537 = 0; + + for (int32_t i1547 = 0; i1547 < ((B1_dimension + 15) / 16); i1547++) { + for (int32_t i1548 = 0; i1548 < 16; i1548++) { + int32_t i1542 = i1547 * 16 + i1548; + if (i1542 >= B1_dimension) + continue; + + int32_t pA15372_begin = i1543A1537; + + for (int32_t i1543B = B2_pos[i1542]; i1543B < B2_pos[(i1542 + 1)]; i1543B++) { + int32_t i1543 = B2_crd[i1543B]; + if (A15372_crd_size <= i1543A1537) { + A15372_crd = (int32_t*)realloc(A15372_crd, sizeof(int32_t) * (A15372_crd_size * 2)); + A15372_crd_size *= 2; + } + A15372_crd[i1543A1537] = i1543; + i1543A1537++; + } + + A15372_pos[i1542 + 1] = i1543A1537 - pA15372_begin; + } + } + + int32_t csA15372 = 0; + for (int32_t pA153720 = 1; pA153720 < (A15371_dimension + 1); pA153720++) { + csA15372 += A15372_pos[pA153720]; + A15372_pos[pA153720] = csA15372; + } + + A1537_vals = (double*)malloc(sizeof(double) * (i1543A1537 * A15373_dimension)); + + A1537->indices[1][0] = (uint8_t*)(A15372_pos); + A1537->indices[1][1] = (uint8_t*)(A15372_crd); + A1537->vals = (uint8_t*)A1537_vals; + return 0; +} + +int compute(taco_tensor_t *A1537, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D) { + int A15371_dimension = (int)(A1537->dimensions[0]); + int A15373_dimension = (int)(A1537->dimensions[2]); + double* restrict A1537_vals = (double*)(A1537->vals); + int B1_dimension = (int)(B->dimensions[0]); + int* restrict B2_pos = (int*)(B->indices[1][0]); + int* restrict B2_crd = (int*)(B->indices[1][1]); + int* restrict B3_pos = (int*)(B->indices[2][0]); + int* restrict B3_crd = (int*)(B->indices[2][1]); + double* restrict B_vals = (double*)(B->vals); + int C1_dimension = (int)(C->dimensions[0]); + int C2_dimension = (int)(C->dimensions[1]); + double* restrict C_vals = (double*)(C->vals); + int D1_dimension = (int)(D->dimensions[0]); + int D2_dimension = (int)(D->dimensions[1]); + double* restrict D_vals = (double*)(D->vals); + + // int32_t i1543A1537 = 0; + + 
#pragma omp parallel for schedule(runtime) + for (int32_t i1547 = 0; i1547 < ((B1_dimension + 15) / 16); i1547++) { + for (int32_t i1548 = 0; i1548 < 16; i1548++) { + int32_t i1542 = i1547 * 16 + i1548; + if (i1542 >= B1_dimension) + continue; + + for (int32_t i1543B = B2_pos[i1542]; i1543B < B2_pos[(i1542 + 1)]; i1543B++) { + for (int32_t i1546 = 0; i1546 < D2_dimension; i1546++) { + // int32_t i1546A1537 = i1543A1537 * A15373_dimension + i1546; + int32_t i1546A1537 = i1543B * A15373_dimension + i1546; + double ti1544A1537_val = 0.0; + for (int32_t i1544B = B3_pos[i1543B]; i1544B < B3_pos[(i1543B + 1)]; i1544B++) { + int32_t i1544 = B3_crd[i1544B]; + for (int32_t i1545 = 0; i1545 < D1_dimension; i1545++) { + int32_t i1545C = i1544 * C2_dimension + i1545; + int32_t i1546D = i1545 * D2_dimension + i1546; + ti1544A1537_val += (B_vals[i1544B] * C_vals[i1545C]) * D_vals[i1546D]; + } + } + A1537_vals[i1546A1537] = ti1544A1537_val; + } + // i1543A1537++; + } + } + } + return 0; +} +#include "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/ttm_ttm/ttm_original.h" +int _shim_assemble(void** parameterPack) { + return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3])); +} +int _shim_compute(void** parameterPack) { + return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3])); +} diff --git a/test/kernels/ttm_ttm/ttm_original.c b/test/kernels/ttm_ttm/ttm_original.c new file mode 100644 index 000000000..ac2674239 --- /dev/null +++ b/test/kernels/ttm_ttm/ttm_original.c @@ -0,0 +1,226 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? 
(_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = 
(taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +int assemble(taco_tensor_t *A1537, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D) { + int A15371_dimension = (int)(A1537->dimensions[0]); + int A15373_dimension = (int)(A1537->dimensions[2]); + int* restrict A15372_pos = (int*)(A1537->indices[1][0]); + int* restrict A15372_crd = (int*)(A1537->indices[1][1]); + double* restrict A1537_vals = (double*)(A1537->vals); + int B1_dimension = (int)(B->dimensions[0]); + int* restrict B2_pos = (int*)(B->indices[1][0]); + int* restrict B2_crd = (int*)(B->indices[1][1]); + + A15372_pos = (int32_t*)malloc(sizeof(int32_t) * (A15371_dimension + 1)); + A15372_pos[0] = 0; + for (int32_t pA15372 = 1; pA15372 < (A15371_dimension + 1); pA15372++) { + A15372_pos[pA15372] = 0; + } + int32_t A15372_crd_size = 1048576; + A15372_crd = (int32_t*)malloc(sizeof(int32_t) * A15372_crd_size); + int32_t i1543A1537 = 0; + + for (int32_t i1547 = 0; i1547 < ((B1_dimension + 15) / 16); i1547++) { + for (int32_t i1548 = 0; i1548 < 16; i1548++) { + int32_t i1542 = i1547 * 16 + 
i1548; + if (i1542 >= B1_dimension) + continue; + + int32_t pA15372_begin = i1543A1537; + + for (int32_t i1543B = B2_pos[i1542]; i1543B < B2_pos[(i1542 + 1)]; i1543B++) { + int32_t i1543 = B2_crd[i1543B]; + if (A15372_crd_size <= i1543A1537) { + A15372_crd = (int32_t*)realloc(A15372_crd, sizeof(int32_t) * (A15372_crd_size * 2)); + A15372_crd_size *= 2; + } + A15372_crd[i1543A1537] = i1543; + i1543A1537++; + } + + A15372_pos[i1542 + 1] = i1543A1537 - pA15372_begin; + } + } + + int32_t csA15372 = 0; + for (int32_t pA153720 = 1; pA153720 < (A15371_dimension + 1); pA153720++) { + csA15372 += A15372_pos[pA153720]; + A15372_pos[pA153720] = csA15372; + } + + A1537_vals = (double*)malloc(sizeof(double) * (i1543A1537 * A15373_dimension)); + + A1537->indices[1][0] = (uint8_t*)(A15372_pos); + A1537->indices[1][1] = (uint8_t*)(A15372_crd); + A1537->vals = (uint8_t*)A1537_vals; + return 0; +} + +int compute(taco_tensor_t *A1537, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D) { + int A15371_dimension = (int)(A1537->dimensions[0]); + int A15373_dimension = (int)(A1537->dimensions[2]); + double* restrict A1537_vals = (double*)(A1537->vals); + int B1_dimension = (int)(B->dimensions[0]); + int* restrict B2_pos = (int*)(B->indices[1][0]); + int* restrict B2_crd = (int*)(B->indices[1][1]); + int* restrict B3_pos = (int*)(B->indices[2][0]); + int* restrict B3_crd = (int*)(B->indices[2][1]); + double* restrict B_vals = (double*)(B->vals); + int C1_dimension = (int)(C->dimensions[0]); + int C2_dimension = (int)(C->dimensions[1]); + double* restrict C_vals = (double*)(C->vals); + int D1_dimension = (int)(D->dimensions[0]); + int D2_dimension = (int)(D->dimensions[1]); + double* restrict D_vals = (double*)(D->vals); + + // int32_t i1543A1537 = 0; + + #pragma omp parallel for schedule(runtime) + for (int32_t i1547 = 0; i1547 < ((B1_dimension + 15) / 16); i1547++) { + for (int32_t i1548 = 0; i1548 < 16; i1548++) { + int32_t i1542 = i1547 * 16 + i1548; + if (i1542 >= B1_dimension) + 
continue; + + for (int32_t i1543B = B2_pos[i1542]; i1543B < B2_pos[(i1542 + 1)]; i1543B++) { + for (int32_t i1546 = 0; i1546 < D2_dimension; i1546++) { + // int32_t i1546A1537 = i1543A1537 * A15373_dimension + i1546; + int32_t i1546A1537 = i1543B * A15373_dimension + i1546; + double ti1544A1537_val = 0.0; + for (int32_t i1544B = B3_pos[i1543B]; i1544B < B3_pos[(i1543B + 1)]; i1544B++) { + int32_t i1544 = B3_crd[i1544B]; + for (int32_t i1545 = 0; i1545 < D1_dimension; i1545++) { + int32_t i1545C = i1544 * C2_dimension + i1545; + int32_t i1546D = i1545 * D2_dimension + i1546; + ti1544A1537_val += (B_vals[i1544B] * C_vals[i1545C]) * D_vals[i1546D]; + } + } + A1537_vals[i1546A1537] = ti1544A1537_val; + } + // i1543A1537++; + } + } + } + return 0; +} + +#include "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/ttm_ttm/ttm_original.h" +int _shim_assemble(void** parameterPack) { + return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3])); +} +int _shim_compute(void** parameterPack) { + return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3])); +} diff --git a/test/kernels/ttm_ttm/ttm_original.h b/test/kernels/ttm_ttm/ttm_original.h new file mode 100644 index 000000000..a27841047 --- /dev/null +++ b/test/kernels/ttm_ttm/ttm_original.h @@ -0,0 +1,125 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? 
(_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = 
(taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +#ifndef TACO_GENERATED_assemble +#define TACO_GENERATED_assemble +int assemble(taco_tensor_t *A1537, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D); +#endif + +#ifndef TACO_GENERATED_compute +#define TACO_GENERATED_compute +int compute(taco_tensor_t *A1537, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D); +#endif diff --git a/test/kernels/ttm_ttm/ttm_original.so b/test/kernels/ttm_ttm/ttm_original.so new file mode 100755 index 000000000..fa04aed35 Binary files /dev/null and b/test/kernels/ttm_ttm/ttm_original.so differ diff --git a/test/kernels/ttm_ttm/ttm_original2.c b/test/kernels/ttm_ttm/ttm_original2.c new file mode 100644 index 000000000..8dd62d6dd --- /dev/null +++ b/test/kernels/ttm_ttm/ttm_original2.c @@ -0,0 +1,229 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? 
(_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = 
(taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +int assemble(taco_tensor_t *A1542, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D) { + int A15421_dimension = (int)(A1542->dimensions[0]); + int A15423_dimension = (int)(A1542->dimensions[2]); + int* restrict A15422_pos = (int*)(A1542->indices[1][0]); + int* restrict A15422_crd = (int*)(A1542->indices[1][1]); + double* restrict A1542_vals = (double*)(A1542->vals); + int B1_dimension = (int)(B->dimensions[0]); + int* restrict B2_pos = (int*)(B->indices[1][0]); + int* restrict B2_crd = (int*)(B->indices[1][1]); + + A15422_pos = (int32_t*)malloc(sizeof(int32_t) * (A15421_dimension + 1)); + A15422_pos[0] = 0; + for (int32_t pA15422 = 1; pA15422 < (A15421_dimension + 1); pA15422++) { + A15422_pos[pA15422] = 0; + } + int32_t A15422_crd_size = 1048576; + A15422_crd = (int32_t*)malloc(sizeof(int32_t) * A15422_crd_size); + int32_t i1548A1542 = 0; + + for (int32_t i1552 = 0; i1552 < ((B1_dimension + 15) / 16); i1552++) { + for (int32_t i1553 = 0; i1553 < 16; i1553++) { + int32_t i1547 = i1552 * 16 + 
i1553; + if (i1547 >= B1_dimension) + continue; + + int32_t pA15422_begin = i1548A1542; + + for (int32_t i1548B = B2_pos[i1547]; i1548B < B2_pos[(i1547 + 1)]; i1548B++) { + int32_t i1548 = B2_crd[i1548B]; + if (A15422_crd_size <= i1548A1542) { + A15422_crd = (int32_t*)realloc(A15422_crd, sizeof(int32_t) * (A15422_crd_size * 2)); + A15422_crd_size *= 2; + } + A15422_crd[i1548A1542] = i1548; + i1548A1542++; + } + + A15422_pos[i1547 + 1] = i1548A1542 - pA15422_begin; + } + } + + int32_t csA15422 = 0; + for (int32_t pA154220 = 1; pA154220 < (A15421_dimension + 1); pA154220++) { + csA15422 += A15422_pos[pA154220]; + A15422_pos[pA154220] = csA15422; + } + + A1542_vals = (double*)malloc(sizeof(double) * (i1548A1542 * A15423_dimension)); + + A1542->indices[1][0] = (uint8_t*)(A15422_pos); + A1542->indices[1][1] = (uint8_t*)(A15422_crd); + A1542->vals = (uint8_t*)A1542_vals; + return 0; +} + +int compute(taco_tensor_t *A1542, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D) { + int A15421_dimension = (int)(A1542->dimensions[0]); + int A15423_dimension = (int)(A1542->dimensions[2]); + int* restrict A15422_pos = (int*)(A1542->indices[1][0]); + double* restrict A1542_vals = (double*)(A1542->vals); + int B1_dimension = (int)(B->dimensions[0]); + int* restrict B2_pos = (int*)(B->indices[1][0]); + int* restrict B2_crd = (int*)(B->indices[1][1]); + int* restrict B3_pos = (int*)(B->indices[2][0]); + int* restrict B3_crd = (int*)(B->indices[2][1]); + double* restrict B_vals = (double*)(B->vals); + int C1_dimension = (int)(C->dimensions[0]); + int C2_dimension = (int)(C->dimensions[1]); + double* restrict C_vals = (double*)(C->vals); + int D1_dimension = (int)(D->dimensions[0]); + int D2_dimension = (int)(D->dimensions[1]); + double* restrict D_vals = (double*)(D->vals); + +// int32_t i1548A1542 = 0; + + #pragma omp parallel for schedule(static) + for (int32_t pA1542 = 0; pA1542 < (A15422_pos[A15421_dimension] * A15423_dimension); pA1542++) { + A1542_vals[pA1542] = 0.0; + } + + 
#pragma omp parallel for schedule(runtime) + for (int32_t i1552 = 0; i1552 < ((B1_dimension + 15) / 16); i1552++) { + for (int32_t i1553 = 0; i1553 < 16; i1553++) { + int32_t i1547 = i1552 * 16 + i1553; + if (i1547 >= B1_dimension) + continue; + + for (int32_t i1548B = B2_pos[i1547]; i1548B < B2_pos[(i1547 + 1)]; i1548B++) { + for (int32_t i1549B = B3_pos[i1548B]; i1549B < B3_pos[(i1548B + 1)]; i1549B++) { + int32_t i1549 = B3_crd[i1549B]; + for (int32_t i1550 = 0; i1550 < D1_dimension; i1550++) { + int32_t i1550C = i1549 * C2_dimension + i1550; + for (int32_t i1551 = 0; i1551 < D2_dimension; i1551++) { + // int32_t i1551A1542 = i1548A1542 * A15423_dimension + i1551; + int32_t i1551A1542 = i1548B * A15423_dimension + i1551; + int32_t i1551D = i1550 * D2_dimension + i1551; + A1542_vals[i1551A1542] = A1542_vals[i1551A1542] + (B_vals[i1549B] * C_vals[i1550C]) * D_vals[i1551D]; + } + } + } + // i1548A1542++; + } + } + } + return 0; +} +#include "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/ttm_ttm/ttm_original2.h" +int _shim_assemble(void** parameterPack) { + return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3])); +} +int _shim_compute(void** parameterPack) { + return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3])); +} diff --git a/test/kernels/ttm_ttm/ttm_original2.h b/test/kernels/ttm_ttm/ttm_original2.h new file mode 100644 index 000000000..8a08b4548 --- /dev/null +++ b/test/kernels/ttm_ttm/ttm_original2.h @@ -0,0 +1,125 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? 
(_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = 
(taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +#ifndef TACO_GENERATED_assemble +#define TACO_GENERATED_assemble +int assemble(taco_tensor_t *A1542, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D); +#endif + +#ifndef TACO_GENERATED_compute +#define TACO_GENERATED_compute +int compute(taco_tensor_t *A1542, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D); +#endif diff --git a/test/kernels/ttm_ttm/ttm_original2.so b/test/kernels/ttm_ttm/ttm_original2.so new file mode 100755 index 000000000..6466a2af2 Binary files /dev/null and b/test/kernels/ttm_ttm/ttm_original2.so differ diff --git a/test/stats/hadamard-gemm.txt b/test/stats/hadamard-gemm.txt new file mode 100644 index 000000000..6e730cf50 --- /dev/null +++ b/test/stats/hadamard-gemm.txt @@ -0,0 +1,921 @@ + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 +C1_dimension: 36417, C2_dimension: 64, vals: 
2330688 +D1_dimension: 36417, D2_dimension: 64, vals: 2330688 +E1_dimension: 64, E2_dimension: 64, vals: 4096 + + +kernel execution time: 22.4288 ms +fused time: 23.1383 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 +C1_dimension: 36417, C2_dimension: 64, vals: 2330688 +D1_dimension: 36417, D2_dimension: 64, vals: 2330688 +E1_dimension: 64, E2_dimension: 64, vals: 4096 + + +kernel execution time: 8.99985 ms +fused time: 9.71943 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 +C1_dimension: 36417, C2_dimension: 64, vals: 2330688 +D1_dimension: 36417, D2_dimension: 64, vals: 2330688 +E1_dimension: 64, E2_dimension: 64, vals: 4096 + + +kernel execution time: 8.65832 ms +fused time: 9.33544 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 +C1_dimension: 36417, C2_dimension: 64, vals: 2330688 +D1_dimension: 36417, D2_dimension: 64, vals: 2330688 +E1_dimension: 64, E2_dimension: 64, vals: 4096 + + +kernel execution time: 21.7432 ms +fused time: 22.466 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 206500, B2_dimension: 206500, vals: 1273389 +C1_dimension: 206500, C2_dimension: 64, vals: 13216000 +D1_dimension: 206500, D2_dimension: 64, vals: 13216000 +E1_dimension: 64, E2_dimension: 64, vals: 4096 + + +kernel execution time: 25.8057 ms +fused time: 
26.4891 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 206500, B2_dimension: 206500, vals: 1273389 +C1_dimension: 206500, C2_dimension: 64, vals: 13216000 +D1_dimension: 206500, D2_dimension: 64, vals: 13216000 +E1_dimension: 64, E2_dimension: 64, vals: 4096 + + +kernel execution time: 26.7972 ms +fused time: 27.2892 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 206500, B2_dimension: 206500, vals: 1273389 +C1_dimension: 206500, C2_dimension: 64, vals: 13216000 +D1_dimension: 206500, D2_dimension: 64, vals: 13216000 +E1_dimension: 64, E2_dimension: 64, vals: 4096 + + +kernel execution time: 46.4376 ms +fused time: 47.1315 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 206500, B2_dimension: 206500, vals: 1273389 +C1_dimension: 206500, C2_dimension: 64, vals: 13216000 +D1_dimension: 206500, D2_dimension: 64, vals: 13216000 +E1_dimension: 64, E2_dimension: 64, vals: 4096 + + +kernel execution time: 26.8781 ms +fused time: 27.4325 + +kernel execution time: 61.7475 ms +taco reference time: 62.3899 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 206500, B2_dimension: 206500, vals: 1273389 +C1_dimension: 206500, C2_dimension: 64, vals: 13216000 +D1_dimension: 206500, D2_dimension: 64, vals: 13216000 +E1_dimension: 64, E2_dimension: 64, vals: 4096 + + +kernel execution time: 25.4837 ms +fused time: 25.9563 + +kernel execution 
time: 15.5567 ms +sddmm time: 16.2101 + +kernel execution time: 73.7443 ms +taco reference time: 74.42 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 206500, B2_dimension: 206500, vals: 1273389 +C1_dimension: 206500, C2_dimension: 64, vals: 13216000 +D1_dimension: 206500, D2_dimension: 64, vals: 13216000 +E1_dimension: 64, E2_dimension: 64, vals: 4096 + + +kernel execution time: 24.5312 ms +fused time: 25.0641 + +kernel execution time: 14.7877 ms +hadamard time: 15.4539 + +kernel execution time: 18.149 ms +gemm time: 18.7191 + +kernel execution time: 73.8142 ms +taco reference time: 74.4567 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 206500, B2_dimension: 206500, vals: 1273389 +C1_dimension: 206500, C2_dimension: 128, vals: 26432000 +D1_dimension: 206500, D2_dimension: 128, vals: 26432000 +E1_dimension: 128, E2_dimension: 64, vals: 8192 + + +kernel execution time: 36.5794 ms +fused time: 37.1963 + +kernel execution time: 31.9277 ms +hadamard time: 32.6108 + +kernel execution time: 28.0947 ms +gemm time: 28.7572 + +kernel execution time: 203.157 ms +taco reference time: 203.921 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 206500, B2_dimension: 206500, vals: 1273389 +C1_dimension: 206500, C2_dimension: 128, vals: 26432000 +D1_dimension: 206500, D2_dimension: 128, vals: 26432000 +E1_dimension: 128, E2_dimension: 128, vals: 16384 + + +kernel execution time: 42.4207 ms +fused time: 42.9584 + +kernel execution time: 31.1526 ms +hadamard time: 31.8623 + +kernel execution time: 
62.6041 ms +gemm time: 63.199 + +kernel execution time: 416.714 ms +taco reference time: 417.403 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 5558326, B2_dimension: 5558326, vals: 59524291 +C1_dimension: 5558326, C2_dimension: 128, vals: 711465728 +D1_dimension: 5558326, D2_dimension: 128, vals: 711465728 +E1_dimension: 128, E2_dimension: 128, vals: 16384 + + +kernel execution time: 1265.12 ms +fused time: 1266.15 + +kernel execution time: 4815.82 ms +hadamard time: 4816.95 + +kernel execution time: 1478.77 ms +gemm time: 1479.51 + +kernel execution time: 63618.8 ms +taco reference time: 63619.9 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 10974, B2_dimension: 10974, vals: 428650 +C1_dimension: 10974, C2_dimension: 128, vals: 1404672 +D1_dimension: 10974, D2_dimension: 128, vals: 1404672 +E1_dimension: 128, E2_dimension: 128, vals: 16384 + + +kernel execution time: 4.44366 ms +fused time: 5.30002 + +kernel execution time: 1.60353 ms +hadamard time: 2.06029 + +kernel execution time: 4.56709 ms +gemm time: 4.9084 + +kernel execution time: 52.2837 ms +taco reference time: 52.7156 +/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 +C1_dimension: 36417, C2_dimension: 128, vals: 4661376 +D1_dimension: 36417, D2_dimension: 128, vals: 4661376 +E1_dimension: 128, E2_dimension: 128, vals: 16384 + + +kernel execution time: 13.0806 ms +fused time: 13.6544 + +kernel execution time: 12.1216 ms +hadamard time: 12.8046 
+ +kernel execution time: 11.8732 ms +gemm time: 12.47 + +kernel execution time: 477.422 ms +taco reference time: 477.987 +/home/min/a/kadhitha/ispc-examples/data/suitesparse/rma10/rma10.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 46835, B2_dimension: 46835, vals: 2374001 +C1_dimension: 46835, C2_dimension: 128, vals: 5994880 +D1_dimension: 46835, D2_dimension: 128, vals: 5994880 +E1_dimension: 128, E2_dimension: 128, vals: 16384 + + +kernel execution time: 13.6475 ms +fused time: 14.2071 + +kernel execution time: 12.1816 ms +hadamard time: 12.8468 + +kernel execution time: 14.7018 ms +gemm time: 15.233 + +kernel execution time: 251.649 ms +taco reference time: 252.229 +/home/min/a/kadhitha/ispc-examples/data/suitesparse/cant/cant.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 62451, B2_dimension: 62451, vals: 4007383 +C1_dimension: 62451, C2_dimension: 128, vals: 7993728 +D1_dimension: 62451, D2_dimension: 128, vals: 7993728 +E1_dimension: 128, E2_dimension: 128, vals: 16384 + + +kernel execution time: 20.2137 ms +fused time: 20.7037 + +kernel execution time: 19.6828 ms +hadamard time: 20.2722 + +kernel execution time: 18.5323 ms +gemm time: 19.0234 + +kernel execution time: 415.255 ms +taco reference time: 415.805 +/home/min/a/kadhitha/ispc-examples/data/suitesparse/consph/consph.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 83334, B2_dimension: 83334, vals: 6010480 +C1_dimension: 83334, C2_dimension: 128, vals: 10666752 +D1_dimension: 83334, D2_dimension: 128, vals: 10666752 +E1_dimension: 128, E2_dimension: 128, vals: 16384 + + +kernel execution time: 28.1295 ms +fused time: 28.6289 + +kernel execution time: 28.2393 ms +hadamard time: 28.8514 + +kernel execution time: 24.2246 ms +gemm time: 24.7551 + +kernel execution time: 597.455 ms +taco reference time: 598.049 +/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 121192, B2_dimension: 121192, 
vals: 2624331 +C1_dimension: 121192, C2_dimension: 128, vals: 15512576 +D1_dimension: 121192, D2_dimension: 128, vals: 15512576 +E1_dimension: 128, E2_dimension: 128, vals: 16384 + + +kernel execution time: 49.6444 ms +fused time: 50.1899 + +kernel execution time: 45.97 ms +hadamard time: 46.6381 + +kernel execution time: 33.5119 ms +gemm time: 34.0815 + +kernel execution time: 258.507 ms +taco reference time: 259.153 +/home/min/a/kadhitha/ispc-examples/data/suitesparse/shipsec1/shipsec1.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 140874, B2_dimension: 140874, vals: 7813404 +C1_dimension: 140874, C2_dimension: 128, vals: 18031872 +D1_dimension: 140874, D2_dimension: 128, vals: 18031872 +E1_dimension: 128, E2_dimension: 128, vals: 16384 + + +kernel execution time: 42.1499 ms +fused time: 42.7069 + +kernel execution time: 41.9158 ms +hadamard time: 42.597 + +kernel execution time: 37.5761 ms +gemm time: 38.1603 + +kernel execution time: 748.178 ms +taco reference time: 748.913 +/home/min/a/kadhitha/ispc-examples/data/suitesparse/scircuit/scircuit.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 170998, B2_dimension: 170998, vals: 958936 +C1_dimension: 170998, C2_dimension: 128, vals: 21887744 +D1_dimension: 170998, D2_dimension: 128, vals: 21887744 +E1_dimension: 128, E2_dimension: 128, vals: 16384 + + +kernel execution time: 32.0664 ms +fused time: 32.5614 + +kernel execution time: 27.8304 ms +hadamard time: 28.5102 + +kernel execution time: 45.5743 ms +gemm time: 46.1921 + +kernel execution time: 97.9936 ms +taco reference time: 98.6611 +/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 206500, B2_dimension: 206500, vals: 1273389 +C1_dimension: 206500, C2_dimension: 128, vals: 26432000 +D1_dimension: 206500, D2_dimension: 128, vals: 26432000 +E1_dimension: 128, E2_dimension: 128, vals: 16384 + + +kernel execution time: 42.0101 ms +fused time: 42.5555 + 
+kernel execution time: 38.2596 ms +hadamard time: 38.9704 + +kernel execution time: 55.2502 ms +gemm time: 55.8132 + +kernel execution time: 128.93 ms +taco reference time: 129.615 +/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 1000005, B2_dimension: 1000005, vals: 3105536 +C1_dimension: 1000005, C2_dimension: 128, vals: 128000640 +D1_dimension: 1000005, D2_dimension: 128, vals: 128000640 +E1_dimension: 128, E2_dimension: 128, vals: 16384 + + +kernel execution time: 156.672 ms +fused time: 157.149 + +kernel execution time: 108.579 ms +hadamard time: 109.187 + +kernel execution time: 266.855 ms +gemm time: 267.343 + +kernel execution time: 325.2 ms +taco reference time: 325.907 +/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 5558326, B2_dimension: 5558326, vals: 59524291 +C1_dimension: 5558326, C2_dimension: 128, vals: 711465728 +D1_dimension: 5558326, D2_dimension: 128, vals: 711465728 +E1_dimension: 128, E2_dimension: 128, vals: 16384 + + +kernel execution time: 1267.69 ms +fused time: 1268.78 + +kernel execution time: 1173.34 ms +hadamard time: 1174.13 + +kernel execution time: 1502.45 ms +gemm time: 1503.33 + +kernel execution time: 12918.1 ms +taco reference time: 12919.5 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/shipsec1/shipsec1.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 140874, B2_dimension: 140874, vals: 7813404 +C1_dimension: 140874, C2_dimension: 128, vals: 18031872 +D1_dimension: 140874, D2_dimension: 128, vals: 18031872 +E1_dimension: 128, E2_dimension: 128, vals: 16384 + + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/shipsec1/shipsec1.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 140874, B2_dimension: 140874, vals: 
7813404 +C1_dimension: 140874, C2_dimension: 128, vals: 18031872 +D1_dimension: 140874, D2_dimension: 128, vals: 18031872 +E1_dimension: 128, E2_dimension: 128, vals: 16384 + + +kernel execution time: 44.4685 ms +fused time: 47.652 + +kernel execution time: 39.859 ms +hadamard time: 40.465 + +kernel execution time: 40.2328 ms +gemm time: 40.7652 + +kernel execution time: 770.504 ms +taco reference time: 771.113 +/home/min/a/kadhitha/ispc-examples/data/suitesparse/scircuit/scircuit.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 170998, B2_dimension: 170998, vals: 958936 +C1_dimension: 170998, C2_dimension: 128, vals: 21887744 +D1_dimension: 170998, D2_dimension: 128, vals: 21887744 +E1_dimension: 128, E2_dimension: 128, vals: 16384 + + +kernel execution time: 32.6037 ms +fused time: 36.0777 + +kernel execution time: 27.1815 ms +hadamard time: 27.8676 + +kernel execution time: 46.1458 ms +gemm time: 46.6699 + +kernel execution time: 97.8299 ms +taco reference time: 98.5149 +/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 206500, B2_dimension: 206500, vals: 1273389 +C1_dimension: 206500, C2_dimension: 128, vals: 26432000 +D1_dimension: 206500, D2_dimension: 128, vals: 26432000 +E1_dimension: 128, E2_dimension: 128, vals: 16384 + + +kernel execution time: 42.3414 ms +fused time: 46.4717 + +kernel execution time: 37.0604 ms +hadamard time: 37.7717 + +kernel execution time: 55.4753 ms +gemm time: 56.0538 + +kernel execution time: 129.339 ms +taco reference time: 130.028 +/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 1000005, B2_dimension: 1000005, vals: 3105536 +C1_dimension: 1000005, C2_dimension: 128, vals: 128000640 +D1_dimension: 1000005, D2_dimension: 128, vals: 128000640 +E1_dimension: 128, E2_dimension: 128, vals: 16384 + + +kernel execution time: 159.647 ms +fused time: 164.344 + 
+kernel execution time: 110.823 ms +hadamard time: 111.516 + +kernel execution time: 268.805 ms +gemm time: 269.465 + +kernel execution time: 326.437 ms +taco reference time: 327.144 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 10974, B2_dimension: 10974, vals: 428650 +C1_dimension: 10974, C2_dimension: 128, vals: 1404672 +D1_dimension: 10974, D2_dimension: 128, vals: 1404672 +E1_dimension: 128, E2_dimension: 128, vals: 16384 + + +kernel execution time: 80.3808 ms +fused time: 82.9372 + +kernel execution time: 17.8402 ms +hadamard time: 18.4152 + +kernel execution time: 127.495 ms +gemm time: 128.275 + +kernel execution time: 1763.16 ms +taco reference time: 1763.78 +/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 +C1_dimension: 36417, C2_dimension: 128, vals: 4661376 +D1_dimension: 36417, D2_dimension: 128, vals: 4661376 +E1_dimension: 128, E2_dimension: 128, vals: 16384 + + +kernel execution time: 352.899 ms +fused time: 356.76 + +kernel execution time: 157.362 ms +hadamard time: 157.893 + +kernel execution time: 406.42 ms +gemm time: 407.203 + +kernel execution time: 17839.4 ms +taco reference time: 17840.5 +/home/min/a/kadhitha/ispc-examples/data/suitesparse/rma10/rma10.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 46835, B2_dimension: 46835, vals: 2374001 +C1_dimension: 46835, C2_dimension: 128, vals: 5994880 +D1_dimension: 46835, D2_dimension: 128, vals: 5994880 +E1_dimension: 128, E2_dimension: 128, vals: 16384 + + +kernel execution time: 360.403 ms +fused time: 364.207 + +kernel execution time: 92.7639 ms +hadamard time: 93.2881 + +kernel execution time: 519.132 ms +gemm time: 519.668 + +kernel execution time: 9767.06 ms +taco reference time: 9767.66 
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/cant/cant.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 62451, B2_dimension: 62451, vals: 4007383 +C1_dimension: 62451, C2_dimension: 128, vals: 7993728 +D1_dimension: 62451, D2_dimension: 128, vals: 7993728 +E1_dimension: 128, E2_dimension: 128, vals: 16384 + + +kernel execution time: 499.64 ms +fused time: 503.449 + +kernel execution time: 148.888 ms +hadamard time: 149.416 + +kernel execution time: 689.134 ms +gemm time: 689.652 + +kernel execution time: 16929 ms +taco reference time: 16930 +/home/min/a/kadhitha/ispc-examples/data/suitesparse/consph/consph.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 83334, B2_dimension: 83334, vals: 6010480 +C1_dimension: 83334, C2_dimension: 128, vals: 10666752 +D1_dimension: 83334, D2_dimension: 128, vals: 10666752 +E1_dimension: 128, E2_dimension: 128, vals: 16384 + + +kernel execution time: 690.556 ms +fused time: 694.221 + +kernel execution time: 230.454 ms +hadamard time: 230.979 + +kernel execution time: 922.831 ms +gemm time: 923.322 + +kernel execution time: 24781.4 ms +taco reference time: 24782.4 +/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 121192, B2_dimension: 121192, vals: 2624331 +C1_dimension: 121192, C2_dimension: 128, vals: 15512576 +D1_dimension: 121192, D2_dimension: 128, vals: 15512576 +E1_dimension: 128, E2_dimension: 128, vals: 16384 + + +kernel execution time: 871.577 ms +fused time: 876.166 + +kernel execution time: 213.157 ms +hadamard time: 213.706 + +kernel execution time: 1342.88 ms +gemm time: 1343.39 + +kernel execution time: 10845 ms +taco reference time: 10846.1 +/home/min/a/kadhitha/ispc-examples/data/suitesparse/shipsec1/shipsec1.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 140874, B2_dimension: 140874, vals: 7813404 +C1_dimension: 140874, C2_dimension: 128, vals: 18031872 +D1_dimension: 140874, D2_dimension: 128, vals: 
18031872 +E1_dimension: 128, E2_dimension: 128, vals: 16384 + + +kernel execution time: 1074.54 ms +fused time: 1078.91 + +kernel execution time: 302.447 ms +hadamard time: 302.972 + +kernel execution time: 1560.59 ms +gemm time: 1561.07 + +kernel execution time: 32089.4 ms +taco reference time: 32090.3 +/home/min/a/kadhitha/ispc-examples/data/suitesparse/scircuit/scircuit.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 170998, B2_dimension: 170998, vals: 958936 +C1_dimension: 170998, C2_dimension: 128, vals: 21887744 +D1_dimension: 170998, D2_dimension: 128, vals: 21887744 +E1_dimension: 128, E2_dimension: 128, vals: 16384 + + +kernel execution time: 1034.29 ms +fused time: 1037.96 + +kernel execution time: 85.577 ms +hadamard time: 86.1357 + +kernel execution time: 1881.63 ms +gemm time: 1882.13 + +kernel execution time: 3962.92 ms +taco reference time: 3963.97 +/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 206500, B2_dimension: 206500, vals: 1273389 +C1_dimension: 206500, C2_dimension: 128, vals: 26432000 +D1_dimension: 206500, D2_dimension: 128, vals: 26432000 +E1_dimension: 128, E2_dimension: 128, vals: 16384 + + +kernel execution time: 1241.65 ms +fused time: 1244.6 + +kernel execution time: 87.8479 ms +hadamard time: 88.3878 + +kernel execution time: 2286.72 ms +gemm time: 2287.22 + +kernel execution time: 5303.69 ms +taco reference time: 5304.69 +/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 1000005, B2_dimension: 1000005, vals: 3105536 +C1_dimension: 1000005, C2_dimension: 128, vals: 128000640 +D1_dimension: 1000005, D2_dimension: 128, vals: 128000640 +E1_dimension: 128, E2_dimension: 128, vals: 16384 + + +kernel execution time: 5642.42 ms +fused time: 5643.31 + +kernel execution time: 264.874 ms +hadamard time: 265.396 + +kernel execution time: 10966.5 ms +gemm time: 10967.4 
+ +kernel execution time: 12863.7 ms +taco reference time: 12864.8 +/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 5558326, B2_dimension: 5558326, vals: 59524291 +C1_dimension: 5558326, C2_dimension: 128, vals: 711465728 +D1_dimension: 5558326, D2_dimension: 128, vals: 711465728 +E1_dimension: 128, E2_dimension: 128, vals: 16384 + + +kernel execution time: 35075.5 ms +fused time: 35079.3 + +kernel execution time: 3869.9 ms +hadamard time: 3870.98 + +kernel execution time: 61504.6 ms +gemm time: 61505.4 + +kernel execution time: 245613 ms +taco reference time: 245614 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 2708, B2_dimension: 2708, vals: 5429 +C1_dimension: 2708, C2_dimension: 128, vals: 346624 +D1_dimension: 2708, D2_dimension: 128, vals: 346624 +E1_dimension: 128, E2_dimension: 128, vals: 16384 + + +kernel execution time: 18.3809 ms +fused time: 19.1229 + +kernel execution time: 0.635828 ms +hadamard time: 0.983143 + +kernel execution time: 30.5122 ms +gemm time: 30.7819 + +kernel execution time: 23.6746 ms +taco reference time: 24.0784 +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/amazon.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 548551, B2_dimension: 548551, vals: 1851744 +C1_dimension: 548551, C2_dimension: 128, vals: 70214528 +D1_dimension: 548551, D2_dimension: 128, vals: 70214528 +E1_dimension: 128, E2_dimension: 128, vals: 16384 + + +kernel execution time: 3580.2 ms +fused time: 3581 + +kernel execution time: 567.762 ms +hadamard time: 568.301 + +kernel execution time: 6079.96 ms +gemm time: 6080.46 + +kernel execution time: 8129.78 ms +taco reference time: 8130.38 + +sddmm-spmm execution + +----------------------------------------- 
+/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 2708, B2_dimension: 2708, vals: 5429 +C1_dimension: 2708, C2_dimension: 128, vals: 346624 +D1_dimension: 2708, D2_dimension: 128, vals: 346624 +E1_dimension: 128, E2_dimension: 128, vals: 16384 + + +kernel execution time: 18.4625 ms +fused time: 19.1824 + +kernel execution time: 0.520446 ms +hadamard time: 0.824011 + +kernel execution time: 30.2097 ms +gemm time: 30.46 + +kernel execution time: 23.4681 ms +taco reference time: 23.826 +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/amazon.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 548551, B2_dimension: 548551, vals: 1851744 +C1_dimension: 548551, C2_dimension: 128, vals: 70214528 +D1_dimension: 548551, D2_dimension: 128, vals: 70214528 +E1_dimension: 128, E2_dimension: 128, vals: 16384 + + +kernel execution time: 3528.39 ms +fused time: 3529.23 + +kernel execution time: 558.625 ms +hadamard time: 559.16 + +kernel execution time: 6157.3 ms +gemm time: 6158.14 + +kernel execution time: 8131.73 ms +taco reference time: 8132.69 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 2708, B2_dimension: 2708, vals: 5429 +C1_dimension: 2708, C2_dimension: 128, vals: 346624 +D1_dimension: 2708, D2_dimension: 128, vals: 346624 +E1_dimension: 128, E2_dimension: 128, vals: 16384 + + +kernel execution time: 2.27347 ms +fused time: 2.7115 + +kernel execution time: 0.180952 ms +hadamard time: 0.76318 + +kernel execution time: 2.72672 ms +gemm time: 3.22211 + +kernel execution time: 5.227 ms +taco reference time: 5.75632 +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/amazon.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 548551, B2_dimension: 548551, vals: 1851744 +C1_dimension: 548551, C2_dimension: 128, vals: 70214528 
+D1_dimension: 548551, D2_dimension: 128, vals: 70214528 +E1_dimension: 128, E2_dimension: 128, vals: 16384 + + +kernel execution time: 164.815 ms +fused time: 165.539 + +kernel execution time: 96.629 ms +hadamard time: 97.303 + +kernel execution time: 202.068 ms +gemm time: 202.628 + +kernel execution time: 273.96 ms +taco reference time: 274.643 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 2708, B2_dimension: 2708, vals: 5429 +C1_dimension: 2708, C2_dimension: 128, vals: 346624 +D1_dimension: 2708, D2_dimension: 128, vals: 346624 +E1_dimension: 128, E2_dimension: 128, vals: 16384 + + +kernel execution time: 2.37004 ms +fused time: 3.11591 + +kernel execution time: 0.176612 ms +hadamard time: 0.833621 + +kernel execution time: 2.08823 ms +gemm time: 2.59022 + +kernel execution time: 3.36531 ms +taco reference time: 4.11087 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 2708, B2_dimension: 2708, vals: 5429 +C1_dimension: 2708, C2_dimension: 128, vals: 346624 +D1_dimension: 2708, D2_dimension: 128, vals: 346624 +E1_dimension: 128, E2_dimension: 128, vals: 16384 + + +kernel execution time: 19.3307 ms +fused time: 20.0662 + +kernel execution time: 0.496176 ms +hadamard time: 0.931803 + +kernel execution time: 30.1194 ms +gemm time: 30.3654 + +kernel execution time: 23.3946 ms +taco reference time: 23.7411 diff --git a/test/stats/mttkrp-spmm.txt b/test/stats/mttkrp-spmm.txt new file mode 100644 index 000000000..fd6226179 --- /dev/null +++ b/test/stats/mttkrp-spmm.txt @@ -0,0 +1,1090 @@ + + mttkrp-spmm execution +A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m) +B1_dimension: 25, B2_dimension: 25, B3_dimension: 25, vals: 125 +C1_dimension: 25, C2_dimension: 25, vals: 625 +D1_dimension: 
25, D2_dimension: 25, vals: 625 +E1_dimension: 25, E2_dimension: 48, vals: 1200 + + +kernel execution time: 0.03045 ms +fused time: 0.870912 + +kernel execution time: 0.168452 ms +reference asymptotic blowup time: 0.983003 + +kernel execution time: 0.015 ms +mttkrp time: 0.493997 + +kernel execution time: 0.0267 ms +spmm time: 0.74405 + +mttkrp-spmm execution + +0.015 0.0267 0.03045 0.168452 + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/delicious-3d.tns +A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m) +B1_dimension: 532924, B2_dimension: 17262471, B3_dimension: 532924, vals: 140126181 +C1_dimension: 17262471, C2_dimension: 25, vals: 431561775 +D1_dimension: 2480308, D2_dimension: 25, vals: 62007700 +E1_dimension: 25, E2_dimension: 48, vals: 1200 + + +kernel execution time: 8190.76 ms +fused time: 8191.78 + +kernel execution time: 112801 ms +reference asymptotic blowup time: 112802 + +kernel execution time: 11198.5 ms +mttkrp time: 11199.5 + +kernel execution time: 238.88 ms +spmm time: 239.385 + +0.015 0.0267 0.03045 0.168452 +11198.5 238.88 8190.76 112801 + + +mttkrp-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/flickr-3d.tns +A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m) +B1_dimension: 319686, B2_dimension: 28153045, B3_dimension: 319686, vals: 112890310 +C1_dimension: 28153045, C2_dimension: 25, vals: 703826125 +D1_dimension: 1607191, D2_dimension: 25, vals: 40179775 +E1_dimension: 25, E2_dimension: 48, vals: 1200 + + +kernel execution time: 3951.18 ms +fused time: 3952.21 + +kernel execution time: 76964 ms +reference asymptotic blowup time: 76965.1 + +kernel execution time: 6212.97 ms +mttkrp time: 6213.89 + +kernel execution time: 142.233 ms +spmm time: 142.726 + +mttkrp-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/nell-2.tns +A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m) +B1_dimension: 12092, 
B2_dimension: 9184, B3_dimension: 12092, vals: 76879419 +C1_dimension: 9184, C2_dimension: 25, vals: 229600 +D1_dimension: 28818, D2_dimension: 25, vals: 720450 +E1_dimension: 25, E2_dimension: 48, vals: 1200 + + +kernel execution time: 997.696 ms +fused time: 998.725 + +kernel execution time: 55544.7 ms +reference asymptotic blowup time: 55545.9 + +kernel execution time: 1944.26 ms +mttkrp time: 1944.75 + +kernel execution time: 5.40774 ms +spmm time: 5.8765 + +mttkrp-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/nell-1.tns +A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m) +B1_dimension: 2902330, B2_dimension: 2143368, B3_dimension: 2902330, vals: 143599552 +C1_dimension: 2143368, C2_dimension: 25, vals: 53584200 +D1_dimension: 25495389, D2_dimension: 25, vals: 637384725 +E1_dimension: 25, E2_dimension: 48, vals: 1200 + + +kernel execution time: 16275.3 ms +fused time: 16276.4 + +kernel execution time: 325523 ms +reference asymptotic blowup time: 325525 + +kernel execution time: 29202.5 ms +mttkrp time: 29203.5 + +kernel execution time: 1240.14 ms +spmm time: 1240.66 + +mttkrp-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/vast-2015-mc1-3d.tns +A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m) +B1_dimension: 165427, B2_dimension: 11374, B3_dimension: 165427, vals: 26021854 +C1_dimension: 11374, C2_dimension: 25, vals: 284350 +D1_dimension: 2, D2_dimension: 25, vals: 50 +E1_dimension: 25, E2_dimension: 48, vals: 1200 + + +kernel execution time: 400.942 ms +fused time: 401.47 + +kernel execution time: 21565.2 ms +reference asymptotic blowup time: 21566.3 + +kernel execution time: 1292.53 ms +mttkrp time: 1293.05 + +kernel execution time: 72.2856 ms +spmm time: 72.8001 + +mttkrp-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/darpa1998.tns +A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m) +B1_dimension: 
22476, B2_dimension: 22476, B3_dimension: 22476, vals: 28421307 +C1_dimension: 22476, C2_dimension: 25, vals: 561900 +D1_dimension: 23776223, D2_dimension: 25, vals: 594405575 +E1_dimension: 25, E2_dimension: 48, vals: 1200 + + +kernel execution time: 1397.54 ms +fused time: 1398.54 + +kernel execution time: 39690 ms +reference asymptotic blowup time: 39691 + +kernel execution time: 4004.71 ms +mttkrp time: 4005.68 + +kernel execution time: 7.97584 ms +spmm time: 8.44535 + +mttkrp-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/freebase_music.tns +A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m) +B1_dimension: 23343790, B2_dimension: 23344784, B3_dimension: 23343790, vals: 99546550 +C1_dimension: 23344784, C2_dimension: 25, vals: 583619600 +D1_dimension: 166, D2_dimension: 25, vals: 4150 +E1_dimension: 25, E2_dimension: 48, vals: 1200 + + +kernel execution time: 15804.8 ms +fused time: 15805.9 + +kernel execution time: 79175 ms +reference asymptotic blowup time: 79176.1 + +kernel execution time: 10624.7 ms +mttkrp time: 10625.6 + +kernel execution time: 10007.2 ms +spmm time: 10008.2 + +mttkrp-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/freebase_sampled.tns +A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m) +B1_dimension: 38954435, B2_dimension: 38955429, B3_dimension: 38954435, vals: 139920770 +C1_dimension: 38955429, C2_dimension: 25, vals: 973885725 +D1_dimension: 532, D2_dimension: 25, vals: 13300 +E1_dimension: 25, E2_dimension: 48, vals: 1200 + + +kernel execution time: 23869.4 ms +fused time: 23870.5 + +kernel execution time: 113144 ms +reference asymptotic blowup time: 113145 + +kernel execution time: 15284.7 ms +mttkrp time: 15285.7 + +kernel execution time: 15154.3 ms +spmm time: 15155.6 + +mttkrp-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/delicious-3d.tns +A(i,m) = B(i,k,l) * D(l,j) * 
C(k,j) * E(j, m) +B1_dimension: 958893, B2_dimension: 55584242, B3_dimension: 958893, vals: 140126165 +C1_dimension: 55584242, C2_dimension: 25, vals: 1389606050 +D1_dimension: 2480308, D2_dimension: 25, vals: 62007700 +E1_dimension: 25, E2_dimension: 48, vals: 1200 + + +mttkrp-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/matmul_5-5-5.tns +A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m) +B1_dimension: 25, B2_dimension: 25, B3_dimension: 25, vals: 125 +C1_dimension: 25, C2_dimension: 25, vals: 625 +D1_dimension: 25, D2_dimension: 25, vals: 625 +E1_dimension: 25, E2_dimension: 48, vals: 1200 + + +kernel execution time: 0.043711 ms +fused time: 0.864271 + +kernel execution time: 0.027391 ms +mttkrp time: 0.889931 + +kernel execution time: 0.02264 ms +spmm time: 1.09649 + +kernel execution time: 0.04233 ms +reference asymptotic blowup time: 1.01915 + +mttkrp-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/nell-2.tns +A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m) +B1_dimension: 12092, B2_dimension: 9184, B3_dimension: 12092, vals: 76879419 +C1_dimension: 9184, C2_dimension: 25, vals: 229600 +D1_dimension: 28818, D2_dimension: 25, vals: 720450 +E1_dimension: 25, E2_dimension: 48, vals: 1200 + + +kernel execution time: 813.743 ms +fused time: 814.267 + +kernel execution time: 458.835 ms +mttkrp time: 459.4 + +kernel execution time: 3.56961 ms +spmm time: 4.08913 + +kernel execution time: 13803.8 ms +reference asymptotic blowup time: 13804.8 + +mttkrp-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/nell-2.tns +A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m) +B1_dimension: 12092, B2_dimension: 9184, B3_dimension: 12092, vals: 76879419 +C1_dimension: 9184, C2_dimension: 25, vals: 229600 +D1_dimension: 28818, D2_dimension: 25, vals: 720450 +E1_dimension: 25, E2_dimension: 48, vals: 1200 + + +kernel execution 
time: 224.386 ms +fused time: 224.986 + +kernel execution time: 101.692 ms +mttkrp time: 102.264 + +kernel execution time: 5.95563 ms +spmm time: 6.44162 + +kernel execution time: 2647.79 ms +reference asymptotic blowup time: 2648.57 + +mttkrp-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/darpa1998.tns +A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m) +B1_dimension: 22476, B2_dimension: 22476, B3_dimension: 22476, vals: 28421307 +C1_dimension: 22476, C2_dimension: 25, vals: 561900 +D1_dimension: 23776223, D2_dimension: 25, vals: 594405575 +E1_dimension: 25, E2_dimension: 48, vals: 1200 + + +kernel execution time: 208.602 ms +fused time: 209.122 + +kernel execution time: 631.37 ms +mttkrp time: 631.981 + +kernel execution time: 7.20919 ms +spmm time: 7.81651 + +kernel execution time: 6749.05 ms +reference asymptotic blowup time: 6750.17 + +mttkrp-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/vast-2015-mc1-3d.tns +A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m) +B1_dimension: 165427, B2_dimension: 11374, B3_dimension: 165427, vals: 26021854 +C1_dimension: 11374, C2_dimension: 25, vals: 284350 +D1_dimension: 2, D2_dimension: 25, vals: 50 +E1_dimension: 25, E2_dimension: 48, vals: 1200 + + +kernel execution time: 95.6907 ms +fused time: 96.2212 + +kernel execution time: 59.1475 ms +mttkrp time: 59.7153 + +kernel execution time: 63.6734 ms +spmm time: 64.1704 + +kernel execution time: 884.275 ms +reference asymptotic blowup time: 884.934 + +mttkrp-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/nell-2.tns +A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m) +B1_dimension: 12092, B2_dimension: 9184, B3_dimension: 12092, vals: 76879419 +C1_dimension: 9184, C2_dimension: 25, vals: 229600 +D1_dimension: 28818, D2_dimension: 25, vals: 720450 +E1_dimension: 25, E2_dimension: 48, vals: 1200 + + +kernel execution 
time: 225.843 ms +fused time: 226.345 + +kernel execution time: 100.14 ms +mttkrp time: 100.738 + +kernel execution time: 6.32395 ms +spmm time: 6.85452 + +kernel execution time: 2678.56 ms +reference asymptotic blowup time: 2679.35 + +mttkrp-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/flickr-3d.tns +A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m) +B1_dimension: 319686, B2_dimension: 28153045, B3_dimension: 319686, vals: 112890310 +C1_dimension: 28153045, C2_dimension: 25, vals: 703826125 +D1_dimension: 1607191, D2_dimension: 25, vals: 40179775 +E1_dimension: 25, E2_dimension: 48, vals: 1200 + + +kernel execution time: 503.61 ms +fused time: 504.129 + +kernel execution time: 314.899 ms +mttkrp time: 315.501 + +kernel execution time: 125.456 ms +spmm time: 125.953 + +kernel execution time: 3415.65 ms +reference asymptotic blowup time: 3416.62 + +mttkrp-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/delicious-3d.tns +A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m) +B1_dimension: 958893, B2_dimension: 55584242, B3_dimension: 958893, vals: 140126165 +C1_dimension: 55584242, C2_dimension: 25, vals: 1389606050 +D1_dimension: 2480308, D2_dimension: 25, vals: 62007700 +E1_dimension: 25, E2_dimension: 48, vals: 1200 + + +mttkrp-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/nell-1.tns +A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m) +B1_dimension: 2902330, B2_dimension: 2143368, B3_dimension: 2902330, vals: 143599552 +C1_dimension: 2143368, C2_dimension: 25, vals: 53584200 +D1_dimension: 25495389, D2_dimension: 25, vals: 637384725 +E1_dimension: 25, E2_dimension: 48, vals: 1200 + + +kernel execution time: 1501.57 ms +fused time: 1502.59 + +kernel execution time: 1748.65 ms +mttkrp time: 1749.21 + +kernel execution time: 1135.01 ms +spmm time: 1135.51 + +kernel execution time: 16178.4 ms +reference 
asymptotic blowup time: 16179.5 + +mttkrp-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/nell-1.tns +A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m) +B1_dimension: 2902330, B2_dimension: 2143368, B3_dimension: 2902330, vals: 143599552 +C1_dimension: 2143368, C2_dimension: 25, vals: 53584200 +D1_dimension: 25495389, D2_dimension: 25, vals: 637384725 +E1_dimension: 25, E2_dimension: 48, vals: 1200 + + +kernel execution time: 16005.7 ms +fused time: 16006.6 + +kernel execution time: 29157.8 ms +mttkrp time: 29158.8 + +kernel execution time: 1247.23 ms +spmm time: 1247.75 + +kernel execution time: 329124 ms +reference asymptotic blowup time: 329125 + +mttkrp-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/delicious-3d.tns +A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m) +B1_dimension: 958893, B2_dimension: 55584242, B3_dimension: 958893, vals: 140126165 +C1_dimension: 55584242, C2_dimension: 25, vals: 1389606050 +D1_dimension: 2480308, D2_dimension: 25, vals: 62007700 +E1_dimension: 25, E2_dimension: 48, vals: 1200 + + +mttkrp-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/delicious-3d.tns +A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m) +B1_dimension: 958893, B2_dimension: 55584242, B3_dimension: 958893, vals: 140126165 +C1_dimension: 55584242, C2_dimension: 25, vals: 1389606050 +D1_dimension: 2480308, D2_dimension: 25, vals: 62007700 +E1_dimension: 25, E2_dimension: 48, vals: 1200 + + +mttkrp-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/delicious-3d.tns +A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m) +B1_dimension: 958893, B2_dimension: 55584242, B3_dimension: 958893, vals: 140126165 +C1_dimension: 55584242, C2_dimension: 25, vals: 1389606050 +D1_dimension: 2480308, D2_dimension: 25, vals: 62007700 +E1_dimension: 25, E2_dimension: 48, vals: 1200 
+ + +mttkrp-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/delicious-3d.tns +A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m) +B1_dimension: 958893, B2_dimension: 55584242, B3_dimension: 958893, vals: 140126165 +C1_dimension: 55584242, C2_dimension: 25, vals: 1389606050 +D1_dimension: 2480308, D2_dimension: 25, vals: 62007700 +E1_dimension: 25, E2_dimension: 48, vals: 1200 + + +mttkrp-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/nell-2.tns +A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m) +B1_dimension: 12092, B2_dimension: 9184, B3_dimension: 12092, vals: 76879419 +C1_dimension: 9184, C2_dimension: 25, vals: 229600 +D1_dimension: 28818, D2_dimension: 25, vals: 720450 +E1_dimension: 25, E2_dimension: 48, vals: 1200 + + +kernel execution time: 2651.26 ms +reference asymptotic blowup time: 2652.08 + +mttkrp-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/delicious-3d.tns +A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m) +B1_dimension: 958893, B2_dimension: 55584242, B3_dimension: 958893, vals: 140126165 +C1_dimension: 55584242, C2_dimension: 25, vals: 1389606050 +D1_dimension: 2480308, D2_dimension: 25, vals: 62007700 +E1_dimension: 25, E2_dimension: 48, vals: 1200 + + +mttkrp-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/matmul_5-5-5.tns +A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m) +B1_dimension: 25, B2_dimension: 25, B3_dimension: 25, vals: 125 +C1_dimension: 25, C2_dimension: 32, vals: 800 +D1_dimension: 25, D2_dimension: 32, vals: 800 +E1_dimension: 32, E2_dimension: 64, vals: 2048 + + +kernel execution time: 0.286814 ms +reference asymptotic blowup time: 1.00956 + +mttkrp-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/matmul_5-5-5.tns +A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m) 
+B1_dimension: 25, B2_dimension: 25, B3_dimension: 25, vals: 125 +C1_dimension: 25, C2_dimension: 32, vals: 800 +D1_dimension: 25, D2_dimension: 32, vals: 800 +E1_dimension: 32, E2_dimension: 64, vals: 2048 + + +kernel execution time: 0.036661 ms +mttkrp time: 0.77391 + +kernel execution time: 0.02948 ms +mttkrp ryan time: 0.932103 + +kernel execution time: 0.264104 ms +reference asymptotic blowup time: 1.32301 + +mttkrp-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/matmul_5-5-5.tns +A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m) +B1_dimension: 25, B2_dimension: 25, B3_dimension: 25, vals: 125 +C1_dimension: 25, C2_dimension: 32, vals: 800 +D1_dimension: 25, D2_dimension: 32, vals: 800 +E1_dimension: 32, E2_dimension: 64, vals: 2048 + + +kernel execution time: 0.04003 ms +mttkrp time: 0.779201 + +kernel execution time: 0.022291 ms +mttkrp ryan time: 0.821601 + +kernel execution time: 0.268404 ms +reference asymptotic blowup time: 1.28741 + +mttkrp-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/matmul_5-5-5.tns +A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m) +B1_dimension: 25, B2_dimension: 25, B3_dimension: 25, vals: 125 +C1_dimension: 25, C2_dimension: 32, vals: 800 +D1_dimension: 25, D2_dimension: 32, vals: 800 +E1_dimension: 32, E2_dimension: 64, vals: 2048 + + +kernel execution time: 0.03006 ms +default mttkrp time: 0.641369 + +kernel execution time: 0.023191 ms +ryan mttkrp workspace time: 0.982223 + +kernel execution time: 0.084371 ms +spmm time: 0.944412 + +kernel execution time: 0.262723 ms +reference asymptotic blowup time: 0.927732 + +mttkrp-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/matmul_5-5-5.tns +A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m) +B1_dimension: 25, B2_dimension: 25, B3_dimension: 25, vals: 125 +C1_dimension: 25, C2_dimension: 32, vals: 800 +D1_dimension: 25, 
D2_dimension: 32, vals: 800 +E1_dimension: 32, E2_dimension: 64, vals: 2048 + + +kernel execution time: 0.046181 ms +default mttkrp time: 0.459706 + +kernel execution time: 0.076311 ms +ryan mttkrp workspace time: 1.1076 + +kernel execution time: 0.06528 ms +GeMM time: 0.307835 + +kernel execution time: 0.230713 ms +reference asymptotic blowup time: 0.942012 + +kernel execution time: 0.081741 ms +fused mttkrp+gemm time: 0.885412 + +mttkrp-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/flickr-3d.tns +A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m) +B1_dimension: 319686, B2_dimension: 28153045, B3_dimension: 319686, vals: 112890310 +C1_dimension: 28153045, C2_dimension: 32, vals: 900897440 +D1_dimension: 1607191, D2_dimension: 32, vals: 51430112 +E1_dimension: 32, E2_dimension: 64, vals: 2048 + + +kernel execution time: 233.898 ms +default mttkrp time: 234.426 + +kernel execution time: 293.46 ms +ryan mttkrp workspace time: 294.21 + +kernel execution time: 23.4947 ms +GeMM time: 24.009 + +kernel execution time: 2753.37 ms +reference asymptotic blowup time: 2754.12 + +kernel execution time: 287.939 ms +fused mttkrp+gemm time: 288.576 + +mttkrp-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/nell-2.tns +A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m) +B1_dimension: 12092, B2_dimension: 9184, B3_dimension: 12092, vals: 76879419 +C1_dimension: 9184, C2_dimension: 32, vals: 293888 +D1_dimension: 28818, D2_dimension: 32, vals: 922176 +E1_dimension: 32, E2_dimension: 64, vals: 2048 + + +kernel execution time: 140.989 ms +default mttkrp time: 141.517 + +kernel execution time: 36.4285 ms +ryan mttkrp workspace time: 37.0544 + +kernel execution time: 1.06091 ms +GeMM time: 1.6425 + +kernel execution time: 3142.38 ms +reference asymptotic blowup time: 3143.28 + +kernel execution time: 43.1867 ms +fused mttkrp+gemm time: 43.8393 + +mttkrp-spmm execution + 
+----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/nell-1.tns +A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m) +B1_dimension: 2902330, B2_dimension: 2143368, B3_dimension: 2902330, vals: 143599552 +C1_dimension: 2143368, C2_dimension: 32, vals: 68587776 +D1_dimension: 25495389, D2_dimension: 32, vals: 815852448 +E1_dimension: 32, E2_dimension: 64, vals: 2048 + + +kernel execution time: 2635.67 ms +default mttkrp time: 2636.7 + +kernel execution time: 913.661 ms +ryan mttkrp workspace time: 914.435 + +kernel execution time: 166.615 ms +GeMM time: 167.532 + +kernel execution time: 39080.1 ms +reference asymptotic blowup time: 39080.8 + +kernel execution time: 1141.77 ms +fused mttkrp+gemm time: 1142.88 + +mttkrp-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/vast-2015-mc1-3d.tns +A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m) +B1_dimension: 165427, B2_dimension: 11374, B3_dimension: 165427, vals: 26021854 +C1_dimension: 11374, C2_dimension: 32, vals: 363968 +D1_dimension: 2, D2_dimension: 32, vals: 64 +E1_dimension: 32, E2_dimension: 64, vals: 2048 + + +kernel execution time: 38.5561 ms +default mttkrp time: 39.0876 + +kernel execution time: 18.0733 ms +ryan mttkrp workspace time: 18.6685 + +kernel execution time: 9.91856 ms +GeMM time: 10.4003 + +kernel execution time: 663.996 ms +reference asymptotic blowup time: 664.529 + +kernel execution time: 15.476 ms +fused mttkrp+gemm time: 16.1515 + +mttkrp-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/darpa1998.tns +A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m) +B1_dimension: 22476, B2_dimension: 22476, B3_dimension: 22476, vals: 28421307 +C1_dimension: 22476, C2_dimension: 32, vals: 719232 +D1_dimension: 23776223, D2_dimension: 32, vals: 760839136 +E1_dimension: 32, E2_dimension: 64, vals: 2048 + + +kernel execution time: 893.657 ms +default mttkrp time: 894.664 + +kernel 
execution time: 228.227 ms +ryan mttkrp workspace time: 228.852 + +kernel execution time: 1.81839 ms +GeMM time: 2.27454 + +kernel execution time: 13301.8 ms +reference asymptotic blowup time: 13302.7 + +kernel execution time: 238.142 ms +fused mttkrp+gemm time: 238.778 + +mttkrp-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/delicious-3d.tns +A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m) +B1_dimension: 958893, B2_dimension: 55584242, B3_dimension: 958893, vals: 140126164 +C1_dimension: 55584242, C2_dimension: 32, vals: 1778695744 +D1_dimension: 2480308, D2_dimension: 32, vals: 79369856 +E1_dimension: 32, E2_dimension: 64, vals: 2048 + + +mttkrp-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/flickr-3d.tns +A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m) +B1_dimension: 319686, B2_dimension: 28153045, B3_dimension: 319686, vals: 112890310 +C1_dimension: 28153045, C2_dimension: 32, vals: 900897440 +D1_dimension: 1607191, D2_dimension: 32, vals: 51430112 +E1_dimension: 32, E2_dimension: 64, vals: 2048 + + +kernel execution time: 6303 ms +default mttkrp time: 6303.86 + +kernel execution time: 4378.98 ms +ryan mttkrp workspace time: 4380.07 + +kernel execution time: 449.512 ms +GeMM time: 450.037 + +kernel execution time: 116274 ms +reference asymptotic blowup time: 116275 + +kernel execution time: 4299.26 ms +fused mttkrp+gemm time: 4300.33 + +mttkrp-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/nell-2.tns +A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m) +B1_dimension: 12092, B2_dimension: 9184, B3_dimension: 12092, vals: 76879419 +C1_dimension: 9184, C2_dimension: 32, vals: 293888 +D1_dimension: 28818, D2_dimension: 32, vals: 922176 +E1_dimension: 32, E2_dimension: 64, vals: 2048 + + +kernel execution time: 2606.24 ms +default mttkrp time: 2607.1 + +kernel execution time: 878.486 ms +ryan mttkrp 
workspace time: 879.009 + +kernel execution time: 17.5967 ms +GeMM time: 18.0274 + +kernel execution time: 93762.9 ms +reference asymptotic blowup time: 93763.7 + +kernel execution time: 1052.15 ms +fused mttkrp+gemm time: 1052.76 + +mttkrp-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/nell-1.tns +A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m) +B1_dimension: 2902330, B2_dimension: 2143368, B3_dimension: 2902330, vals: 143599552 +C1_dimension: 2143368, C2_dimension: 32, vals: 68587776 +D1_dimension: 25495389, D2_dimension: 32, vals: 815852448 +E1_dimension: 32, E2_dimension: 64, vals: 2048 + + +kernel execution time: 36869.4 ms +default mttkrp time: 36870.3 + +kernel execution time: 17566.6 ms +ryan mttkrp workspace time: 17567.6 + +kernel execution time: 4060.98 ms +GeMM time: 4061.93 + +kernel execution time: 720483 ms +reference asymptotic blowup time: 720484 + +kernel execution time: 17354.7 ms +fused mttkrp+gemm time: 17355.9 + +mttkrp-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/vast-2015-mc1-3d.tns +A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m) +B1_dimension: 165427, B2_dimension: 11374, B3_dimension: 165427, vals: 26021854 +C1_dimension: 11374, C2_dimension: 32, vals: 363968 +D1_dimension: 2, D2_dimension: 32, vals: 64 +E1_dimension: 32, E2_dimension: 64, vals: 2048 + + +kernel execution time: 1680.94 ms +default mttkrp time: 1681.8 + +kernel execution time: 615.002 ms +ryan mttkrp workspace time: 615.585 + +kernel execution time: 231.923 ms +GeMM time: 232.453 + +kernel execution time: 28415.3 ms +reference asymptotic blowup time: 28416.4 + +kernel execution time: 453.141 ms +fused mttkrp+gemm time: 453.827 + +mttkrp-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/darpa1998.tns +A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m) +B1_dimension: 22476, B2_dimension: 22476, B3_dimension: 
22476, vals: 28421307 +C1_dimension: 22476, C2_dimension: 32, vals: 719232 +D1_dimension: 23776223, D2_dimension: 32, vals: 760839136 +E1_dimension: 32, E2_dimension: 64, vals: 2048 + + +kernel execution time: 4430.77 ms +default mttkrp time: 4431.71 + +kernel execution time: 1465.2 ms +ryan mttkrp workspace time: 1465.77 + +kernel execution time: 32.1871 ms +GeMM time: 32.6436 + +kernel execution time: 71199.8 ms +reference asymptotic blowup time: 71200.9 + +kernel execution time: 1570.11 ms +fused mttkrp+gemm time: 1570.76 + +mttkrp-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/darpa1998.tns +A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m) +B1_dimension: 22476, B2_dimension: 22476, B3_dimension: 22476, vals: 28421307 +C1_dimension: 22476, C2_dimension: 32, vals: 719232 +D1_dimension: 23776223, D2_dimension: 32, vals: 760839136 +E1_dimension: 32, E2_dimension: 64, vals: 2048 + + +kernel execution time: 882.674 ms +default mttkrp time: 883.69 + +kernel execution time: 231.925 ms +ryan mttkrp workspace time: 232.94 + +kernel execution time: 1.87878 ms +GeMM time: 2.38818 + +kernel execution time: 13018.7 ms +reference asymptotic blowup time: 13019.7 + +kernel execution time: 227.495 ms +fused mttkrp+gemm time: 228.182 + +mttkrp-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/darpa1998.tns +A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m) +B1_dimension: 22476, B2_dimension: 22476, B3_dimension: 22476, vals: 28421307 +C1_dimension: 22476, C2_dimension: 32, vals: 719232 +D1_dimension: 23776223, D2_dimension: 32, vals: 760839136 +E1_dimension: 32, E2_dimension: 64, vals: 2048 + + +kernel execution time: 874.742 ms +default mttkrp time: 875.218 + +kernel execution time: 231.556 ms +ryan mttkrp workspace time: 232.223 + +kernel execution time: 1.7427 ms +GeMM time: 2.19512 + +kernel execution time: 13047.8 ms +reference asymptotic blowup time: 13048.7 + +kernel 
execution time: 232.174 ms +fused mttkrp+gemm time: 232.85 + +mttkrp-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/vast-2015-mc1-3d.tns +A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m) +B1_dimension: 165427, B2_dimension: 11374, B3_dimension: 165427, vals: 26021854 +C1_dimension: 11374, C2_dimension: 32, vals: 363968 +D1_dimension: 2, D2_dimension: 32, vals: 64 +E1_dimension: 32, E2_dimension: 64, vals: 2048 + + +kernel execution time: 40.9013 ms +default mttkrp time: 41.4712 + +kernel execution time: 18.9468 ms +ryan mttkrp workspace time: 19.5875 + +kernel execution time: 10.8838 ms +GeMM time: 11.3865 + +kernel execution time: 700.825 ms +reference asymptotic blowup time: 701.445 + +kernel execution time: 15.8743 ms +fused mttkrp+gemm time: 16.5313 + +mttkrp-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/matmul_5-5-5.tns +A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m) +B1_dimension: 25, B2_dimension: 25, B3_dimension: 25, vals: 125 +C1_dimension: 25, C2_dimension: 32, vals: 800 +D1_dimension: 25, D2_dimension: 32, vals: 800 +E1_dimension: 32, E2_dimension: 64, vals: 2048 + + +kernel execution time: 0.02019 ms +default mttkrp time: 3.8105 + +kernel execution time: 0.01628 ms +ryan mttkrp workspace time: 0.602618 + +kernel execution time: 0.075521 ms +GeMM time: 0.491146 + +kernel execution time: 0.254864 ms +reference asymptotic blowup time: 0.897372 + +kernel execution time: 0.038201 ms +fused mttkrp+gemm time: 4.54224 + +mttkrp-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/matmul_5-5-5.tns +A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m) +B1_dimension: 25, B2_dimension: 25, B3_dimension: 25, vals: 125 +C1_dimension: 25, C2_dimension: 32, vals: 800 +D1_dimension: 25, D2_dimension: 32, vals: 800 +E1_dimension: 32, E2_dimension: 64, vals: 2048 + + +kernel execution time: 0.02015 ms +default 
mttkrp time: 3.93207 + +kernel execution time: 0.015561 ms +ryan mttkrp workspace time: 0.559818 + +kernel execution time: 0.074741 ms +GeMM time: 0.880342 + +kernel execution time: 0.250803 ms +reference asymptotic blowup time: 0.892052 + +kernel execution time: 0.038071 ms +fused mttkrp+gemm time: 3.0867 + +mttkrp-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/matmul_5-5-5.tns +A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m) +B1_dimension: 25, B2_dimension: 25, B3_dimension: 25, vals: 125 +C1_dimension: 25, C2_dimension: 32, vals: 800 +D1_dimension: 25, D2_dimension: 32, vals: 800 +E1_dimension: 32, E2_dimension: 64, vals: 2048 + + +kernel execution time: 0.02689 ms +default mttkrp time: 0.73934 + +kernel execution time: 0.02205 ms +ryan mttkrp workspace time: 0.863852 + +kernel execution time: 0.081811 ms +GeMM time: 0.527658 + +kernel execution time: 0.259993 ms +reference asymptotic blowup time: 0.923212 + +kernel execution time: 0.042261 ms +fused mttkrp+gemm time: 0.703349 + +mttkrp-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/nell-2.tns +A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m) +B1_dimension: 12092, B2_dimension: 9184, B3_dimension: 12092, vals: 76879419 +C1_dimension: 9184, C2_dimension: 32, vals: 293888 +D1_dimension: 28818, D2_dimension: 32, vals: 922176 +E1_dimension: 32, E2_dimension: 64, vals: 2048 + + +kernel execution time: 141.637 ms +default mttkrp time: 142.17 + +kernel execution time: 41.1194 ms +ryan mttkrp workspace time: 41.7838 + +kernel execution time: 1.06942 ms +GeMM time: 1.50588 + +kernel execution time: 3218.72 ms +reference asymptotic blowup time: 3219.51 + +kernel execution time: 145.235 ms +fused mttkrp+gemm time: 145.866 + +mttkrp-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/nell-2.tns +A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m) +B1_dimension: 12092, 
B2_dimension: 9184, B3_dimension: 12092, vals: 76879419 +C1_dimension: 9184, C2_dimension: 32, vals: 293888 +D1_dimension: 28818, D2_dimension: 32, vals: 922176 +E1_dimension: 32, E2_dimension: 64, vals: 2048 + + +kernel execution time: 148.092 ms +default mttkrp time: 148.691 + +kernel execution time: 41.3947 ms +ryan mttkrp workspace time: 42.046 + +kernel execution time: 1.03445 ms +GeMM time: 1.45556 + +kernel execution time: 3211.6 ms +reference asymptotic blowup time: 3212.43 + +kernel execution time: 45.5971 ms +fused mttkrp+gemm time: 46.2057 diff --git a/test/stats/sddmm-spmm-gemm.txt b/test/stats/sddmm-spmm-gemm.txt new file mode 100644 index 000000000..02665478f --- /dev/null +++ b/test/stats/sddmm-spmm-gemm.txt @@ -0,0 +1,1471 @@ + +sddmm-spmm-gemm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx +ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); +B1_dimension: 10974, B2_dimension: 10974, vals: 428650 +C1_dimension: 10974, C2_dimension: 64, vals: 702336 +D1_dimension: 10974, D2_dimension: 64, vals: 702336 +E1_dimension: 10974, E2_dimension: 64, vals: 702336 +G1_dimension: 10974, G2_dimension: 64, vals: 4096 + + +kernel execution time: 2.51139 ms +fused time: 3.49403 + +kernel execution time: 3.80634 ms +sddmm time: 4.13132 + +kernel execution time: 0.75853 ms +sddmm ryan time: 1.07946 + +kernel execution time: 0.968473 ms +spmm ryan time: 1.2051 + +kernel execution time: 1.39879 ms +gemm time: 1.6602 + +kernel execution time: 1070.79 ms +taco reference time: 1071.2 + +sddmm-spmm-gemm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx +ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); +B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 +C1_dimension: 36417, C2_dimension: 64, vals: 2330688 +D1_dimension: 36417, D2_dimension: 64, vals: 2330688 +E1_dimension: 36417, E2_dimension: 64, vals: 2330688 
+G1_dimension: 36417, G2_dimension: 64, vals: 4096 + + +kernel execution time: 8.43361 ms +fused time: 9.03941 + +kernel execution time: 13.3195 ms +sddmm time: 13.9487 + +kernel execution time: 4.73639 ms +sddmm ryan time: 5.32202 + +kernel execution time: 4.735 ms +spmm ryan time: 5.22103 + +kernel execution time: 3.66798 ms +gemm time: 4.15167 + +kernel execution time: 10658.4 ms +taco reference time: 10659.3 + +sddmm-spmm-gemm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/rma10/rma10.mtx +ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); +B1_dimension: 46835, B2_dimension: 46835, vals: 2374001 +C1_dimension: 46835, C2_dimension: 64, vals: 2997440 +D1_dimension: 46835, D2_dimension: 64, vals: 2997440 +E1_dimension: 46835, E2_dimension: 64, vals: 2997440 +G1_dimension: 46835, G2_dimension: 64, vals: 4096 + + +kernel execution time: 7.54896 ms +fused time: 8.15687 + +kernel execution time: 15.1277 ms +sddmm time: 15.796 + +kernel execution time: 3.51464 ms +sddmm ryan time: 4.10653 + +kernel execution time: 4.21975 ms +spmm ryan time: 4.6923 + +kernel execution time: 4.74088 ms +gemm time: 5.2156 + +kernel execution time: 5949.54 ms +taco reference time: 5950.52 + +sddmm-spmm-gemm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/cant/cant.mtx +ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); +B1_dimension: 62451, B2_dimension: 62451, vals: 4007383 +C1_dimension: 62451, C2_dimension: 64, vals: 3996864 +D1_dimension: 62451, D2_dimension: 64, vals: 3996864 +E1_dimension: 62451, E2_dimension: 64, vals: 3996864 +G1_dimension: 62451, G2_dimension: 64, vals: 4096 + + +kernel execution time: 11.7188 ms +fused time: 12.3427 + +kernel execution time: 18.5962 ms +sddmm time: 19.2831 + +kernel execution time: 6.5821 ms +sddmm ryan time: 7.20737 + +kernel execution time: 6.6327 ms +spmm ryan time: 7.20703 + +kernel execution time: 6.06003 ms +gemm time: 6.61794 + 
+kernel execution time: 9765.93 ms +taco reference time: 9766.85 + +sddmm-spmm-gemm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/consph/consph.mtx +ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); +B1_dimension: 83334, B2_dimension: 83334, vals: 6010480 +C1_dimension: 83334, C2_dimension: 64, vals: 5333376 +D1_dimension: 83334, D2_dimension: 64, vals: 5333376 +E1_dimension: 83334, E2_dimension: 64, vals: 5333376 +G1_dimension: 83334, G2_dimension: 64, vals: 4096 + + +kernel execution time: 16.3022 ms +fused time: 16.877 + +kernel execution time: 26.4065 ms +sddmm time: 26.9999 + +kernel execution time: 9.6103 ms +sddmm ryan time: 10.1859 + +kernel execution time: 9.5796 ms +spmm ryan time: 10.139 + +kernel execution time: 7.75909 ms +gemm time: 8.27337 + +kernel execution time: 14674.3 ms +taco reference time: 14675.2 + +sddmm-spmm-gemm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx +ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); +B1_dimension: 121192, B2_dimension: 121192, vals: 2624331 +C1_dimension: 121192, C2_dimension: 64, vals: 7756288 +D1_dimension: 121192, D2_dimension: 64, vals: 7756288 +E1_dimension: 121192, E2_dimension: 64, vals: 7756288 +G1_dimension: 121192, G2_dimension: 64, vals: 4096 + + +kernel execution time: 28.3919 ms +fused time: 29.022 + +kernel execution time: 28.7666 ms +sddmm time: 29.4282 + +kernel execution time: 10.9353 ms +sddmm ryan time: 11.5639 + +kernel execution time: 12.2792 ms +spmm ryan time: 12.86 + +kernel execution time: 12.0463 ms +gemm time: 12.6219 + +kernel execution time: 6496.16 ms +taco reference time: 6497.16 + +sddmm-spmm-gemm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/shipsec1/shipsec1.mtx +ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); +B1_dimension: 140874, B2_dimension: 140874, vals: 7813404 +C1_dimension: 
140874, C2_dimension: 64, vals: 9015936 +D1_dimension: 140874, D2_dimension: 64, vals: 9015936 +E1_dimension: 140874, E2_dimension: 64, vals: 9015936 +G1_dimension: 140874, G2_dimension: 64, vals: 4096 + + +kernel execution time: 23.8673 ms +fused time: 24.4851 + +kernel execution time: 38.4245 ms +sddmm time: 39.0808 + +kernel execution time: 13.3169 ms +sddmm ryan time: 13.9402 + +kernel execution time: 13.8214 ms +spmm ryan time: 14.3969 + +kernel execution time: 13.3955 ms +gemm time: 14.0084 + +kernel execution time: 19010.9 ms +taco reference time: 19012 + +sddmm-spmm-gemm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/scircuit/scircuit.mtx +ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); +B1_dimension: 170998, B2_dimension: 170998, vals: 958936 +C1_dimension: 170998, C2_dimension: 64, vals: 10943872 +D1_dimension: 170998, D2_dimension: 64, vals: 10943872 +E1_dimension: 170998, E2_dimension: 64, vals: 10943872 +G1_dimension: 170998, G2_dimension: 64, vals: 4096 + + +kernel execution time: 19.1593 ms +fused time: 19.7496 + +kernel execution time: 31.0395 ms +sddmm time: 31.6882 + +kernel execution time: 7.35776 ms +sddmm ryan time: 7.96434 + +kernel execution time: 9.33589 ms +spmm ryan time: 9.89731 + +kernel execution time: 16.4733 ms +gemm time: 17.0352 + +kernel execution time: 2397 ms +taco reference time: 2397.64 + +sddmm-spmm-gemm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx +ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); +B1_dimension: 1000005, B2_dimension: 1000005, vals: 3105536 +C1_dimension: 1000005, C2_dimension: 64, vals: 64000320 +D1_dimension: 1000005, D2_dimension: 64, vals: 64000320 +E1_dimension: 1000005, E2_dimension: 64, vals: 64000320 +G1_dimension: 1000005, G2_dimension: 64, vals: 4096 + + +kernel execution time: 66.7468 ms +fused time: 67.289 + +kernel execution time: 69.5837 ms +sddmm time: 70.1602 + 
+kernel execution time: 23.2899 ms +sddmm ryan time: 23.8277 + +kernel execution time: 41.9566 ms +spmm ryan time: 42.5095 + +kernel execution time: 93.8383 ms +gemm time: 94.3738 + +kernel execution time: 7587.7 ms +taco reference time: 7588.87 + +sddmm-spmm-gemm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx +ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); +B1_dimension: 5558326, B2_dimension: 5558326, vals: 59524291 +C1_dimension: 5558326, C2_dimension: 64, vals: 355732864 +D1_dimension: 5558326, D2_dimension: 64, vals: 355732864 +E1_dimension: 5558326, E2_dimension: 64, vals: 355732864 +G1_dimension: 5558326, G2_dimension: 64, vals: 4096 + + +kernel execution time: 688.492 ms +fused time: 689.478 + +kernel execution time: 979.86 ms +sddmm time: 980.45 + +kernel execution time: 318.248 ms +sddmm ryan time: 318.831 + +kernel execution time: 449.669 ms +spmm ryan time: 450.215 + +kernel execution time: 503.695 ms +gemm time: 504.291 + +kernel execution time: 326798 ms +taco reference time: 326799 + +sddmm-spmm-gemm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx +ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); +B1_dimension: 5558326, B2_dimension: 5558326, vals: 59524291 +C1_dimension: 5558326, C2_dimension: 64, vals: 355732864 +D1_dimension: 5558326, D2_dimension: 64, vals: 355732864 +E1_dimension: 5558326, E2_dimension: 64, vals: 355732864 +G1_dimension: 5558326, G2_dimension: 64, vals: 4096 + + +kernel execution time: 9624.7 ms +fused time: 9625.73 + +kernel execution time: 1635.76 ms +sddmm time: 1636.3 + +kernel execution time: 1636.41 ms +sddmm ryan time: 1636.96 + +kernel execution time: 2930.01 ms +spmm ryan time: 2930.5 + +kernel execution time: 15204.2 ms +gemm time: 15205.2 + +sddmm-spmm-gemm execution + +----------------------------------------- 
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx +ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); +B1_dimension: 10974, B2_dimension: 10974, vals: 428650 +C1_dimension: 10974, C2_dimension: 64, vals: 702336 +D1_dimension: 10974, D2_dimension: 64, vals: 702336 +E1_dimension: 10974, E2_dimension: 64, vals: 702336 +G1_dimension: 10974, G2_dimension: 64, vals: 4096 + + +kernel execution time: 31.0958 ms +fused time: 31.6403 + +kernel execution time: 9.52362 ms +sddmm time: 10.0411 + +kernel execution time: 9.50283 ms +sddmm ryan time: 9.98181 + +kernel execution time: 9.9883 ms +spmm ryan time: 10.3927 + +kernel execution time: 30.6724 ms +gemm time: 31.0956 + +kernel execution time: 50903.4 ms +taco reference time: 50904.4 + +sddmm-spmm-gemm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx +ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); +B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 +C1_dimension: 36417, C2_dimension: 64, vals: 2330688 +D1_dimension: 36417, D2_dimension: 64, vals: 2330688 +E1_dimension: 36417, E2_dimension: 64, vals: 2330688 +G1_dimension: 36417, G2_dimension: 64, vals: 4096 + + +kernel execution time: 221.251 ms +fused time: 223.31 + +kernel execution time: 90.6291 ms +sddmm time: 91.9017 + +kernel execution time: 92.6299 ms +sddmm ryan time: 93.1693 + +kernel execution time: 70.0109 ms +spmm ryan time: 70.4884 + +kernel execution time: 103.984 ms +gemm time: 105.217 + +kernel execution time: 441848 ms +taco reference time: 441849 + +sddmm-spmm-gemm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/rma10/rma10.mtx +ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); +B1_dimension: 46835, B2_dimension: 46835, vals: 2374001 +C1_dimension: 46835, C2_dimension: 64, vals: 2997440 +D1_dimension: 46835, D2_dimension: 64, vals: 2997440 +E1_dimension: 46835, E2_dimension: 64, vals: 2997440 
+G1_dimension: 46835, G2_dimension: 64, vals: 4096 + + +kernel execution time: 156.706 ms +fused time: 158.878 + +kernel execution time: 53.3541 ms +sddmm time: 53.8804 + +kernel execution time: 53.6128 ms +sddmm ryan time: 54.7942 + +kernel execution time: 51.5253 ms +spmm ryan time: 52.5961 + +kernel execution time: 130.147 ms +gemm time: 131.306 + +kernel execution time: 243737 ms +taco reference time: 243739 +/home/min/a/kadhitha/ispc-examples/data/suitesparse/cant/cant.mtx +ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); +B1_dimension: 62451, B2_dimension: 62451, vals: 4007383 +C1_dimension: 62451, C2_dimension: 64, vals: 3996864 +D1_dimension: 62451, D2_dimension: 64, vals: 3996864 +E1_dimension: 62451, E2_dimension: 64, vals: 3996864 +G1_dimension: 62451, G2_dimension: 64, vals: 4096 + + +kernel execution time: 238.619 ms +fused time: 240.152 + +kernel execution time: 84.8828 ms +sddmm time: 85.4286 + +kernel execution time: 80.7058 ms +sddmm ryan time: 81.2588 + +kernel execution time: 75.2549 ms +spmm ryan time: 75.7338 + +kernel execution time: 174.145 ms +gemm time: 174.654 + +kernel execution time: 412699 ms +taco reference time: 412701 +/home/min/a/kadhitha/ispc-examples/data/suitesparse/consph/consph.mtx +ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); +B1_dimension: 83334, B2_dimension: 83334, vals: 6010480 +C1_dimension: 83334, C2_dimension: 64, vals: 5333376 +D1_dimension: 83334, D2_dimension: 64, vals: 5333376 +E1_dimension: 83334, E2_dimension: 64, vals: 5333376 +G1_dimension: 83334, G2_dimension: 64, vals: 4096 + + +kernel execution time: 350.004 ms +fused time: 351.319 + +kernel execution time: 123.574 ms +sddmm time: 124.101 + +kernel execution time: 126.113 ms +sddmm ryan time: 127.971 + +kernel execution time: 113.146 ms +spmm ryan time: 113.615 + +kernel execution time: 234.287 ms +gemm time: 235.546 + +kernel execution time: 619783 ms +taco reference time: 619784 +/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx 
+ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); +B1_dimension: 121192, B2_dimension: 121192, vals: 2624331 +C1_dimension: 121192, C2_dimension: 64, vals: 7756288 +D1_dimension: 121192, D2_dimension: 64, vals: 7756288 +E1_dimension: 121192, E2_dimension: 64, vals: 7756288 +G1_dimension: 121192, G2_dimension: 64, vals: 4096 + + +kernel execution time: 335.548 ms +fused time: 337.292 + +kernel execution time: 90.8795 ms +sddmm time: 91.3981 + +kernel execution time: 87.7678 ms +sddmm ryan time: 88.2879 + +kernel execution time: 111.725 ms +spmm ryan time: 113.063 + +kernel execution time: 338.451 ms +gemm time: 340.2 + +kernel execution time: 268303 ms +taco reference time: 268304 +/home/min/a/kadhitha/ispc-examples/data/suitesparse/shipsec1/shipsec1.mtx +ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); +B1_dimension: 140874, B2_dimension: 140874, vals: 7813404 +C1_dimension: 140874, C2_dimension: 64, vals: 9015936 +D1_dimension: 140874, D2_dimension: 64, vals: 9015936 +E1_dimension: 140874, E2_dimension: 64, vals: 9015936 +G1_dimension: 140874, G2_dimension: 64, vals: 4096 + + +kernel execution time: 488.065 ms +fused time: 489.312 + +kernel execution time: 161.434 ms +sddmm time: 163.199 + +kernel execution time: 164.295 ms +sddmm ryan time: 165.567 + +kernel execution time: 154.131 ms +spmm ryan time: 154.61 + +kernel execution time: 391.972 ms +gemm time: 393.242 + +kernel execution time: 798245 ms +taco reference time: 798247 +/home/min/a/kadhitha/ispc-examples/data/suitesparse/scircuit/scircuit.mtx +ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); +B1_dimension: 170998, B2_dimension: 170998, vals: 958936 +C1_dimension: 170998, C2_dimension: 64, vals: 10943872 +D1_dimension: 170998, D2_dimension: 64, vals: 10943872 +E1_dimension: 170998, E2_dimension: 64, vals: 10943872 +G1_dimension: 170998, G2_dimension: 64, vals: 4096 + + +kernel execution time: 279.308 ms +fused time: 280.422 + +kernel execution time: 41.2598 ms +sddmm time: 41.7727 + +kernel execution time: 40.3132 ms 
+sddmm ryan time: 40.882 + +kernel execution time: 72.4795 ms +spmm ryan time: 73.6321 + +kernel execution time: 473.298 ms +gemm time: 474.582 + +kernel execution time: 98095.7 ms +taco reference time: 98098.4 +/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx +ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); +B1_dimension: 206500, B2_dimension: 206500, vals: 1273389 +C1_dimension: 206500, C2_dimension: 64, vals: 13216000 +D1_dimension: 206500, D2_dimension: 64, vals: 13216000 +E1_dimension: 206500, E2_dimension: 64, vals: 13216000 +G1_dimension: 206500, G2_dimension: 64, vals: 4096 + + +kernel execution time: 321.827 ms +fused time: 322.725 + +kernel execution time: 43.7794 ms +sddmm time: 44.8964 + +kernel execution time: 42.531 ms +sddmm ryan time: 43.7502 + +kernel execution time: 83.5305 ms +spmm ryan time: 84.0178 + +kernel execution time: 567.368 ms +gemm time: 567.876 + +kernel execution time: 130204 ms +taco reference time: 130207 +/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx +ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); +B1_dimension: 1000005, B2_dimension: 1000005, vals: 3105536 +C1_dimension: 1000005, C2_dimension: 64, vals: 64000320 +D1_dimension: 1000005, D2_dimension: 64, vals: 64000320 +E1_dimension: 1000005, E2_dimension: 64, vals: 64000320 +G1_dimension: 1000005, G2_dimension: 64, vals: 4096 + + +kernel execution time: 1355.72 ms +fused time: 1357.14 + +kernel execution time: 98.94 ms +sddmm time: 101.488 + +kernel execution time: 97.8972 ms +sddmm ryan time: 98.4423 + +kernel execution time: 218.188 ms +spmm ryan time: 219.39 + +kernel execution time: 2744.38 ms +gemm time: 2744.89 + +kernel execution time: 320035 ms +taco reference time: 320037 +/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx +ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); +B1_dimension: 5558326, B2_dimension: 5558326, vals: 59524291 +C1_dimension: 5558326, C2_dimension: 64, vals: 355732864 
+D1_dimension: 5558326, D2_dimension: 64, vals: 355732864 +E1_dimension: 5558326, E2_dimension: 64, vals: 355732864 +G1_dimension: 5558326, G2_dimension: 64, vals: 4096 + + +kernel execution time: 9682.48 ms +fused time: 9684.45 + +kernel execution time: 1640.01 ms +sddmm time: 1641.3 + +kernel execution time: 1626.66 ms +sddmm ryan time: 1628.12 + +kernel execution time: 2908.47 ms +spmm ryan time: 2908.94 + +kernel execution time: 15252.4 ms +gemm time: 15253.4 + +kernel execution time: 6.11703e+06 ms +taco reference time: 6.11703e+06 + +sddmm-spmm-gemm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/rma10/rma10.mtx +ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); +B1_dimension: 46835, B2_dimension: 46835, vals: 2374001 +C1_dimension: 46835, C2_dimension: 64, vals: 2997440 +D1_dimension: 46835, D2_dimension: 64, vals: 2997440 +E1_dimension: 46835, E2_dimension: 64, vals: 2997440 +G1_dimension: 46835, G2_dimension: 64, vals: 4096 + + +kernel execution time: 7.90719 ms +fused time: 12.4475 + +kernel execution time: 15.0235 ms +sddmm time: 18.4078 + +kernel execution time: 3.60187 ms +sddmm ryan time: 7.64096 + +kernel execution time: 4.26585 ms +spmm ryan time: 7.23736 + +kernel execution time: 5.51232 ms +gemm time: 8.94274 + +kernel execution time: 5900.92 ms +taco reference time: 5901.77 +/home/min/a/kadhitha/ispc-examples/data/suitesparse/cant/cant.mtx +ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); +B1_dimension: 62451, B2_dimension: 62451, vals: 4007383 +C1_dimension: 62451, C2_dimension: 64, vals: 3996864 +D1_dimension: 62451, D2_dimension: 64, vals: 3996864 +E1_dimension: 62451, E2_dimension: 64, vals: 3996864 +G1_dimension: 62451, G2_dimension: 64, vals: 4096 + + +kernel execution time: 11.9944 ms +fused time: 15.5065 + +kernel execution time: 17.5788 ms +sddmm time: 18.2088 + +kernel execution time: 6.90362 ms +sddmm ryan time: 9.18146 + +kernel execution time: 6.52502 ms +spmm ryan time: 7.08577 + 
+kernel execution time: 5.70869 ms +gemm time: 6.23327 + +kernel execution time: 9752.35 ms +taco reference time: 9753.37 +/home/min/a/kadhitha/ispc-examples/data/suitesparse/consph/consph.mtx +ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); +B1_dimension: 83334, B2_dimension: 83334, vals: 6010480 +C1_dimension: 83334, C2_dimension: 64, vals: 5333376 +D1_dimension: 83334, D2_dimension: 64, vals: 5333376 +E1_dimension: 83334, E2_dimension: 64, vals: 5333376 +G1_dimension: 83334, G2_dimension: 64, vals: 4096 + + +kernel execution time: 16.1703 ms +fused time: 19.9224 + +kernel execution time: 26.3346 ms +sddmm time: 30.1538 + +kernel execution time: 9.47197 ms +sddmm ryan time: 12.7137 + +kernel execution time: 9.14926 ms +spmm ryan time: 9.78178 + +kernel execution time: 8.06171 ms +gemm time: 8.592 + +kernel execution time: 14612.6 ms +taco reference time: 14617.7 +/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx +ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); +B1_dimension: 121192, B2_dimension: 121192, vals: 2624331 +C1_dimension: 121192, C2_dimension: 64, vals: 7756288 +D1_dimension: 121192, D2_dimension: 64, vals: 7756288 +E1_dimension: 121192, E2_dimension: 64, vals: 7756288 +G1_dimension: 121192, G2_dimension: 64, vals: 4096 + + +kernel execution time: 28.2581 ms +fused time: 32.7167 + +kernel execution time: 30.162 ms +sddmm time: 33.8587 + +kernel execution time: 11.0142 ms +sddmm ryan time: 15.2742 + +kernel execution time: 12.1744 ms +spmm ryan time: 15.0065 + +kernel execution time: 11.4579 ms +gemm time: 14.5527 + +kernel execution time: 6379.22 ms +taco reference time: 6380.3 +/home/min/a/kadhitha/ispc-examples/data/suitesparse/shipsec1/shipsec1.mtx +ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); +B1_dimension: 140874, B2_dimension: 140874, vals: 7813404 +C1_dimension: 140874, C2_dimension: 64, vals: 9015936 +D1_dimension: 140874, D2_dimension: 64, vals: 9015936 +E1_dimension: 140874, E2_dimension: 64, vals: 9015936 +G1_dimension: 
140874, G2_dimension: 64, vals: 4096 + + +kernel execution time: 24.3937 ms +fused time: 28.6422 + +kernel execution time: 37.2457 ms +sddmm time: 41.311 + +kernel execution time: 13.8503 ms +sddmm ryan time: 17.9583 + +kernel execution time: 14.2713 ms +spmm ryan time: 17.1402 + +kernel execution time: 13.6024 ms +gemm time: 16.6078 + +kernel execution time: 18993.5 ms +taco reference time: 18994.5 +/home/min/a/kadhitha/ispc-examples/data/suitesparse/scircuit/scircuit.mtx +ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); +B1_dimension: 170998, B2_dimension: 170998, vals: 958936 +C1_dimension: 170998, C2_dimension: 64, vals: 10943872 +D1_dimension: 170998, D2_dimension: 64, vals: 10943872 +E1_dimension: 170998, E2_dimension: 64, vals: 10943872 +G1_dimension: 170998, G2_dimension: 64, vals: 4096 + + +kernel execution time: 18.4645 ms +fused time: 22.0711 + +kernel execution time: 31.6844 ms +sddmm time: 34.9774 + +kernel execution time: 7.19931 ms +sddmm ryan time: 11.584 + +kernel execution time: 9.40139 ms +spmm ryan time: 10.002 + +kernel execution time: 16.3933 ms +gemm time: 19.0699 + +kernel execution time: 2325.51 ms +taco reference time: 2326.19 +/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx +ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); +B1_dimension: 206500, B2_dimension: 206500, vals: 1273389 +C1_dimension: 206500, C2_dimension: 64, vals: 13216000 +D1_dimension: 206500, D2_dimension: 64, vals: 13216000 +E1_dimension: 206500, E2_dimension: 64, vals: 13216000 +G1_dimension: 206500, G2_dimension: 64, vals: 4096 + + +kernel execution time: 25.9398 ms +fused time: 30.7713 + +kernel execution time: 43.1619 ms +sddmm time: 47.1566 + +kernel execution time: 9.47076 ms +sddmm ryan time: 12.9736 + +kernel execution time: 12.1315 ms +spmm ryan time: 12.7125 + +kernel execution time: 19.8795 ms +gemm time: 23.9233 + +kernel execution time: 3085.34 ms +taco reference time: 3087.4 
+/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx +ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); +B1_dimension: 1000005, B2_dimension: 1000005, vals: 3105536 +C1_dimension: 1000005, C2_dimension: 64, vals: 64000320 +D1_dimension: 1000005, D2_dimension: 64, vals: 64000320 +E1_dimension: 1000005, E2_dimension: 64, vals: 64000320 +G1_dimension: 1000005, G2_dimension: 64, vals: 4096 + + +kernel execution time: 68.9391 ms +fused time: 73.2143 + +kernel execution time: 68.0597 ms +sddmm time: 71.8136 + +kernel execution time: 23.658 ms +sddmm ryan time: 27.2015 + +kernel execution time: 42.2166 ms +spmm ryan time: 45.3816 + +kernel execution time: 91.7085 ms +gemm time: 94.965 + +kernel execution time: 7504.53 ms +taco reference time: 7510.21 +/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx +ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); +B1_dimension: 5558326, B2_dimension: 5558326, vals: 59524291 +C1_dimension: 5558326, C2_dimension: 64, vals: 355732864 +D1_dimension: 5558326, D2_dimension: 64, vals: 355732864 +E1_dimension: 5558326, E2_dimension: 64, vals: 355732864 +G1_dimension: 5558326, G2_dimension: 64, vals: 4096 + + +kernel execution time: 685.25 ms +fused time: 691.004 + +kernel execution time: 978.107 ms +sddmm time: 982.105 + +kernel execution time: 314.889 ms +sddmm ryan time: 319.437 + +kernel execution time: 451.321 ms +spmm ryan time: 454.339 + +kernel execution time: 511.771 ms +gemm time: 516.049 + +kernel execution time: 324954 ms +taco reference time: 324960 + +sddmm-spmm-gemm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx +ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); +B1_dimension: 10974, B2_dimension: 10974, vals: 428650 +C1_dimension: 10974, C2_dimension: 64, vals: 702336 +D1_dimension: 10974, D2_dimension: 64, vals: 702336 +E1_dimension: 10974, E2_dimension: 64, vals: 702336 +G1_dimension: 10974, G2_dimension: 64, vals: 
4096 + + +kernel execution time: 2.03017 ms +fused time: 6.89988 + +kernel execution time: 4.23176 ms +sddmm time: 4.56628 + +kernel execution time: 1.07066 ms +sddmm ryan time: 1.60331 + +kernel execution time: 1.04047 ms +spmm ryan time: 1.84411 + +kernel execution time: 1.58419 ms +gemm time: 3.49011 + +kernel execution time: 1168.5 ms +taco reference time: 1172.82 +/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx +ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); +B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 +C1_dimension: 36417, C2_dimension: 64, vals: 2330688 +D1_dimension: 36417, D2_dimension: 64, vals: 2330688 +E1_dimension: 36417, E2_dimension: 64, vals: 2330688 +G1_dimension: 36417, G2_dimension: 64, vals: 4096 + + +kernel execution time: 8.02954 ms +fused time: 12.4005 + +kernel execution time: 12.7753 ms +sddmm time: 15.6047 + +kernel execution time: 4.73627 ms +sddmm ryan time: 8.24994 + +kernel execution time: 4.90489 ms +spmm ryan time: 5.40766 + +kernel execution time: 2.99487 ms +gemm time: 3.53289 + +kernel execution time: 10658.1 ms +taco reference time: 10661.2 +/home/min/a/kadhitha/ispc-examples/data/suitesparse/rma10/rma10.mtx +ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); +B1_dimension: 46835, B2_dimension: 46835, vals: 2374001 +C1_dimension: 46835, C2_dimension: 64, vals: 2997440 +D1_dimension: 46835, D2_dimension: 64, vals: 2997440 +E1_dimension: 46835, E2_dimension: 64, vals: 2997440 +G1_dimension: 46835, G2_dimension: 64, vals: 4096 + + +kernel execution time: 7.15818 ms +fused time: 11.6143 + +kernel execution time: 15.0391 ms +sddmm time: 18.5456 + +kernel execution time: 3.33442 ms +sddmm ryan time: 6.94621 + +kernel execution time: 4.13895 ms +spmm ryan time: 7.49526 + +kernel execution time: 3.79939 ms +gemm time: 4.19085 + +kernel execution time: 5801.87 ms +taco reference time: 5803.1 +/home/min/a/kadhitha/ispc-examples/data/suitesparse/cant/cant.mtx +ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); 
+B1_dimension: 62451, B2_dimension: 62451, vals: 4007383 +C1_dimension: 62451, C2_dimension: 64, vals: 3996864 +D1_dimension: 62451, D2_dimension: 64, vals: 3996864 +E1_dimension: 62451, E2_dimension: 64, vals: 3996864 +G1_dimension: 62451, G2_dimension: 64, vals: 4096 + + +kernel execution time: 12.0771 ms +fused time: 16.6939 + +kernel execution time: 17.5697 ms +sddmm time: 18.7919 + +kernel execution time: 6.94731 ms +sddmm ryan time: 11.0254 + +kernel execution time: 7.03752 ms +spmm ryan time: 8.55729 + +kernel execution time: 5.18056 ms +gemm time: 8.22984 + +kernel execution time: 9735.41 ms +taco reference time: 9737.5 +/home/min/a/kadhitha/ispc-examples/data/suitesparse/consph/consph.mtx +ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); +B1_dimension: 83334, B2_dimension: 83334, vals: 6010480 +C1_dimension: 83334, C2_dimension: 64, vals: 5333376 +D1_dimension: 83334, D2_dimension: 64, vals: 5333376 +E1_dimension: 83334, E2_dimension: 64, vals: 5333376 +G1_dimension: 83334, G2_dimension: 64, vals: 4096 + + +kernel execution time: 16.2173 ms +fused time: 20.4628 + +kernel execution time: 26.5883 ms +sddmm time: 30.2732 + +kernel execution time: 9.67928 ms +sddmm ryan time: 13.4002 + +kernel execution time: 9.46597 ms +spmm ryan time: 12.3215 + +kernel execution time: 6.14851 ms +gemm time: 6.79689 + +kernel execution time: 14647.4 ms +taco reference time: 14648.9 +/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx +ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); +B1_dimension: 121192, B2_dimension: 121192, vals: 2624331 +C1_dimension: 121192, C2_dimension: 64, vals: 7756288 +D1_dimension: 121192, D2_dimension: 64, vals: 7756288 +E1_dimension: 121192, E2_dimension: 64, vals: 7756288 +G1_dimension: 121192, G2_dimension: 64, vals: 4096 + + +kernel execution time: 28.0895 ms +fused time: 33.0632 + +kernel execution time: 29.4447 ms +sddmm time: 33.2669 + +kernel execution time: 10.992 ms +sddmm ryan time: 15.1462 + +kernel execution time: 
12.2197 ms +spmm ryan time: 14.8823 + +kernel execution time: 9.1576 ms +gemm time: 12.476 + +kernel execution time: 6388.6 ms +taco reference time: 6389.71 +/home/min/a/kadhitha/ispc-examples/data/suitesparse/shipsec1/shipsec1.mtx +ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); +B1_dimension: 140874, B2_dimension: 140874, vals: 7813404 +C1_dimension: 140874, C2_dimension: 64, vals: 9015936 +D1_dimension: 140874, D2_dimension: 64, vals: 9015936 +E1_dimension: 140874, E2_dimension: 64, vals: 9015936 +G1_dimension: 140874, G2_dimension: 64, vals: 4096 + + +kernel execution time: 24.4023 ms +fused time: 28.7813 + +kernel execution time: 37.3163 ms +sddmm time: 41.2616 + +kernel execution time: 13.8084 ms +sddmm ryan time: 17.1208 + +kernel execution time: 14.1626 ms +spmm ryan time: 17.3487 + +kernel execution time: 10.2461 ms +gemm time: 10.8026 + +kernel execution time: 19008 ms +taco reference time: 19013 +/home/min/a/kadhitha/ispc-examples/data/suitesparse/scircuit/scircuit.mtx +ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); +B1_dimension: 170998, B2_dimension: 170998, vals: 958936 +C1_dimension: 170998, C2_dimension: 64, vals: 10943872 +D1_dimension: 170998, D2_dimension: 64, vals: 10943872 +E1_dimension: 170998, E2_dimension: 64, vals: 10943872 +G1_dimension: 170998, G2_dimension: 64, vals: 4096 + + +kernel execution time: 18.5328 ms +fused time: 21.8578 + +kernel execution time: 29.8727 ms +sddmm time: 32.6967 + +kernel execution time: 7.1244 ms +sddmm ryan time: 10.2857 + +kernel execution time: 8.9243 ms +spmm ryan time: 9.54503 + +kernel execution time: 12.6159 ms +gemm time: 13.2038 + +kernel execution time: 2326 ms +taco reference time: 2326.66 +/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx +ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); +B1_dimension: 206500, B2_dimension: 206500, vals: 1273389 +C1_dimension: 206500, C2_dimension: 64, vals: 13216000 +D1_dimension: 206500, D2_dimension: 64, vals: 13216000 +E1_dimension: 
206500, E2_dimension: 64, vals: 13216000 +G1_dimension: 206500, G2_dimension: 64, vals: 4096 + + +kernel execution time: 25.7525 ms +fused time: 27.0427 + +kernel execution time: 40.701 ms +sddmm time: 44.8629 + +kernel execution time: 9.61808 ms +sddmm ryan time: 13.4076 + +kernel execution time: 12.4322 ms +spmm ryan time: 15.2811 + +kernel execution time: 15.1033 ms +gemm time: 17.9102 + +kernel execution time: 3091.33 ms +taco reference time: 3092.53 +/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx +ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); +B1_dimension: 1000005, B2_dimension: 1000005, vals: 3105536 +C1_dimension: 1000005, C2_dimension: 64, vals: 64000320 +D1_dimension: 1000005, D2_dimension: 64, vals: 64000320 +E1_dimension: 1000005, E2_dimension: 64, vals: 64000320 +G1_dimension: 1000005, G2_dimension: 64, vals: 4096 + + +kernel execution time: 68.4469 ms +fused time: 72.7982 + +kernel execution time: 52.1276 ms +sddmm time: 56.0577 + +kernel execution time: 23.4796 ms +sddmm ryan time: 27.0851 + +kernel execution time: 42.2008 ms +spmm ryan time: 45.2618 + +kernel execution time: 74.1167 ms +gemm time: 78.5888 + +kernel execution time: 7502.71 ms +taco reference time: 7508.45 +/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx +ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); +B1_dimension: 5558326, B2_dimension: 5558326, vals: 59524291 +C1_dimension: 5558326, C2_dimension: 64, vals: 355732864 +D1_dimension: 5558326, D2_dimension: 64, vals: 355732864 +E1_dimension: 5558326, E2_dimension: 64, vals: 355732864 +G1_dimension: 5558326, G2_dimension: 64, vals: 4096 + + +kernel execution time: 684.483 ms +fused time: 689.124 + +kernel execution time: 889.925 ms +sddmm time: 894.03 + +kernel execution time: 315.322 ms +sddmm ryan time: 319.629 + +kernel execution time: 449.91 ms +spmm ryan time: 453.686 + +kernel execution time: 417.449 ms +gemm time: 421.26 + +kernel execution time: 326305 ms +taco reference time: 
326311 + +sddmm-spmm-gemm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx +ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); +B1_dimension: 2708, B2_dimension: 2708, vals: 5429 +C1_dimension: 2708, C2_dimension: 64, vals: 173312 +D1_dimension: 2708, D2_dimension: 64, vals: 173312 +E1_dimension: 2708, E2_dimension: 64, vals: 173312 +G1_dimension: 2708, G2_dimension: 64, vals: 4096 + + +kernel execution time: 5.08607 ms +fused time: 5.61989 + +kernel execution time: 0.557608 ms +sddmm time: 0.871642 + +kernel execution time: 0.465526 ms +sddmm ryan time: 0.7713 + +kernel execution time: 0.498686 ms +spmm ryan time: 0.739309 + +kernel execution time: 0.7957 ms +gemm time: 1.05919 + +kernel execution time: 42.447 ms +taco reference time: 42.885 +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/amazon.mtx +ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); +B1_dimension: 548551, B2_dimension: 548551, vals: 1851744 +C1_dimension: 548551, C2_dimension: 64, vals: 35107264 +D1_dimension: 548551, D2_dimension: 64, vals: 35107264 +E1_dimension: 548551, E2_dimension: 64, vals: 35107264 +G1_dimension: 548551, G2_dimension: 64, vals: 4096 + + +kernel execution time: 89.9099 ms +fused time: 90.5117 + +kernel execution time: 29.9086 ms +sddmm time: 30.4936 + +kernel execution time: 29.1529 ms +sddmm ryan time: 29.7063 + +kernel execution time: 34.6318 ms +spmm ryan time: 35.1535 + +kernel execution time: 66.4663 ms +gemm time: 67.0316 + +kernel execution time: 6272.25 ms +taco reference time: 6273.42 + +sddmm-spmm-gemm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx +ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); +B1_dimension: 2708, B2_dimension: 2708, vals: 5429 +C1_dimension: 2708, C2_dimension: 64, vals: 173312 +D1_dimension: 2708, D2_dimension: 64, vals: 173312 +E1_dimension: 2708, E2_dimension: 64, vals: 173312 +G1_dimension: 
2708, G2_dimension: 64, vals: 4096 + + +kernel execution time: 3.72391 ms +fused time: 4.19698 + +kernel execution time: 0.585647 ms +sddmm time: 0.893112 + +kernel execution time: 0.483056 ms +sddmm ryan time: 0.79108 + +kernel execution time: 0.567518 ms +spmm ryan time: 0.808711 + +kernel execution time: 0.929183 ms +gemm time: 1.32543 + +kernel execution time: 35.7066 ms +taco reference time: 36.3331 +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/amazon.mtx +ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); +B1_dimension: 548551, B2_dimension: 548551, vals: 1851744 +C1_dimension: 548551, C2_dimension: 64, vals: 35107264 +D1_dimension: 548551, D2_dimension: 64, vals: 35107264 +E1_dimension: 548551, E2_dimension: 64, vals: 35107264 +G1_dimension: 548551, G2_dimension: 64, vals: 4096 + + +kernel execution time: 94.9377 ms +fused time: 95.7687 + +kernel execution time: 32.2051 ms +sddmm time: 32.7881 + +kernel execution time: 30.3982 ms +sddmm ryan time: 30.95 + +kernel execution time: 34.4172 ms +spmm ryan time: 34.9049 + +kernel execution time: 67.2709 ms +gemm time: 67.8035 + +kernel execution time: 6215.08 ms +taco reference time: 6216.26 + +sddmm-spmm-gemm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx +ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); +B1_dimension: 2708, B2_dimension: 2708, vals: 5429 +C1_dimension: 2708, C2_dimension: 64, vals: 173312 +D1_dimension: 2708, D2_dimension: 64, vals: 173312 +E1_dimension: 2708, E2_dimension: 64, vals: 173312 +G1_dimension: 2708, G2_dimension: 64, vals: 4096 + + +kernel execution time: 6.99173 ms +fused time: 7.86448 + +kernel execution time: 0.78061 ms +sddmm time: 1.28867 + +kernel execution time: 0.554227 ms +sddmm ryan time: 0.837111 + +kernel execution time: 0.909912 ms +spmm ryan time: 1.12908 + +kernel execution time: 7.60724 ms +gemm time: 7.85047 + +kernel execution time: 652.888 ms +taco reference time: 653.271 
+/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/amazon.mtx +ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); +B1_dimension: 548551, B2_dimension: 548551, vals: 1851744 +C1_dimension: 548551, C2_dimension: 64, vals: 35107264 +D1_dimension: 548551, D2_dimension: 64, vals: 35107264 +E1_dimension: 548551, E2_dimension: 64, vals: 35107264 +G1_dimension: 548551, G2_dimension: 64, vals: 4096 + + +kernel execution time: 1236.33 ms +fused time: 1236.87 + +kernel execution time: 249.805 ms +sddmm time: 250.356 + +kernel execution time: 247.195 ms +sddmm ryan time: 247.729 + +kernel execution time: 285.764 ms +spmm ryan time: 286.235 + +kernel execution time: 1529.34 ms +gemm time: 1529.83 + +kernel execution time: 190620 ms +taco reference time: 190621 + +sddmm-spmm-gemm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx +ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); +B1_dimension: 2708, B2_dimension: 2708, vals: 5429 +C1_dimension: 2708, C2_dimension: 64, vals: 173312 +D1_dimension: 2708, D2_dimension: 64, vals: 173312 +E1_dimension: 2708, E2_dimension: 64, vals: 173312 +G1_dimension: 2708, G2_dimension: 64, vals: 4096 + + +kernel execution time: 1.86163 ms +fused time: 2.34746 + +kernel execution time: 0.542927 ms +sddmm time: 1.05528 + +kernel execution time: 0.541998 ms +sddmm ryan time: 1.07672 + +kernel execution time: 0.524767 ms +spmm ryan time: 0.944293 + +kernel execution time: 0.75947 ms +gemm time: 1.2162 + +kernel execution time: 36.3755 ms +taco reference time: 37.0989 + +sddmm-spmm-gemm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx +ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); +B1_dimension: 2708, B2_dimension: 2708, vals: 5429 +C1_dimension: 2708, C2_dimension: 64, vals: 173312 +D1_dimension: 2708, D2_dimension: 64, vals: 173312 +E1_dimension: 2708, E2_dimension: 64, vals: 173312 +G1_dimension: 2708, 
G2_dimension: 64, vals: 4096 + + +kernel execution time: 1.97375 ms +fused time: 2.84436 + +kernel execution time: 0.881212 ms +sddmm time: 1.38907 + +kernel execution time: 0.545557 ms +sddmm ryan time: 1.0807 + +kernel execution time: 0.548488 ms +spmm ryan time: 0.978813 + +kernel execution time: 0.72955 ms +gemm time: 1.2023 + +kernel execution time: 34.867 ms +taco reference time: 35.5819 + +sddmm-spmm-gemm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx +ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); +B1_dimension: 2708, B2_dimension: 2708, vals: 5429 +C1_dimension: 2708, C2_dimension: 64, vals: 173312 +D1_dimension: 2708, D2_dimension: 64, vals: 173312 +E1_dimension: 2708, E2_dimension: 64, vals: 173312 +G1_dimension: 2708, G2_dimension: 64, vals: 4096 + + +kernel execution time: 1.69165 ms +fused time: 2.2114 + +kernel execution time: 0.908102 ms +sddmm time: 1.19792 + +kernel execution time: 0.513137 ms +sddmm ryan time: 0.807571 + +kernel execution time: 0.510327 ms +spmm ryan time: 0.76134 + +kernel execution time: 0.803101 ms +gemm time: 1.0684 + +kernel execution time: 45.9784 ms +taco reference time: 46.3901 + +sddmm-spmm-gemm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx +ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); +B1_dimension: 2708, B2_dimension: 2708, vals: 5429 +C1_dimension: 2708, C2_dimension: 64, vals: 173312 +D1_dimension: 2708, D2_dimension: 64, vals: 173312 +E1_dimension: 2708, E2_dimension: 64, vals: 173312 +G1_dimension: 2708, G2_dimension: 64, vals: 4096 + + +kernel execution time: 1.82354 ms +fused time: 2.81223 + +kernel execution time: 0.926052 ms +sddmm time: 1.48292 + +kernel execution time: 0.564157 ms +sddmm ryan time: 1.14611 + +kernel execution time: 0.512447 ms +spmm ryan time: 0.925102 + +kernel execution time: 0.689109 ms +gemm time: 1.08196 + +kernel execution time: 34.7847 
ms +taco reference time: 35.4182 + +sddmm-spmm-gemm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx +ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); +B1_dimension: 2708, B2_dimension: 2708, vals: 5429 +C1_dimension: 2708, C2_dimension: 64, vals: 173312 +D1_dimension: 2708, D2_dimension: 64, vals: 173312 +E1_dimension: 2708, E2_dimension: 64, vals: 173312 +G1_dimension: 2708, G2_dimension: 64, vals: 4096 + + +kernel execution time: 6.8174 ms +fused time: 7.69061 + +kernel execution time: 0.935843 ms +sddmm time: 1.46847 + +kernel execution time: 0.612468 ms +sddmm ryan time: 0.880662 + +kernel execution time: 0.831351 ms +spmm ryan time: 1.05745 + +kernel execution time: 7.58342 ms +gemm time: 7.82297 + +kernel execution time: 566.881 ms +taco reference time: 567.264 diff --git a/test/stats/sddmm-spmm.txt b/test/stats/sddmm-spmm.txt new file mode 100644 index 000000000..df8d924b8 --- /dev/null +++ b/test/stats/sddmm-spmm.txt @@ -0,0 +1,5995 @@ + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/synthetic/synthetic.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 102, B2_dimension: 103, vals: 3149 +C1_dimension: 102, C2_dimension: 64, vals: 6528 +D1_dimension: 103, D2_dimension: 64, vals: 6592 +E1_dimension: 103, E2_dimension: 48, vals: 4944 + + +kernel execution time: 6223.98 ms +fused time: 6225.14 + +kernel execution time: 3659.4 ms +sddmm time: 3660.83 + +kernel execution time: 3145.85 ms +spmm time: 3146.77 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 10974, B2_dimension: 10974, vals: 428650 +C1_dimension: 10974, C2_dimension: 64, vals: 702336 +D1_dimension: 10974, D2_dimension: 64, vals: 702336 +E1_dimension: 10974, E2_dimension: 64, vals: 702336 + + +kernel 
execution time: 17.1703 ms +fused time: 17.6378 + +kernel execution time: 8.23135 ms +sddmm time: 8.77073 + +kernel execution time: 19.3034 ms +spmm time: 19.7426 + +kernel execution time: 514.133 ms +taco reference time: 514.662 + +mtx dim1 dim2 nnz fused sddmm spmm taco-original +bcsstk17 10974 10974 428650 17.1703 8.23135 19.3034 514.662 + + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 +C1_dimension: 36417, C2_dimension: 64, vals: 2330688 +D1_dimension: 36417, D2_dimension: 64, vals: 2330688 +E1_dimension: 36417, E2_dimension: 64, vals: 2330688 + + +kernel execution time: 163.616 ms +fused time: 164.099 + +kernel execution time: 81.2672 ms +sddmm time: 81.8014 + +kernel execution time: 294.454 ms +spmm time: 294.968 + +kernel execution time: 5149.58 ms +taco reference time: 5150.58 + +mtx dim1 dim2 nnz fused sddmm spmm taco-original +bcsstk17 10974 10974 428650 17.1703 8.23135 19.3034 514.662 +pdb1HYS 36417 36417 4344765 163.616 81.2672 294.454 5149.58 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/rma10/rma10.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 46835, B2_dimension: 46835, vals: 2374001 +C1_dimension: 46835, C2_dimension: 64, vals: 2997440 +D1_dimension: 46835, D2_dimension: 64, vals: 2997440 +E1_dimension: 46835, E2_dimension: 64, vals: 2997440 + + +kernel execution time: 92.8319 ms +fused time: 93.3139 + +kernel execution time: 45.3221 ms +sddmm time: 45.8599 + +kernel execution time: 136.693 ms +spmm time: 137.198 + +kernel execution time: 2824.95 ms +taco reference time: 2825.53 + +mtx dim1 dim2 nnz fused sddmm spmm taco-original +bcsstk17 10974 10974 428650 17.1703 8.23135 19.3034 514.662 +pdb1HYS 36417 36417 4344765 163.616 81.2672 294.454 5149.58 +rma10 46835 46835 2374001 
92.8319 45.3221 136.693 2824.95 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/cant/cant.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 62451, B2_dimension: 62451, vals: 4007383 +C1_dimension: 62451, C2_dimension: 64, vals: 3996864 +D1_dimension: 62451, D2_dimension: 64, vals: 3996864 +E1_dimension: 62451, E2_dimension: 64, vals: 3996864 + + +kernel execution time: 153.867 ms +fused time: 154.368 + +kernel execution time: 74.9071 ms +sddmm time: 75.4719 + +kernel execution time: 258.678 ms +spmm time: 259.209 + +kernel execution time: 4786.95 ms +taco reference time: 4788.05 + +mtx dim1 dim2 nnz fused sddmm spmm taco-original +bcsstk17 10974 10974 428650 17.1703 8.23135 19.3034 514.662 +pdb1HYS 36417 36417 4344765 163.616 81.2672 294.454 5149.58 +rma10 46835 46835 2374001 92.8319 45.3221 136.693 2824.95 +cant 62451 62451 4007383 153.867 74.9071 258.678 4786.95 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/consph/consph.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 83334, B2_dimension: 83334, vals: 6010480 +C1_dimension: 83334, C2_dimension: 64, vals: 5333376 +D1_dimension: 83334, D2_dimension: 64, vals: 5333376 +E1_dimension: 83334, E2_dimension: 64, vals: 5333376 + + +kernel execution time: 231.253 ms +fused time: 231.75 + +kernel execution time: 112.863 ms +sddmm time: 113.405 + +kernel execution time: 417.749 ms +spmm time: 418.285 + +kernel execution time: 7133.75 ms +taco reference time: 7134.88 + +consph 83334 83334 6010480 231.253 112.863 417.749 7133.75 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 121192, B2_dimension: 121192, vals: 2624331 +C1_dimension: 121192, C2_dimension: 64, vals: 7756288 +D1_dimension: 121192, D2_dimension: 64, 
vals: 7756288 +E1_dimension: 121192, E2_dimension: 64, vals: 7756288 + + +kernel execution time: 192.743 ms +fused time: 193.23 + +kernel execution time: 85.0563 ms +sddmm time: 85.6227 + +kernel execution time: 150.367 ms +spmm time: 150.908 + +kernel execution time: 3285.24 ms +taco reference time: 3286.37 + +cop20k_A 121192 121192 2624331 192.743 85.0563 150.367 3285.24 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/shipsec1/shipsec1.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 140874, B2_dimension: 140874, vals: 7813404 +C1_dimension: 140874, C2_dimension: 64, vals: 9015936 +D1_dimension: 140874, D2_dimension: 64, vals: 9015936 +E1_dimension: 140874, E2_dimension: 64, vals: 9015936 + + +kernel execution time: 307.481 ms +fused time: 307.98 + +kernel execution time: 150.621 ms +sddmm time: 151.15 + +kernel execution time: 451.195 ms +spmm time: 451.689 + +kernel execution time: 9393.95 ms +taco reference time: 9395.02 + +shipsec1 140874 140874 7813404 307.481 150.621 451.195 9393.95 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/scircuit/scircuit.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 170998, B2_dimension: 170998, vals: 958936 +C1_dimension: 170998, C2_dimension: 64, vals: 10943872 +D1_dimension: 170998, D2_dimension: 64, vals: 10943872 +E1_dimension: 170998, E2_dimension: 64, vals: 10943872 + + +kernel execution time: 85.4659 ms +fused time: 85.9614 + +kernel execution time: 34.7139 ms +sddmm time: 35.2946 + +kernel execution time: 71.0646 ms +spmm time: 71.6139 + +kernel execution time: 1234.06 ms +taco reference time: 1234.68 + +scircuit 170998 170998 958936 85.4659 34.7139 71.0646 1234.06 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx 
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 206500, B2_dimension: 206500, vals: 1273389 +C1_dimension: 206500, C2_dimension: 64, vals: 13216000 +D1_dimension: 206500, D2_dimension: 64, vals: 13216000 +E1_dimension: 206500, E2_dimension: 64, vals: 13216000 + + +kernel execution time: 88.3959 ms +fused time: 88.8687 + +kernel execution time: 36.7565 ms +sddmm time: 37.3021 + +kernel execution time: 80.2217 ms +spmm time: 80.7621 + +kernel execution time: 1588.94 ms +taco reference time: 1589.58 + +mac_econ_fwd500 206500 206500 1273389 88.3959 36.7565 80.2217 1588.94 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 1000005, B2_dimension: 1000005, vals: 3105536 +C1_dimension: 1000005, C2_dimension: 64, vals: 64000320 +D1_dimension: 1000005, D2_dimension: 64, vals: 64000320 +E1_dimension: 1000005, E2_dimension: 64, vals: 64000320 + + +kernel execution time: 244.992 ms +fused time: 245.482 + +kernel execution time: 86.8711 ms +sddmm time: 87.4084 + +kernel execution time: 245.054 ms +spmm time: 245.552 + +kernel execution time: 3952.47 ms +taco reference time: 3953.57 + +webbase-1M 1000005 1000005 3105536 244.992 86.8711 245.054 3952.47 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 5558326, B2_dimension: 5558326, vals: 59524291 +C1_dimension: 5558326, C2_dimension: 64, vals: 355732864 +D1_dimension: 5558326, D2_dimension: 64, vals: 355732864 +E1_dimension: 5558326, E2_dimension: 64, vals: 355732864 + + +kernel execution time: 3275.48 ms +fused time: 3276.44 + +kernel execution time: 1522.51 ms +sddmm time: 1523.05 + +kernel execution time: 7164.88 ms +spmm time: 7165.87 + +kernel execution time: 84078.7 ms +taco reference time: 84079.8 + +circuit5M 5558326 5558326 
59524291 3275.48 1522.51 7164.88 84078.7 +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 121192, B2_dimension: 121192, vals: 2624331 +C1_dimension: 121192, C2_dimension: 64, vals: 7756288 +D1_dimension: 121192, D2_dimension: 64, vals: 7756288 +E1_dimension: 121192, E2_dimension: 64, vals: 7756288 + + +kernel execution time: 62.8847 ms +fused time: 63.418 + +kernel execution time: 561.815 ms +sddmm time: 562.479 + +kernel execution time: 62.7688 ms +spmm time: 63.4747 + +kernel execution time: 727.65 ms +taco reference time: 728.755 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 121192, B2_dimension: 121192, vals: 2624331 +C1_dimension: 121192, C2_dimension: 64, vals: 7756288 +D1_dimension: 121192, D2_dimension: 64, vals: 7756288 +E1_dimension: 121192, E2_dimension: 64, vals: 7756288 + + +kernel execution time: 1121.74 ms +fused time: 1122.26 + +kernel execution time: 524.494 ms +sddmm time: 525.084 + +kernel execution time: 602.517 ms +spmm time: 603.056 + +kernel execution time: 38095.2 ms +taco reference time: 38096.3 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 121192, B2_dimension: 121192, vals: 2624331 +C1_dimension: 121192, C2_dimension: 64, vals: 7756288 +D1_dimension: 121192, D2_dimension: 64, vals: 7756288 +E1_dimension: 121192, E2_dimension: 64, vals: 7756288 + + +kernel execution time: 1129.96 ms +fused time: 1130.47 + +kernel execution time: 528.571 ms +sddmm time: 529.152 + +kernel execution time: 611.108 ms +spmm time: 611.643 + +kernel execution time: 38230.1 ms +taco reference time: 38231.1 + +sddmm-spmm 
execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 121192, B2_dimension: 121192, vals: 2624331 +C1_dimension: 121192, C2_dimension: 64, vals: 7756288 +D1_dimension: 121192, D2_dimension: 64, vals: 7756288 +E1_dimension: 121192, E2_dimension: 64, vals: 7756288 + + +kernel execution time: 63.6404 ms +fused time: 64.1428 + +kernel execution time: 562.966 ms +sddmm time: 563.609 + +kernel execution time: 62.5981 ms +spmm time: 63.1044 + +kernel execution time: 728.068 ms +taco reference time: 729.005 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 121192, B2_dimension: 121192, vals: 2624331 +C1_dimension: 121192, C2_dimension: 64, vals: 7756288 +D1_dimension: 121192, D2_dimension: 64, vals: 7756288 +E1_dimension: 121192, E2_dimension: 64, vals: 7756288 + + +kernel execution time: 62.7795 ms +fused time: 63.2831 + +kernel execution time: 564.376 ms +sddmm time: 565.025 + +kernel execution time: 62.8883 ms +spmm time: 63.4116 + +kernel execution time: 727.567 ms +taco reference time: 728.511 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 121192, B2_dimension: 121192, vals: 2624331 +C1_dimension: 121192, C2_dimension: 64, vals: 7756288 +D1_dimension: 121192, D2_dimension: 64, vals: 7756288 +E1_dimension: 121192, E2_dimension: 64, vals: 7756288 + + +kernel execution time: 68.4674 ms +fused time: 68.9896 + +kernel execution time: 563.596 ms +sddmm time: 564.267 + +kernel execution time: 62.5779 ms +spmm time: 63.0812 + +kernel execution time: 730.226 ms +taco reference time: 731.124 + +sddmm-spmm execution + 
+----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 121192, B2_dimension: 121192, vals: 2624331 +C1_dimension: 121192, C2_dimension: 64, vals: 7756288 +D1_dimension: 121192, D2_dimension: 64, vals: 7756288 +E1_dimension: 121192, E2_dimension: 64, vals: 7756288 + + +kernel execution time: 56.5639 ms +fused time: 57.0618 + +kernel execution time: 562.554 ms +sddmm time: 563.193 + +kernel execution time: 62.6038 ms +spmm time: 63.1209 + +kernel execution time: 730.018 ms +taco reference time: 730.906 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 121192, B2_dimension: 121192, vals: 2624331 +C1_dimension: 121192, C2_dimension: 64, vals: 7756288 +D1_dimension: 121192, D2_dimension: 64, vals: 7756288 +E1_dimension: 121192, E2_dimension: 64, vals: 7756288 + + +kernel execution time: 66.7636 ms +fused time: 67.2669 + +kernel execution time: 564.075 ms +sddmm time: 564.809 + +kernel execution time: 62.9335 ms +spmm time: 63.4347 + +kernel execution time: 727.588 ms +taco reference time: 728.484 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 121192, B2_dimension: 121192, vals: 2624331 +C1_dimension: 121192, C2_dimension: 64, vals: 7756288 +D1_dimension: 121192, D2_dimension: 64, vals: 7756288 +E1_dimension: 121192, E2_dimension: 64, vals: 7756288 + + +kernel execution time: 55.1612 ms +fused time: 55.6765 + +kernel execution time: 574.602 ms +sddmm time: 575.262 + +kernel execution time: 62.2801 ms +spmm time: 62.7918 + +kernel execution time: 738.027 ms +taco reference time: 738.739 + +sddmm-spmm execution + +----------------------------------------- 
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 121192, B2_dimension: 121192, vals: 2624331 +C1_dimension: 121192, C2_dimension: 64, vals: 7756288 +D1_dimension: 121192, D2_dimension: 64, vals: 7756288 +E1_dimension: 121192, E2_dimension: 64, vals: 7756288 + + +kernel execution time: 864.868 ms +fused time: 865.374 + +kernel execution time: 544.426 ms +sddmm time: 545.045 + +kernel execution time: 377.977 ms +spmm time: 378.522 + +kernel execution time: 19947 ms +taco reference time: 19948.1 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 121192, B2_dimension: 121192, vals: 2624331 +C1_dimension: 121192, C2_dimension: 64, vals: 7756288 +D1_dimension: 121192, D2_dimension: 64, vals: 7756288 +E1_dimension: 121192, E2_dimension: 64, vals: 7756288 + + +kernel execution time: 71.685 ms +fused time: 72.1905 + +kernel execution time: 548.984 ms +sddmm time: 549.581 + +kernel execution time: 51.9969 ms +spmm time: 52.562 + +kernel execution time: 969.838 ms +taco reference time: 970.48 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 121192, B2_dimension: 121192, vals: 2624331 +C1_dimension: 121192, C2_dimension: 64, vals: 7756288 +D1_dimension: 121192, D2_dimension: 64, vals: 7756288 +E1_dimension: 121192, E2_dimension: 64, vals: 7756288 + + +kernel execution time: 56.1268 ms +fused time: 56.6263 + +kernel execution time: 566.523 ms +sddmm time: 567.123 + +kernel execution time: 60.4097 ms +spmm time: 60.9402 + +kernel execution time: 757.174 ms +taco reference time: 757.82 + +sddmm-spmm execution + +----------------------------------------- 
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 121192, B2_dimension: 121192, vals: 2624331 +C1_dimension: 121192, C2_dimension: 64, vals: 7756288 +D1_dimension: 121192, D2_dimension: 64, vals: 7756288 +E1_dimension: 121192, E2_dimension: 64, vals: 7756288 + +24 threads + +kernel execution time: 119.302 ms +fused time: 119.817 + +kernel execution time: 550.24 ms +sddmm time: 550.791 + +kernel execution time: 49.3294 ms +spmm time: 49.8462 + +kernel execution time: 1710.98 ms +taco reference time: 1711.58 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 121192, B2_dimension: 121192, vals: 2624331 +C1_dimension: 121192, C2_dimension: 64, vals: 7756288 +D1_dimension: 121192, D2_dimension: 64, vals: 7756288 +E1_dimension: 121192, E2_dimension: 64, vals: 7756288 + +2 threads + +kernel execution time: 832.831 ms +fused time: 833.337 + +kernel execution time: 543.518 ms +sddmm time: 544.133 + +kernel execution time: 372.721 ms +spmm time: 373.277 + +kernel execution time: 19871.7 ms +taco reference time: 19873 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 121192, B2_dimension: 121192, vals: 2624331 +C1_dimension: 121192, C2_dimension: 64, vals: 7756288 +D1_dimension: 121192, D2_dimension: 64, vals: 7756288 +E1_dimension: 121192, E2_dimension: 64, vals: 7756288 + + +kernel execution time: 1177.5 ms +fused time: 1178 + +kernel execution time: 547.532 ms +sddmm time: 548.083 + +kernel execution time: 618.83 ms +spmm time: 619.38 + +kernel execution time: 39590.7 ms +taco reference time: 39591.8 + + + +---------- 24 threads + + +sddmm-spmm execution + +----------------------------------------- 
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 10974, B2_dimension: 10974, vals: 428650 +C1_dimension: 10974, C2_dimension: 64, vals: 702336 +D1_dimension: 10974, D2_dimension: 64, vals: 702336 +E1_dimension: 10974, E2_dimension: 64, vals: 702336 + + +kernel execution time: 18.2194 ms +fused time: 18.6902 + +kernel execution time: 80.3278 ms +sddmm time: 80.7347 + +kernel execution time: 5.17506 ms +spmm time: 5.64137 + +kernel execution time: 275.571 ms +taco reference time: 275.978 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 +C1_dimension: 36417, C2_dimension: 64, vals: 2330688 +D1_dimension: 36417, D2_dimension: 64, vals: 2330688 +E1_dimension: 36417, E2_dimension: 64, vals: 2330688 + + +kernel execution time: 159.53 ms +fused time: 160.016 + +kernel execution time: 814.453 ms +sddmm time: 814.988 + +kernel execution time: 41.9148 ms +spmm time: 42.4142 + +kernel execution time: 2782.76 ms +taco reference time: 2783.34 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/rma10/rma10.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 46835, B2_dimension: 46835, vals: 2374001 +C1_dimension: 46835, C2_dimension: 64, vals: 2997440 +D1_dimension: 46835, D2_dimension: 64, vals: 2997440 +E1_dimension: 46835, E2_dimension: 64, vals: 2997440 + + +kernel execution time: 80.1703 ms +fused time: 80.65 + +kernel execution time: 442.648 ms +sddmm time: 443.191 + +kernel execution time: 27.375 ms +spmm time: 27.8981 + +kernel execution time: 1518.49 ms +taco reference time: 1519.1 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/cant/cant.mtx 
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 62451, B2_dimension: 62451, vals: 4007383 +C1_dimension: 62451, C2_dimension: 64, vals: 3996864 +D1_dimension: 62451, D2_dimension: 64, vals: 3996864 +E1_dimension: 62451, E2_dimension: 64, vals: 3996864 + + +kernel execution time: 147.378 ms +fused time: 147.862 + +kernel execution time: 746.182 ms +sddmm time: 746.722 + +kernel execution time: 43.521 ms +spmm time: 44.0217 + +kernel execution time: 2560.78 ms +taco reference time: 2561.36 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/consph/consph.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 83334, B2_dimension: 83334, vals: 6010480 +C1_dimension: 83334, C2_dimension: 64, vals: 5333376 +D1_dimension: 83334, D2_dimension: 64, vals: 5333376 +E1_dimension: 83334, E2_dimension: 64, vals: 5333376 + + +kernel execution time: 220.568 ms +fused time: 221.066 + +kernel execution time: 1121.47 ms +sddmm time: 1122.03 + +kernel execution time: 61.8518 ms +spmm time: 62.3779 + +kernel execution time: 3844.87 ms +taco reference time: 3845.8 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 121192, B2_dimension: 121192, vals: 2624331 +C1_dimension: 121192, C2_dimension: 64, vals: 7756288 +D1_dimension: 121192, D2_dimension: 64, vals: 7756288 +E1_dimension: 121192, E2_dimension: 64, vals: 7756288 + + +kernel execution time: 118.211 ms +fused time: 118.715 + +kernel execution time: 552.77 ms +sddmm time: 553.326 + +kernel execution time: 49.2278 ms +spmm time: 49.7369 + +kernel execution time: 1713.01 ms +taco reference time: 1713.63 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/shipsec1/shipsec1.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 140874, 
B2_dimension: 140874, vals: 7813404 +C1_dimension: 140874, C2_dimension: 64, vals: 9015936 +D1_dimension: 140874, D2_dimension: 64, vals: 9015936 +E1_dimension: 140874, E2_dimension: 64, vals: 9015936 + + +kernel execution time: 300.972 ms +fused time: 301.471 + +kernel execution time: 1461.86 ms +sddmm time: 1462.45 + +kernel execution time: 89.5313 ms +spmm time: 90.0418 + +kernel execution time: 5010.7 ms +taco reference time: 5011.67 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/scircuit/scircuit.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 170998, B2_dimension: 170998, vals: 958936 +C1_dimension: 170998, C2_dimension: 64, vals: 10943872 +D1_dimension: 170998, D2_dimension: 64, vals: 10943872 +E1_dimension: 170998, E2_dimension: 64, vals: 10943872 + + +kernel execution time: 52.5196 ms +fused time: 53.0296 + +kernel execution time: 210.075 ms +sddmm time: 210.666 + +kernel execution time: 67.487 ms +spmm time: 68.0293 + +kernel execution time: 632.81 ms +taco reference time: 633.445 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 206500, B2_dimension: 206500, vals: 1273389 +C1_dimension: 206500, C2_dimension: 64, vals: 13216000 +D1_dimension: 206500, D2_dimension: 64, vals: 13216000 +E1_dimension: 206500, E2_dimension: 64, vals: 13216000 + + +kernel execution time: 60.3333 ms +fused time: 60.8277 + +kernel execution time: 261.834 ms +sddmm time: 262.379 + +kernel execution time: 82.326 ms +spmm time: 82.838 + +kernel execution time: 836.401 ms +taco reference time: 837.023 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 1000005, B2_dimension: 1000005, vals: 3105536 
+C1_dimension: 1000005, C2_dimension: 64, vals: 64000320 +D1_dimension: 1000005, D2_dimension: 64, vals: 64000320 +E1_dimension: 1000005, E2_dimension: 64, vals: 64000320 + + +kernel execution time: 187.296 ms +fused time: 187.792 + +kernel execution time: 616.026 ms +sddmm time: 616.601 + +kernel execution time: 382.801 ms +spmm time: 383.307 + +kernel execution time: 2082.34 ms +taco reference time: 2082.95 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 5558326, B2_dimension: 5558326, vals: 59524291 +C1_dimension: 5558326, C2_dimension: 64, vals: 355732864 +D1_dimension: 5558326, D2_dimension: 64, vals: 355732864 +E1_dimension: 5558326, E2_dimension: 64, vals: 355732864 + + +kernel execution time: 2499.56 ms +fused time: 2500.39 + +kernel execution time: 11463.5 ms +sddmm time: 11464.5 + +kernel execution time: 2581.49 ms +spmm time: 2582.04 + +kernel execution time: 39683.3 ms +taco reference time: 39684.4 + + + + + +-------------------- +--------------------- + + + + + + + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 10974, B2_dimension: 10974, vals: 428650 +C1_dimension: 10974, C2_dimension: 64, vals: 702336 +D1_dimension: 10974, D2_dimension: 64, vals: 702336 +E1_dimension: 10974, E2_dimension: 64, vals: 702336 + + +kernel execution time: 163.669 ms +fused time: 164.155 + +kernel execution time: 79.1673 ms +sddmm time: 79.7118 + +kernel execution time: 88.6347 ms +spmm time: 89.0784 + +kernel execution time: 6143.97 ms +taco reference time: 6144.94 + + +163.669 79.1673 88.6347 6144.94 + + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx 
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 10974, B2_dimension: 10974, vals: 428650 +C1_dimension: 10974, C2_dimension: 64, vals: 702336 +D1_dimension: 10974, D2_dimension: 64, vals: 702336 +E1_dimension: 10974, E2_dimension: 64, vals: 702336 + + +kernel execution time: 17.2275 ms +fused time: 17.6988 + +kernel execution time: 8.26223 ms +sddmm time: 8.8233 + +kernel execution time: 19.3989 ms +spmm time: 19.8422 + +kernel execution time: 519.537 ms +taco reference time: 520.073 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 10974, B2_dimension: 10974, vals: 428650 +C1_dimension: 10974, C2_dimension: 64, vals: 702336 +D1_dimension: 10974, D2_dimension: 64, vals: 702336 +E1_dimension: 10974, E2_dimension: 64, vals: 702336 + + +kernel execution time: 3.03999 ms +fused time: 3.51084 + +kernel execution time: 8.19604 ms +sddmm time: 8.67702 + +kernel execution time: 5.63342 ms +spmm time: 6.05327 + +kernel execution time: 25.6437 ms +taco reference time: 26.0382 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 +C1_dimension: 36417, C2_dimension: 64, vals: 2330688 +D1_dimension: 36417, D2_dimension: 64, vals: 2330688 +E1_dimension: 36417, E2_dimension: 64, vals: 2330688 + + +kernel execution time: 41.03 ms +fused time: 41.5262 + +kernel execution time: 82.5401 ms +sddmm time: 83.1745 + +kernel execution time: 15.9687 ms +spmm time: 16.5644 + +kernel execution time: 244.774 ms +taco reference time: 245.387 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/rma10/rma10.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 46835, B2_dimension: 46835, 
vals: 2374001 +C1_dimension: 46835, C2_dimension: 64, vals: 2997440 +D1_dimension: 46835, D2_dimension: 64, vals: 2997440 +E1_dimension: 46835, E2_dimension: 64, vals: 2997440 + + +kernel execution time: 27.5081 ms +fused time: 28.0034 + +kernel execution time: 45.9865 ms +sddmm time: 46.5649 + +kernel execution time: 20.0912 ms +spmm time: 20.6288 + +kernel execution time: 138.544 ms +taco reference time: 139.148 + + +---------- +----------- + + + + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 10974, B2_dimension: 10974, vals: 428650 +C1_dimension: 10974, C2_dimension: 64, vals: 702336 +D1_dimension: 10974, D2_dimension: 64, vals: 702336 +E1_dimension: 10974, E2_dimension: 64, vals: 702336 + + +kernel execution time: 3.25222 ms +fused time: 3.71775 + +kernel execution time: 8.13173 ms +sddmm time: 8.56798 + +kernel execution time: 5.42295 ms +spmm time: 5.85093 + +kernel execution time: 25.1419 ms +taco reference time: 25.5332 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 +C1_dimension: 36417, C2_dimension: 64, vals: 2330688 +D1_dimension: 36417, D2_dimension: 64, vals: 2330688 +E1_dimension: 36417, E2_dimension: 64, vals: 2330688 + + +kernel execution time: 40.046 ms +fused time: 40.5327 + +kernel execution time: 82.7374 ms +sddmm time: 83.308 + +kernel execution time: 17.148 ms +spmm time: 17.6723 + +kernel execution time: 244.434 ms +taco reference time: 245.084 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/rma10/rma10.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 46835, B2_dimension: 46835, vals: 2374001 +C1_dimension: 46835, 
C2_dimension: 64, vals: 2997440 +D1_dimension: 46835, D2_dimension: 64, vals: 2997440 +E1_dimension: 46835, E2_dimension: 64, vals: 2997440 + + +kernel execution time: 27.3917 ms +fused time: 27.8878 + +kernel execution time: 46.1218 ms +sddmm time: 46.7015 + +kernel execution time: 19.567 ms +spmm time: 20.0877 + +kernel execution time: 136.269 ms +taco reference time: 136.877 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/cant/cant.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 62451, B2_dimension: 62451, vals: 4007383 +C1_dimension: 62451, C2_dimension: 64, vals: 3996864 +D1_dimension: 62451, D2_dimension: 64, vals: 3996864 +E1_dimension: 62451, E2_dimension: 64, vals: 3996864 + + +kernel execution time: 42.3074 ms +fused time: 42.8144 + +kernel execution time: 75.8411 ms +sddmm time: 76.427 + +kernel execution time: 25.5141 ms +spmm time: 26.0647 + +kernel execution time: 229.9 ms +taco reference time: 230.514 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/consph/consph.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 83334, B2_dimension: 83334, vals: 6010480 +C1_dimension: 83334, C2_dimension: 64, vals: 5333376 +D1_dimension: 83334, D2_dimension: 64, vals: 5333376 +E1_dimension: 83334, E2_dimension: 64, vals: 5333376 + + +kernel execution time: 57.3193 ms +fused time: 57.8292 + +kernel execution time: 115.953 ms +sddmm time: 116.536 + +kernel execution time: 31.4256 ms +spmm time: 31.9698 + +kernel execution time: 344.97 ms +taco reference time: 345.594 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 121192, B2_dimension: 121192, vals: 2624331 +C1_dimension: 121192, C2_dimension: 64, vals: 7756288 +D1_dimension: 121192, D2_dimension: 64, vals: 
7756288 +E1_dimension: 121192, E2_dimension: 64, vals: 7756288 + + +kernel execution time: 58.8731 ms +fused time: 59.371 + +kernel execution time: 96.3746 ms +sddmm time: 96.9431 + +kernel execution time: 52.3502 ms +spmm time: 52.8781 + +kernel execution time: 176.858 ms +taco reference time: 177.482 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/shipsec1/shipsec1.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 140874, B2_dimension: 140874, vals: 7813404 +C1_dimension: 140874, C2_dimension: 64, vals: 9015936 +D1_dimension: 140874, D2_dimension: 64, vals: 9015936 +E1_dimension: 140874, E2_dimension: 64, vals: 9015936 + + +kernel execution time: 97.3646 ms +fused time: 97.869 + +kernel execution time: 154.708 ms +sddmm time: 155.284 + +kernel execution time: 61.8392 ms +spmm time: 62.3666 + +kernel execution time: 455.127 ms +taco reference time: 455.719 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/scircuit/scircuit.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 170998, B2_dimension: 170998, vals: 958936 +C1_dimension: 170998, C2_dimension: 64, vals: 10943872 +D1_dimension: 170998, D2_dimension: 64, vals: 10943872 +E1_dimension: 170998, E2_dimension: 64, vals: 10943872 + + +kernel execution time: 30.2488 ms +fused time: 30.744 + +kernel execution time: 39.9852 ms +sddmm time: 40.5654 + +kernel execution time: 67.5062 ms +spmm time: 68.0413 + +kernel execution time: 74.4023 ms +taco reference time: 75.0271 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 206500, B2_dimension: 206500, vals: 1273389 +C1_dimension: 206500, C2_dimension: 64, vals: 13216000 +D1_dimension: 206500, D2_dimension: 64, vals: 13216000 +E1_dimension: 206500, 
E2_dimension: 64, vals: 13216000 + + +kernel execution time: 34.9737 ms +fused time: 35.4724 + +kernel execution time: 39.6662 ms +sddmm time: 40.2179 + +kernel execution time: 82.4413 ms +spmm time: 82.9627 + +kernel execution time: 91.1415 ms +taco reference time: 91.8035 + +sddmm-spmm execution + +----------------------------------------- + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 1000005, B2_dimension: 1000005, vals: 3105536 +C1_dimension: 1000005, C2_dimension: 64, vals: 64000320 +D1_dimension: 1000005, D2_dimension: 64, vals: 64000320 +E1_dimension: 1000005, E2_dimension: 64, vals: 64000320 + + +kernel execution time: 118.92 ms +fused time: 119.4 + +kernel execution time: 90.6065 ms +sddmm time: 91.1522 + +kernel execution time: 390.342 ms +spmm time: 390.863 + +kernel execution time: 423.16 ms +taco reference time: 423.757 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 5558326, B2_dimension: 5558326, vals: 59524291 +C1_dimension: 5558326, C2_dimension: 64, vals: 355732864 +D1_dimension: 5558326, D2_dimension: 64, vals: 355732864 +E1_dimension: 5558326, E2_dimension: 64, vals: 355732864 + + +kernel execution time: 1158.96 ms +fused time: 1159.93 + +kernel execution time: 1561.31 ms +sddmm time: 1561.87 + +kernel execution time: 2533.87 ms +spmm time: 2534.43 + +kernel execution time: 6529.81 ms +taco reference time: 6530.95 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 10974, B2_dimension: 10974, vals: 428650 +C1_dimension: 10974, C2_dimension: 64, vals: 702336 +D1_dimension: 10974, D2_dimension: 64, vals: 
702336 +E1_dimension: 10974, E2_dimension: 64, vals: 702336 + + +kernel execution time: 3.12799 ms +fused time: 3.5888 + +kernel execution time: 8.20063 ms +sddmm time: 8.64883 + +kernel execution time: 5.23889 ms +spmm time: 5.67244 + +kernel execution time: 25.0758 ms +taco reference time: 25.4671 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 +C1_dimension: 36417, C2_dimension: 64, vals: 2330688 +D1_dimension: 36417, D2_dimension: 64, vals: 2330688 +E1_dimension: 36417, E2_dimension: 64, vals: 2330688 + + +kernel execution time: 39.3104 ms +fused time: 39.7945 + +kernel execution time: 82.5126 ms +sddmm time: 83.0785 + +kernel execution time: 15.6324 ms +spmm time: 16.1739 + +kernel execution time: 245.768 ms +taco reference time: 246.406 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/synthetic/synthetic.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 102, B2_dimension: 103, vals: 3149 +C1_dimension: 102, C2_dimension: 64, vals: 6528 +D1_dimension: 103, D2_dimension: 64, vals: 6592 +E1_dimension: 103, E2_dimension: 64, vals: 6592 + + +kernel execution time: 0.160132 ms +fused time: 0.567098 + +kernel execution time: 0.065981 ms +sddmm time: 0.853092 + +kernel execution time: 0.081641 ms +spmm time: 0.331655 + +kernel execution time: 0.336385 ms +taco reference time: 1.05356 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 5, B2_dimension: 5, vals: 19 +C1_dimension: 5, C2_dimension: 64, vals: 320 +D1_dimension: 5, D2_dimension: 64, vals: 320 +E1_dimension: 5, E2_dimension: 64, vals: 320 + + +kernel execution time: 0.0165 ms +fused time: 0.78845 + 
+kernel execution time: 0.011641 ms +sddmm time: 0.873231 + +kernel execution time: 0.011011 ms +spmm time: 0.486977 + +kernel execution time: 0.059631 ms +taco reference time: 0.958413 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 5, B2_dimension: 5, vals: 19 +C1_dimension: 5, C2_dimension: 64, vals: 320 +D1_dimension: 5, D2_dimension: 64, vals: 320 +E1_dimension: 5, E2_dimension: 64, vals: 320 + + +kernel execution time: 0.01989 ms +fused time: 0.813381 + +kernel execution time: 0.01392 ms +sddmm time: 0.976913 + +kernel execution time: 0.013151 ms +spmm time: 0.497287 + +kernel execution time: 0.058 ms +taco reference time: 0.974083 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 5, B2_dimension: 5, vals: 19 +C1_dimension: 5, C2_dimension: 64, vals: 320 +D1_dimension: 5, D2_dimension: 64, vals: 320 +E1_dimension: 5, E2_dimension: 64, vals: 320 + + +kernel execution time: 0.0192 ms +fused time: 0.8019 + +kernel execution time: 0.012991 ms +sddmm time: 0.990253 + +kernel execution time: 0.01291 ms +spmm time: 0.490396 + +kernel execution time: 0.057891 ms +taco reference time: 0.929332 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 5, B2_dimension: 5, vals: 19 +C1_dimension: 5, C2_dimension: 64, vals: 320 +D1_dimension: 5, D2_dimension: 64, vals: 320 +E1_dimension: 5, E2_dimension: 64, vals: 320 + + +kernel execution time: 0.01797 ms +fused time: 0.779061 + +kernel execution time: 0.013 ms +sddmm time: 0.7717 + +kernel execution time: 0.01429 ms +spmm time: 0.487296 + +kernel execution time: 0.05764 ms +taco reference time: 0.72862 + 
+sddmm-spmm execution + + + +sddmm with parallel execution +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 5, B2_dimension: 5, vals: 19 +C1_dimension: 5, C2_dimension: 64, vals: 320 +D1_dimension: 5, D2_dimension: 64, vals: 320 +E1_dimension: 5, E2_dimension: 64, vals: 320 + + +kernel execution time: 0.02088 ms +fused time: 0.912153 + +kernel execution time: 0.01161 ms +sddmm time: 0.944402 + +kernel execution time: 0.01292 ms +spmm time: 0.562267 + +kernel execution time: 0.067781 ms +taco reference time: 1.10908 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 +C1_dimension: 36417, C2_dimension: 64, vals: 2330688 +D1_dimension: 36417, D2_dimension: 64, vals: 2330688 +E1_dimension: 36417, E2_dimension: 64, vals: 2330688 + + +kernel execution time: 166.429 ms +fused time: 166.938 + +kernel execution time: 83.0174 ms +sddmm time: 83.5946 + +kernel execution time: 303.7 ms +spmm time: 304.246 + +kernel execution time: 5227.75 ms +taco reference time: 5228.77 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 +C1_dimension: 36417, C2_dimension: 64, vals: 2330688 +D1_dimension: 36417, D2_dimension: 64, vals: 2330688 +E1_dimension: 36417, E2_dimension: 64, vals: 2330688 + + +kernel execution time: 166.755 ms +fused time: 167.262 + +kernel execution time: 83.1762 ms +sddmm time: 83.7333 + +kernel execution time: 303.525 ms +spmm time: 304.051 + +kernel execution time: 5232.78 ms +taco reference time: 5233.91 + +sddmm-spmm execution + +----------------------------------------- 
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 +C1_dimension: 36417, C2_dimension: 64, vals: 2330688 +D1_dimension: 36417, D2_dimension: 64, vals: 2330688 +E1_dimension: 36417, E2_dimension: 64, vals: 2330688 + + +kernel execution time: 27.2912 ms +fused time: 27.7968 + +kernel execution time: 84.1751 ms +sddmm time: 84.7569 + +kernel execution time: 12.6781 ms +spmm time: 13.1881 + +kernel execution time: 134.209 ms +taco reference time: 134.846 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 +C1_dimension: 36417, C2_dimension: 64, vals: 2330688 +D1_dimension: 36417, D2_dimension: 64, vals: 2330688 +E1_dimension: 36417, E2_dimension: 64, vals: 2330688 + + +kernel execution time: 26.6207 ms +fused time: 27.1299 + +kernel execution time: 86.3046 ms +sddmm time: 86.9394 + +kernel execution time: 12.7749 ms +spmm time: 13.2807 + +kernel execution time: 130.582 ms +taco reference time: 131.278 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 +C1_dimension: 36417, C2_dimension: 64, vals: 2330688 +D1_dimension: 36417, D2_dimension: 64, vals: 2330688 +E1_dimension: 36417, E2_dimension: 64, vals: 2330688 + + +kernel execution time: 101.848 ms +fused time: 102.362 + +kernel execution time: 83.9029 ms +sddmm time: 84.4969 + +kernel execution time: 42.5674 ms +spmm time: 43.1242 + +kernel execution time: 708.807 ms +taco reference time: 709.518 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx 
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 +C1_dimension: 36417, C2_dimension: 64, vals: 2330688 +D1_dimension: 36417, D2_dimension: 64, vals: 2330688 +E1_dimension: 36417, E2_dimension: 64, vals: 2330688 + + +kernel execution time: 107.29 ms +fused time: 107.797 + +kernel execution time: 83.8499 ms +sddmm time: 84.3953 + +kernel execution time: 43.5065 ms +spmm time: 44.0135 + +kernel execution time: 705.909 ms +taco reference time: 706.511 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 +C1_dimension: 36417, C2_dimension: 64, vals: 2330688 +D1_dimension: 36417, D2_dimension: 64, vals: 2330688 +E1_dimension: 36417, E2_dimension: 64, vals: 2330688 + + +kernel execution time: 26.2026 ms +fused time: 26.7322 + +kernel execution time: 86.809 ms +sddmm time: 87.4374 + +kernel execution time: 12.6681 ms +spmm time: 13.1758 + +kernel execution time: 130.015 ms +taco reference time: 130.717 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 10974, B2_dimension: 10974, vals: 428650 +C1_dimension: 10974, C2_dimension: 64, vals: 702336 +D1_dimension: 10974, D2_dimension: 64, vals: 702336 +E1_dimension: 10974, E2_dimension: 64, vals: 702336 + + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 10974, B2_dimension: 10974, vals: 428650 +C1_dimension: 10974, C2_dimension: 64, vals: 702336 +D1_dimension: 10974, D2_dimension: 64, vals: 702336 +E1_dimension: 10974, E2_dimension: 64, vals: 702336 + + +sddmm-spmm execution + 
+----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 10974, B2_dimension: 10974, vals: 428650 +C1_dimension: 10974, C2_dimension: 64, vals: 702336 +D1_dimension: 10974, D2_dimension: 64, vals: 702336 +E1_dimension: 10974, E2_dimension: 64, vals: 702336 + + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 10974, B2_dimension: 10974, vals: 428650 +C1_dimension: 10974, C2_dimension: 64, vals: 702336 +D1_dimension: 10974, D2_dimension: 64, vals: 702336 +E1_dimension: 10974, E2_dimension: 64, vals: 702336 + + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 10974, B2_dimension: 10974, vals: 428650 +C1_dimension: 10974, C2_dimension: 64, vals: 702336 +D1_dimension: 10974, D2_dimension: 64, vals: 702336 +E1_dimension: 10974, E2_dimension: 64, vals: 702336 + + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 10974, B2_dimension: 10974, vals: 428650 +C1_dimension: 10974, C2_dimension: 64, vals: 702336 +D1_dimension: 10974, D2_dimension: 64, vals: 702336 +E1_dimension: 10974, E2_dimension: 64, vals: 702336 + + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 10974, B2_dimension: 10974, vals: 428650 +C1_dimension: 10974, C2_dimension: 64, vals: 702336 +D1_dimension: 10974, D2_dimension: 64, vals: 702336 +E1_dimension: 10974, E2_dimension: 64, vals: 702336 + + +kernel execution time: 
4.9002 ms +fused time: 5.40296 + +kernel execution time: 9.21483 ms +sddmm time: 9.69115 + +kernel execution time: 5.35955 ms +spmm time: 5.79675 + +kernel execution time: 14.9148 ms +taco reference time: 15.4012 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 10974, B2_dimension: 10974, vals: 428650 +C1_dimension: 10974, C2_dimension: 64, vals: 702336 +D1_dimension: 10974, D2_dimension: 64, vals: 702336 +E1_dimension: 10974, E2_dimension: 64, vals: 702336 + + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 10974, B2_dimension: 10974, vals: 428650 +C1_dimension: 10974, C2_dimension: 64, vals: 702336 +D1_dimension: 10974, D2_dimension: 64, vals: 702336 +E1_dimension: 10974, E2_dimension: 64, vals: 702336 + + +kernel execution time: 2.39607 ms +fused time: 2.86927 + +kernel execution time: 8.62899 ms +sddmm time: 8.97544 + +kernel execution time: 5.41841 ms +spmm time: 5.83089 + +kernel execution time: 14.2058 ms +taco reference time: 14.5956 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 10974, B2_dimension: 10974, vals: 428650 +C1_dimension: 10974, C2_dimension: 64, vals: 702336 +D1_dimension: 10974, D2_dimension: 64, vals: 702336 +E1_dimension: 10974, E2_dimension: 64, vals: 702336 + + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 10974, B2_dimension: 10974, vals: 428650 +C1_dimension: 10974, C2_dimension: 64, vals: 702336 +D1_dimension: 10974, D2_dimension: 64, vals: 702336 
+E1_dimension: 10974, E2_dimension: 64, vals: 702336 + + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 10974, B2_dimension: 10974, vals: 428650 +C1_dimension: 10974, C2_dimension: 64, vals: 702336 +D1_dimension: 10974, D2_dimension: 64, vals: 702336 +E1_dimension: 10974, E2_dimension: 64, vals: 702336 + + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 10974, B2_dimension: 10974, vals: 428650 +C1_dimension: 10974, C2_dimension: 64, vals: 702336 +D1_dimension: 10974, D2_dimension: 64, vals: 702336 +E1_dimension: 10974, E2_dimension: 64, vals: 702336 + + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 10974, B2_dimension: 10974, vals: 428650 +C1_dimension: 10974, C2_dimension: 64, vals: 702336 +D1_dimension: 10974, D2_dimension: 64, vals: 702336 +E1_dimension: 10974, E2_dimension: 64, vals: 702336 + + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 10974, B2_dimension: 10974, vals: 428650 +C1_dimension: 10974, C2_dimension: 64, vals: 702336 +D1_dimension: 10974, D2_dimension: 64, vals: 702336 +E1_dimension: 10974, E2_dimension: 64, vals: 702336 + + +kernel execution time: 1.85339 ms +fused time: 2.66762 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 10974, B2_dimension: 10974, vals: 428650 +C1_dimension: 10974, C2_dimension: 64, vals: 
702336 +D1_dimension: 10974, D2_dimension: 64, vals: 702336 +E1_dimension: 10974, E2_dimension: 64, vals: 702336 + + +kernel execution time: 4.94195 ms +fused time: 6.0647 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 10974, B2_dimension: 10974, vals: 428650 +C1_dimension: 10974, C2_dimension: 64, vals: 702336 +D1_dimension: 10974, D2_dimension: 64, vals: 702336 +E1_dimension: 10974, E2_dimension: 64, vals: 702336 + + +kernel execution time: 5.09918 ms +fused time: 6.23075 + +kernel execution time: 14.2105 ms +sddmm time: 15.026 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 10974, B2_dimension: 10974, vals: 428650 +C1_dimension: 10974, C2_dimension: 64, vals: 702336 +D1_dimension: 10974, D2_dimension: 64, vals: 702336 +E1_dimension: 10974, E2_dimension: 64, vals: 702336 + + +kernel execution time: 4.93573 ms +fused time: 5.42636 + +kernel execution time: 8.35333 ms +sddmm time: 8.77215 + +kernel execution time: 5.35189 ms +spmm time: 5.7874 + +kernel execution time: 15.4744 ms +taco reference time: 15.8619 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 10974, B2_dimension: 10974, vals: 428650 +C1_dimension: 10974, C2_dimension: 64, vals: 702336 +D1_dimension: 10974, D2_dimension: 64, vals: 702336 +E1_dimension: 10974, E2_dimension: 64, vals: 702336 + + +kernel execution time: 1.72938 ms +fused time: 2.19226 + +kernel execution time: 8.38474 ms +sddmm time: 8.70208 + +kernel execution time: 5.55896 ms +spmm time: 5.96847 + +kernel execution time: 13.8271 ms +taco reference time: 14.2228 + +sddmm-spmm execution + 
+----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 10974, B2_dimension: 10974, vals: 428650 +C1_dimension: 10974, C2_dimension: 64, vals: 702336 +D1_dimension: 10974, D2_dimension: 64, vals: 702336 +E1_dimension: 10974, E2_dimension: 64, vals: 702336 + + +kernel execution time: 1.99224 ms +fused time: 2.45758 + +kernel execution time: 8.4613 ms +sddmm time: 8.79168 + +kernel execution time: 5.51595 ms +spmm time: 5.95761 + +kernel execution time: 13.5919 ms +taco reference time: 13.973 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 10974, B2_dimension: 10974, vals: 428650 +C1_dimension: 10974, C2_dimension: 64, vals: 702336 +D1_dimension: 10974, D2_dimension: 64, vals: 702336 +E1_dimension: 10974, E2_dimension: 64, vals: 702336 + + +kernel execution time: 2.17974 ms +fused time: 2.64915 + +kernel execution time: 9.49553 ms +sddmm time: 9.89178 + +kernel execution time: 5.3851 ms +spmm time: 5.80552 + +kernel execution time: 15.1854 ms +taco reference time: 15.6294 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 10974, B2_dimension: 10974, vals: 428650 +C1_dimension: 10974, C2_dimension: 64, vals: 702336 +D1_dimension: 10974, D2_dimension: 64, vals: 702336 +E1_dimension: 10974, E2_dimension: 64, vals: 702336 + + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 10974, B2_dimension: 10974, vals: 428650 +C1_dimension: 10974, C2_dimension: 64, vals: 702336 +D1_dimension: 10974, D2_dimension: 64, vals: 702336 
+E1_dimension: 10974, E2_dimension: 64, vals: 702336 + + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 10974, B2_dimension: 10974, vals: 428650 +C1_dimension: 10974, C2_dimension: 64, vals: 702336 +D1_dimension: 10974, D2_dimension: 64, vals: 702336 +E1_dimension: 10974, E2_dimension: 64, vals: 702336 + + +kernel execution time: 1.77985 ms +fused time: 2.24554 + +kernel execution time: 9.31643 ms +sddmm time: 9.66639 + +kernel execution time: 5.48351 ms +spmm time: 5.89775 + +kernel execution time: 15.1635 ms +taco reference time: 15.6173 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 10974, B2_dimension: 10974, vals: 428650 +C1_dimension: 10974, C2_dimension: 64, vals: 702336 +D1_dimension: 10974, D2_dimension: 64, vals: 702336 +E1_dimension: 10974, E2_dimension: 64, vals: 702336 + + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 10974, B2_dimension: 10974, vals: 428650 +C1_dimension: 10974, C2_dimension: 64, vals: 702336 +D1_dimension: 10974, D2_dimension: 64, vals: 702336 +E1_dimension: 10974, E2_dimension: 64, vals: 702336 + + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 10974, B2_dimension: 10974, vals: 428650 +C1_dimension: 10974, C2_dimension: 64, vals: 702336 +D1_dimension: 10974, D2_dimension: 64, vals: 702336 +E1_dimension: 10974, E2_dimension: 64, vals: 702336 + + +sddmm-spmm execution + +----------------------------------------- 
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 10974, B2_dimension: 10974, vals: 428650 +C1_dimension: 10974, C2_dimension: 64, vals: 702336 +D1_dimension: 10974, D2_dimension: 64, vals: 702336 +E1_dimension: 10974, E2_dimension: 64, vals: 702336 + + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 10974, B2_dimension: 10974, vals: 428650 +C1_dimension: 10974, C2_dimension: 64, vals: 702336 +D1_dimension: 10974, D2_dimension: 64, vals: 702336 +E1_dimension: 10974, E2_dimension: 64, vals: 702336 + + +kernel execution time: 2.09062 ms +fused time: 2.75986 + +kernel execution time: 8.53961 ms +sddmm time: 8.99868 + +kernel execution time: 5.43386 ms +spmm time: 5.86914 + +kernel execution time: 14.7848 ms +taco reference time: 15.2128 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 10974, B2_dimension: 10974, vals: 428650 +C1_dimension: 10974, C2_dimension: 64, vals: 702336 +D1_dimension: 10974, D2_dimension: 64, vals: 702336 +E1_dimension: 10974, E2_dimension: 64, vals: 702336 + + +kernel execution time: 1.99345 ms +fused time: 2.4639 + +kernel execution time: 10.0509 ms +sddmm time: 10.4945 + +kernel execution time: 5.37643 ms +spmm time: 5.82607 + +kernel execution time: 15.0911 ms +taco reference time: 15.5753 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 10974, B2_dimension: 10974, vals: 428650 +C1_dimension: 10974, C2_dimension: 64, vals: 702336 +D1_dimension: 10974, D2_dimension: 64, vals: 702336 +E1_dimension: 10974, E2_dimension: 64, 
vals: 702336 + + +kernel execution time: 2.14705 ms +fused time: 2.62359 + +kernel execution time: 9.35781 ms +sddmm time: 9.71116 + +kernel execution time: 6.0153 ms +spmm time: 6.42121 + +kernel execution time: 14.8814 ms +taco reference time: 15.3035 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 10974, B2_dimension: 10974, vals: 428650 +C1_dimension: 10974, C2_dimension: 64, vals: 702336 +D1_dimension: 10974, D2_dimension: 64, vals: 702336 +E1_dimension: 10974, E2_dimension: 64, vals: 702336 + + +kernel execution time: 3.85621 ms +fused time: 4.31728 + +kernel execution time: 8.49591 ms +sddmm time: 8.85325 + +kernel execution time: 4.55458 ms +spmm time: 5.00309 + +kernel execution time: 71.693 ms +taco reference time: 72.1249 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 10974, B2_dimension: 10974, vals: 428650 +C1_dimension: 10974, C2_dimension: 64, vals: 702336 +D1_dimension: 10974, D2_dimension: 64, vals: 702336 +E1_dimension: 10974, E2_dimension: 64, vals: 702336 + + +kernel execution time: 4.4083 ms +fused time: 4.87449 + +kernel execution time: 9.23609 ms +sddmm time: 9.68592 + +kernel execution time: 4.52337 ms +spmm time: 4.93316 + +kernel execution time: 75.7983 ms +taco reference time: 76.2419 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 10974, B2_dimension: 10974, vals: 428650 +C1_dimension: 10974, C2_dimension: 64, vals: 702336 +D1_dimension: 10974, D2_dimension: 64, vals: 702336 +E1_dimension: 10974, E2_dimension: 64, vals: 702336 + + +kernel execution time: 2.02675 ms +fused time: 2.47188 + +kernel 
execution time: 9.25498 ms +sddmm time: 9.67129 + +kernel execution time: 5.23325 ms +spmm time: 5.68302 + +kernel execution time: 14.8775 ms +taco reference time: 15.3813 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 10974, B2_dimension: 10974, vals: 428650 +C1_dimension: 10974, C2_dimension: 64, vals: 702336 +D1_dimension: 10974, D2_dimension: 64, vals: 702336 +E1_dimension: 10974, E2_dimension: 64, vals: 702336 + + +kernel execution time: 1.94846 ms +fused time: 2.40322 + +kernel execution time: 9.52502 ms +sddmm time: 9.90909 + +kernel execution time: 5.31443 ms +spmm time: 5.71988 + +kernel execution time: 15.7004 ms +taco reference time: 16.1456 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 10974, B2_dimension: 10974, vals: 428650 +C1_dimension: 10974, C2_dimension: 64, vals: 702336 +D1_dimension: 10974, D2_dimension: 64, vals: 702336 +E1_dimension: 10974, E2_dimension: 64, vals: 702336 + + +kernel execution time: 1.79798 ms +fused time: 2.25022 + +kernel execution time: 9.43793 ms +sddmm time: 9.82708 + +kernel execution time: 5.29275 ms +spmm time: 5.69457 + +kernel execution time: 14.9269 ms +taco reference time: 15.3874 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 10974, B2_dimension: 10974, vals: 428650 +C1_dimension: 10974, C2_dimension: 64, vals: 702336 +D1_dimension: 10974, D2_dimension: 64, vals: 702336 +E1_dimension: 10974, E2_dimension: 64, vals: 702336 + + +kernel execution time: 1.75935 ms +fused time: 2.20095 + +kernel execution time: 8.58506 ms +sddmm time: 8.92534 + +kernel execution time: 
5.5533 ms +spmm time: 5.93899 + +kernel execution time: 14.2327 ms +taco reference time: 14.5943 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 10974, B2_dimension: 10974, vals: 428650 +C1_dimension: 10974, C2_dimension: 64, vals: 702336 +D1_dimension: 10974, D2_dimension: 64, vals: 702336 +E1_dimension: 10974, E2_dimension: 64, vals: 702336 + + +kernel execution time: 2.04599 ms +fused time: 2.50059 + +kernel execution time: 9.39166 ms +sddmm time: 9.80431 + +kernel execution time: 5.3514 ms +spmm time: 5.75487 + +kernel execution time: 15.0619 ms +taco reference time: 15.497 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 10974, B2_dimension: 10974, vals: 428650 +C1_dimension: 10974, C2_dimension: 64, vals: 702336 +D1_dimension: 10974, D2_dimension: 64, vals: 702336 +E1_dimension: 10974, E2_dimension: 64, vals: 702336 + + +kernel execution time: 1.9781 ms +fused time: 2.41055 + +kernel execution time: 8.50024 ms +sddmm time: 8.81933 + +kernel execution time: 5.28711 ms +spmm time: 5.68452 + +kernel execution time: 13.5108 ms +taco reference time: 13.8766 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 5, B2_dimension: 5, vals: 19 +C1_dimension: 5, C2_dimension: 64, vals: 320 +D1_dimension: 5, D2_dimension: 64, vals: 320 +E1_dimension: 5, E2_dimension: 64, vals: 320 + + +kernel execution time: 11.5205 ms +fused time: 12.2496 + +kernel execution time: 0.00954 ms +sddmm time: 0.935822 + +kernel execution time: 0.02342 ms +spmm time: 0.324625 + +kernel execution time: 0.050091 ms +taco reference time: 0.727519 + +sddmm-spmm 
execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 5, B2_dimension: 5, vals: 19 +C1_dimension: 5, C2_dimension: 64, vals: 320 +D1_dimension: 5, D2_dimension: 64, vals: 320 +E1_dimension: 5, E2_dimension: 64, vals: 320 + + +kernel execution time: 0.235743 ms +fused time: 0.969273 + +kernel execution time: 0.01214 ms +sddmm time: 0.981613 + +kernel execution time: 0.03193 ms +spmm time: 0.521637 + +kernel execution time: 0.059391 ms +taco reference time: 0.945792 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 5, B2_dimension: 5, vals: 19 +C1_dimension: 5, C2_dimension: 64, vals: 320 +D1_dimension: 5, D2_dimension: 64, vals: 320 +E1_dimension: 5, E2_dimension: 64, vals: 320 + + +kernel execution time: 0.235003 ms +fused time: 0.964663 + +kernel execution time: 0.013771 ms +sddmm time: 1.23201 + +kernel execution time: 0.027521 ms +spmm time: 0.470876 + +kernel execution time: 0.043441 ms +taco reference time: 0.814271 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 5, B2_dimension: 5, vals: 19 +C1_dimension: 5, C2_dimension: 64, vals: 320 +D1_dimension: 5, D2_dimension: 64, vals: 320 +E1_dimension: 5, E2_dimension: 64, vals: 320 + + +kernel execution time: 0.242774 ms +fused time: 0.984063 + +kernel execution time: 0.01744 ms +sddmm time: 1.07782 + +kernel execution time: 0.03915 ms +spmm time: 0.602928 + +kernel execution time: 0.073381 ms +taco reference time: 0.858301 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 
5, B2_dimension: 5, vals: 19 +C1_dimension: 5, C2_dimension: 64, vals: 320 +D1_dimension: 5, D2_dimension: 64, vals: 320 +E1_dimension: 5, E2_dimension: 64, vals: 320 + + +kernel execution time: 0.199533 ms +fused time: 0.604928 + +kernel execution time: 0.00675 ms +sddmm time: 0.983573 + +kernel execution time: 0.02448 ms +spmm time: 0.300224 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 5, B2_dimension: 5, vals: 19 +C1_dimension: 5, C2_dimension: 64, vals: 320 +D1_dimension: 5, D2_dimension: 64, vals: 320 +E1_dimension: 5, E2_dimension: 64, vals: 320 + + +kernel execution time: 0.192703 ms +fused time: 0.575667 + +kernel execution time: 0.00622 ms +sddmm time: 0.863292 + +kernel execution time: 0.0221 ms +spmm time: 0.270204 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 5, B2_dimension: 5, vals: 19 +C1_dimension: 5, C2_dimension: 64, vals: 320 +D1_dimension: 5, D2_dimension: 64, vals: 320 +E1_dimension: 5, E2_dimension: 64, vals: 320 + + +kernel execution time: 0.195482 ms +fused time: 0.580768 + +kernel execution time: 0.00652 ms +sddmm time: 0.957703 + +kernel execution time: 0.025451 ms +spmm time: 0.313074 + +kernel execution time: 0.085611 ms +taco reference time: 0.970753 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 10974, B2_dimension: 10974, vals: 428650 +C1_dimension: 10974, C2_dimension: 64, vals: 702336 +D1_dimension: 10974, D2_dimension: 64, vals: 702336 +E1_dimension: 10974, E2_dimension: 64, vals: 702336 + + +kernel execution time: 2.00856 ms +fused time: 2.45147 + +kernel execution time: 8.5121 ms +sddmm time: 
8.95565 + +kernel execution time: 5.46083 ms +spmm time: 5.93676 + +kernel execution time: 14.1411 ms +taco reference time: 14.7397 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 +C1_dimension: 36417, C2_dimension: 64, vals: 2330688 +D1_dimension: 36417, D2_dimension: 64, vals: 2330688 +E1_dimension: 36417, E2_dimension: 64, vals: 2330688 + + +kernel execution time: 9.91597 ms +fused time: 10.4166 + +kernel execution time: 85.127 ms +sddmm time: 85.7297 + +kernel execution time: 12.8101 ms +spmm time: 13.3194 + +kernel execution time: 129.721 ms +taco reference time: 130.362 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 +C1_dimension: 36417, C2_dimension: 64, vals: 2330688 +D1_dimension: 36417, D2_dimension: 64, vals: 2330688 +E1_dimension: 36417, E2_dimension: 64, vals: 2330688 + + +kernel execution time: 9.9746 ms +fused time: 10.4536 + +kernel execution time: 85.6921 ms +sddmm time: 86.3192 + +kernel execution time: 12.752 ms +spmm time: 13.2448 + +kernel execution time: 135.682 ms +taco reference time: 136.351 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 +C1_dimension: 36417, C2_dimension: 64, vals: 2330688 +D1_dimension: 36417, D2_dimension: 64, vals: 2330688 +E1_dimension: 36417, E2_dimension: 64, vals: 2330688 + + +kernel execution time: 10.0998 ms +fused time: 10.5872 + +kernel execution time: 85.0064 ms +sddmm time: 85.6385 + +kernel execution time: 12.6128 ms +spmm time: 13.1169 + +kernel 
execution time: 134.629 ms +taco reference time: 135.323 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 +C1_dimension: 36417, C2_dimension: 64, vals: 2330688 +D1_dimension: 36417, D2_dimension: 64, vals: 2330688 +E1_dimension: 36417, E2_dimension: 64, vals: 2330688 + + +kernel execution time: 10.1006 ms +fused time: 10.5902 + +kernel execution time: 88.2603 ms +sddmm time: 88.897 + +kernel execution time: 12.5197 ms +spmm time: 13.0137 + +kernel execution time: 130.3 ms +taco reference time: 130.977 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 +C1_dimension: 36417, C2_dimension: 64, vals: 2330688 +D1_dimension: 36417, D2_dimension: 64, vals: 2330688 +E1_dimension: 36417, E2_dimension: 64, vals: 2330688 + + +kernel execution time: 27.6596 ms +fused time: 28.2096 + +kernel execution time: 85.6018 ms +sddmm time: 86.213 + +kernel execution time: 12.8244 ms +spmm time: 13.3343 + +kernel execution time: 131.089 ms +taco reference time: 131.789 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 +C1_dimension: 36417, C2_dimension: 64, vals: 2330688 +D1_dimension: 36417, D2_dimension: 64, vals: 2330688 +E1_dimension: 36417, E2_dimension: 64, vals: 2330688 + + +kernel execution time: 26.582 ms +fused time: 27.0673 + +kernel execution time: 87.6048 ms +sddmm time: 88.2462 + +kernel execution time: 12.5643 ms +spmm time: 13.0723 + +kernel execution time: 130.366 ms +taco reference time: 131.043 + +sddmm-spmm 
execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 +C1_dimension: 36417, C2_dimension: 64, vals: 2330688 +D1_dimension: 36417, D2_dimension: 64, vals: 2330688 +E1_dimension: 36417, E2_dimension: 64, vals: 2330688 + + +kernel execution time: 26.5615 ms +fused time: 27.0713 + +kernel execution time: 87.5473 ms +sddmm time: 88.1848 + +kernel execution time: 12.6726 ms +spmm time: 13.152 + +kernel execution time: 131.024 ms +taco reference time: 131.701 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 +C1_dimension: 36417, C2_dimension: 64, vals: 2330688 +D1_dimension: 36417, D2_dimension: 64, vals: 2330688 +E1_dimension: 36417, E2_dimension: 64, vals: 2330688 + + +kernel execution time: 26.3835 ms +fused time: 26.8768 + +kernel execution time: 84.7609 ms +sddmm time: 85.3584 + +kernel execution time: 12.8437 ms +spmm time: 13.346 + +kernel execution time: 132.548 ms +taco reference time: 133.168 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 +C1_dimension: 36417, C2_dimension: 64, vals: 2330688 +D1_dimension: 36417, D2_dimension: 64, vals: 2330688 +E1_dimension: 36417, E2_dimension: 64, vals: 2330688 + + +kernel execution time: 26.6808 ms +fused time: 27.1679 + +kernel execution time: 87.0948 ms +sddmm time: 87.7219 + +kernel execution time: 12.695 ms +spmm time: 13.1923 + +kernel execution time: 134.587 ms +taco reference time: 135.255 + +sddmm-spmm execution + +----------------------------------------- 
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 5, B2_dimension: 5, vals: 19 +C1_dimension: 5, C2_dimension: 64, vals: 320 +D1_dimension: 5, D2_dimension: 64, vals: 320 +E1_dimension: 5, E2_dimension: 64, vals: 320 + + +kernel execution time: 0.235254 ms +fused time: 1.04843 + +kernel execution time: 0.01102 ms +sddmm time: 0.989634 + +kernel execution time: 0.028701 ms +spmm time: 0.574108 + +kernel execution time: 0.04363 ms +taco reference time: 0.840431 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 10974, B2_dimension: 10974, vals: 428650 +C1_dimension: 10974, C2_dimension: 64, vals: 702336 +D1_dimension: 10974, D2_dimension: 64, vals: 702336 +E1_dimension: 10974, E2_dimension: 64, vals: 702336 + + +kernel execution time: 4.9177 ms +fused time: 5.37305 + +kernel execution time: 8.31608 ms +sddmm time: 8.76144 + +kernel execution time: 5.43042 ms +spmm time: 5.82157 + +kernel execution time: 15.0881 ms +taco reference time: 15.4618 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 +C1_dimension: 36417, C2_dimension: 64, vals: 2330688 +D1_dimension: 36417, D2_dimension: 64, vals: 2330688 +E1_dimension: 36417, E2_dimension: 64, vals: 2330688 + + +kernel execution time: 175.005 ms +fused time: 175.507 + +kernel execution time: 83.4127 ms +sddmm time: 83.9734 + +kernel execution time: 14.3027 ms +spmm time: 14.8133 + +kernel execution time: 5196.98 ms +taco reference time: 5198.39 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/rma10/rma10.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); 
+B1_dimension: 46835, B2_dimension: 46835, vals: 2374001 +C1_dimension: 46835, C2_dimension: 64, vals: 2997440 +D1_dimension: 46835, D2_dimension: 64, vals: 2997440 +E1_dimension: 46835, E2_dimension: 64, vals: 2997440 + + +kernel execution time: 96.7809 ms +fused time: 97.2629 + +kernel execution time: 46.666 ms +sddmm time: 47.229 + +kernel execution time: 23.9017 ms +spmm time: 24.4045 + +kernel execution time: 2871.87 ms +taco reference time: 2872.47 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/rma10/rma10.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 46835, B2_dimension: 46835, vals: 2374001 +C1_dimension: 46835, C2_dimension: 64, vals: 2997440 +D1_dimension: 46835, D2_dimension: 64, vals: 2997440 +E1_dimension: 46835, E2_dimension: 64, vals: 2997440 + + +kernel execution time: 98.4225 ms +fused time: 98.9062 + +kernel execution time: 46.8647 ms +sddmm time: 47.4013 + +kernel execution time: 22.9253 ms +spmm time: 23.4505 + +kernel execution time: 2873.94 ms +taco reference time: 2874.59 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 +C1_dimension: 36417, C2_dimension: 64, vals: 2330688 +D1_dimension: 36417, D2_dimension: 64, vals: 2330688 +E1_dimension: 36417, E2_dimension: 64, vals: 2330688 + + +kernel execution time: 174.126 ms +fused time: 174.616 + +kernel execution time: 83.7673 ms +sddmm time: 84.3199 + +kernel execution time: 13.0437 ms +spmm time: 13.5625 + +kernel execution time: 5227.23 ms +taco reference time: 5228.25 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 +C1_dimension: 36417, 
C2_dimension: 64, vals: 2330688 +D1_dimension: 36417, D2_dimension: 64, vals: 2330688 +E1_dimension: 36417, E2_dimension: 64, vals: 2330688 + + +kernel execution time: 27.6542 ms +fused time: 28.1392 + +kernel execution time: 85.8985 ms +sddmm time: 86.5293 + +kernel execution time: 12.6722 ms +spmm time: 13.1883 + +kernel execution time: 130.948 ms +taco reference time: 131.642 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 +C1_dimension: 36417, C2_dimension: 64, vals: 2330688 +D1_dimension: 36417, D2_dimension: 64, vals: 2330688 +E1_dimension: 36417, E2_dimension: 64, vals: 2330688 + + +kernel execution time: 102.4 ms +fused time: 102.884 + +kernel execution time: 83.5498 ms +sddmm time: 84.1386 + +kernel execution time: 42.5049 ms +spmm time: 43.0426 + +kernel execution time: 710.168 ms +taco reference time: 710.765 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 +C1_dimension: 36417, C2_dimension: 64, vals: 2330688 +D1_dimension: 36417, D2_dimension: 64, vals: 2330688 +E1_dimension: 36417, E2_dimension: 64, vals: 2330688 + + +kernel execution time: 43.9551 ms +fused time: 44.6972 + +kernel execution time: 87.6996 ms +sddmm time: 89.4613 + +kernel execution time: 18.2632 ms +spmm time: 18.7804 + +kernel execution time: 122.262 ms +taco reference time: 123.152 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 +C1_dimension: 36417, C2_dimension: 64, vals: 2330688 +D1_dimension: 36417, D2_dimension: 64, 
vals: 2330688 +E1_dimension: 36417, E2_dimension: 64, vals: 2330688 + + +kernel execution time: 47.9407 ms +fused time: 48.4339 + +kernel execution time: 89.2157 ms +sddmm time: 89.8924 + +kernel execution time: 18.2009 ms +spmm time: 18.7261 + +kernel execution time: 123.559 ms +taco reference time: 124.405 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 +C1_dimension: 36417, C2_dimension: 64, vals: 2330688 +D1_dimension: 36417, D2_dimension: 64, vals: 2330688 +E1_dimension: 36417, E2_dimension: 64, vals: 2330688 + + +kernel execution time: 43.2059 ms +fused time: 43.6957 + +kernel execution time: 90.4258 ms +sddmm time: 91.1259 + +kernel execution time: 18.2655 ms +spmm time: 18.7701 + +kernel execution time: 123.565 ms +taco reference time: 124.302 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 +C1_dimension: 36417, C2_dimension: 64, vals: 2330688 +D1_dimension: 36417, D2_dimension: 64, vals: 2330688 +E1_dimension: 36417, E2_dimension: 64, vals: 2330688 + + +kernel execution time: 48.4004 ms +fused time: 48.9337 + +kernel execution time: 85.0973 ms +sddmm time: 85.6769 + +kernel execution time: 18.1666 ms +spmm time: 18.6607 + +kernel execution time: 123.347 ms +taco reference time: 124.257 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 +C1_dimension: 36417, C2_dimension: 64, vals: 2330688 +D1_dimension: 36417, D2_dimension: 64, vals: 2330688 +E1_dimension: 36417, E2_dimension: 64, vals: 2330688 + + 
+kernel execution time: 25.3405 ms +fused time: 25.8282 + +kernel execution time: 87.1326 ms +sddmm time: 87.7761 + +kernel execution time: 12.9441 ms +spmm time: 13.4425 + +kernel execution time: 132.388 ms +taco reference time: 133.056 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 +C1_dimension: 36417, C2_dimension: 64, vals: 2330688 +D1_dimension: 36417, D2_dimension: 64, vals: 2330688 +E1_dimension: 36417, E2_dimension: 64, vals: 2330688 + + +kernel execution time: 26.5881 ms +fused time: 27.0669 + +kernel execution time: 85.9749 ms +sddmm time: 86.5764 + +kernel execution time: 12.5752 ms +spmm time: 13.1009 + +kernel execution time: 131.368 ms +taco reference time: 132.072 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 +C1_dimension: 36417, C2_dimension: 64, vals: 2330688 +D1_dimension: 36417, D2_dimension: 64, vals: 2330688 +E1_dimension: 36417, E2_dimension: 64, vals: 2330688 + + +kernel execution time: 177.141 ms +fused time: 177.635 + +kernel execution time: 83.6231 ms +sddmm time: 84.2074 + +kernel execution time: 303.927 ms +spmm time: 304.455 + +kernel execution time: 5553.72 ms +taco reference time: 5554.89 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 +C1_dimension: 36417, C2_dimension: 64, vals: 2330688 +D1_dimension: 36417, D2_dimension: 64, vals: 2330688 +E1_dimension: 36417, E2_dimension: 64, vals: 2330688 + + +kernel execution time: 177.24 ms +fused time: 177.718 + +kernel 
execution time: 83.5235 ms +sddmm time: 84.0624 + +kernel execution time: 299.135 ms +spmm time: 299.642 + +kernel execution time: 5568.94 ms +taco reference time: 5570.07 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 +C1_dimension: 36417, C2_dimension: 64, vals: 2330688 +D1_dimension: 36417, D2_dimension: 64, vals: 2330688 +E1_dimension: 36417, E2_dimension: 64, vals: 2330688 + + +kernel execution time: 177.334 ms +fused time: 177.831 + +kernel execution time: 83.7814 ms +sddmm time: 84.3619 + +kernel execution time: 302.13 ms +spmm time: 302.653 + +kernel execution time: 5535.64 ms +taco reference time: 5536.87 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 +C1_dimension: 36417, C2_dimension: 64, vals: 2330688 +D1_dimension: 36417, D2_dimension: 64, vals: 2330688 +E1_dimension: 36417, E2_dimension: 64, vals: 2330688 + + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 +C1_dimension: 36417, C2_dimension: 64, vals: 2330688 +D1_dimension: 36417, D2_dimension: 64, vals: 2330688 +E1_dimension: 36417, E2_dimension: 64, vals: 2330688 + + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 +C1_dimension: 36417, C2_dimension: 64, vals: 2330688 +D1_dimension: 36417, D2_dimension: 64, vals: 2330688 +E1_dimension: 36417, E2_dimension: 
64, vals: 2330688 + + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 +C1_dimension: 36417, C2_dimension: 64, vals: 2330688 +D1_dimension: 36417, D2_dimension: 64, vals: 2330688 +E1_dimension: 36417, E2_dimension: 64, vals: 2330688 + + +kernel execution time: 180.923 ms +fused time: 181.39 + +kernel execution time: 88.0592 ms +sddmm time: 88.6258 + +kernel execution time: 300.533 ms +spmm time: 301.047 + +kernel execution time: 5549.25 ms +taco reference time: 5550.45 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 +C1_dimension: 36417, C2_dimension: 64, vals: 2330688 +D1_dimension: 36417, D2_dimension: 64, vals: 2330688 +E1_dimension: 36417, E2_dimension: 64, vals: 2330688 + + +kernel execution time: 27.7589 ms +fused time: 28.2424 + +kernel execution time: 87.4027 ms +sddmm time: 88.0292 + +kernel execution time: 13.0621 ms +spmm time: 13.5896 + +kernel execution time: 131.501 ms +taco reference time: 132.191 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 +C1_dimension: 36417, C2_dimension: 64, vals: 2330688 +D1_dimension: 36417, D2_dimension: 64, vals: 2330688 +E1_dimension: 36417, E2_dimension: 64, vals: 2330688 + + +kernel execution time: 27.1159 ms +fused time: 27.6123 + +kernel execution time: 88.1805 ms +sddmm time: 88.8475 + +kernel execution time: 13.2301 ms +spmm time: 13.7512 + +kernel execution time: 130.96 ms +taco reference time: 131.633 + +sddmm-spmm execution + 
+----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 +C1_dimension: 36417, C2_dimension: 64, vals: 2330688 +D1_dimension: 36417, D2_dimension: 64, vals: 2330688 +E1_dimension: 36417, E2_dimension: 64, vals: 2330688 + + +kernel execution time: 11.1791 ms +fused time: 11.6596 + +kernel execution time: 324.829 ms +sddmm time: 325.459 + +kernel execution time: 5.82413 ms +spmm time: 6.613 + +kernel execution time: 162.505 ms +taco reference time: 163.319 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 +C1_dimension: 36417, C2_dimension: 64, vals: 2330688 +D1_dimension: 36417, D2_dimension: 64, vals: 2330688 +E1_dimension: 36417, E2_dimension: 64, vals: 2330688 + + +kernel execution time: 167.093 ms +fused time: 167.577 + +kernel execution time: 264.158 ms +sddmm time: 264.712 + +kernel execution time: 68.6915 ms +spmm time: 69.2406 + +kernel execution time: 5581.71 ms +taco reference time: 5582.83 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 +C1_dimension: 36417, C2_dimension: 64, vals: 2330688 +D1_dimension: 36417, D2_dimension: 64, vals: 2330688 +E1_dimension: 36417, E2_dimension: 64, vals: 2330688 + + +kernel execution time: 170.702 ms +fused time: 171.176 + +kernel execution time: 88.5905 ms +sddmm time: 89.1447 + +kernel execution time: 68.5964 ms +spmm time: 69.1031 + +kernel execution time: 5551.85 ms +taco reference time: 5552.97 + +sddmm-spmm execution + +----------------------------------------- 
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 +C1_dimension: 36417, C2_dimension: 64, vals: 2330688 +D1_dimension: 36417, D2_dimension: 64, vals: 2330688 +E1_dimension: 36417, E2_dimension: 64, vals: 2330688 + + +kernel execution time: 10.8645 ms +fused time: 11.3531 + +kernel execution time: 9.04029 ms +sddmm time: 9.79108 + +kernel execution time: 5.63795 ms +spmm time: 6.23454 + +kernel execution time: 131.822 ms +taco reference time: 132.52 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/rma10/rma10.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 46835, B2_dimension: 46835, vals: 2374001 +C1_dimension: 46835, C2_dimension: 64, vals: 2997440 +D1_dimension: 46835, D2_dimension: 64, vals: 2997440 +E1_dimension: 46835, E2_dimension: 64, vals: 2997440 + + +kernel execution time: 9.65163 ms +fused time: 10.1436 + +kernel execution time: 9.70327 ms +sddmm time: 10.2929 + +kernel execution time: 4.85235 ms +spmm time: 5.40286 + +kernel execution time: 74.2349 ms +taco reference time: 74.8374 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/cant/cant.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 62451, B2_dimension: 62451, vals: 4007383 +C1_dimension: 62451, C2_dimension: 64, vals: 3996864 +D1_dimension: 62451, D2_dimension: 64, vals: 3996864 +E1_dimension: 62451, E2_dimension: 64, vals: 3996864 + + +kernel execution time: 15.2637 ms +fused time: 15.7881 + +kernel execution time: 12.0484 ms +sddmm time: 12.7139 + +kernel execution time: 7.9269 ms +spmm time: 8.5266 + +kernel execution time: 122.713 ms +taco reference time: 123.431 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx 
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 5558326, B2_dimension: 5558326, vals: 59524291 +C1_dimension: 5558326, C2_dimension: 64, vals: 355732864 +D1_dimension: 5558326, D2_dimension: 64, vals: 355732864 +E1_dimension: 5558326, E2_dimension: 64, vals: 355732864 + + +kernel execution time: 750.953 ms +fused time: 751.849 + +kernel execution time: 410.668 ms +sddmm time: 411.252 + +kernel execution time: 490.401 ms +spmm time: 490.993 + +kernel execution time: 7382.94 ms +taco reference time: 7384.02 + + + +-------------------------------- + + + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 10974, B2_dimension: 10974, vals: 428650 +C1_dimension: 10974, C2_dimension: 64, vals: 702336 +D1_dimension: 10974, D2_dimension: 64, vals: 702336 +E1_dimension: 10974, E2_dimension: 64, vals: 702336 + + +kernel execution time: 2.89124 ms +fused time: 3.33064 + +kernel execution time: 2.48885 ms +sddmm time: 2.80581 + +kernel execution time: 1.25714 ms +sddmm time: 1.58645 + +kernel execution time: 1.82611 ms +spmm time: 2.10693 + +kernel execution time: 14.7536 ms +taco reference time: 15.1553 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 +C1_dimension: 36417, C2_dimension: 64, vals: 2330688 +D1_dimension: 36417, D2_dimension: 64, vals: 2330688 +E1_dimension: 36417, E2_dimension: 64, vals: 2330688 + + +kernel execution time: 10.4526 ms +fused time: 10.9812 + +kernel execution time: 9.28251 ms +sddmm time: 9.93109 + +kernel execution time: 5.36035 ms +sddmm time: 5.99358 + +kernel execution time: 5.29728 ms +spmm time: 5.86825 + +kernel execution time: 132.268 ms +taco reference time: 132.952 + +sddmm-spmm execution + 
+----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/rma10/rma10.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 46835, B2_dimension: 46835, vals: 2374001 +C1_dimension: 46835, C2_dimension: 64, vals: 2997440 +D1_dimension: 46835, D2_dimension: 64, vals: 2997440 +E1_dimension: 46835, E2_dimension: 64, vals: 2997440 + + +kernel execution time: 9.78667 ms +fused time: 10.2677 + +kernel execution time: 9.62847 ms +sddmm time: 10.2355 + +kernel execution time: 3.92285 ms +sddmm time: 4.52461 + +kernel execution time: 4.91246 ms +spmm time: 5.38467 + +kernel execution time: 74.8226 ms +taco reference time: 75.4131 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/consph/consph.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 83334, B2_dimension: 83334, vals: 6010480 +C1_dimension: 83334, C2_dimension: 64, vals: 5333376 +D1_dimension: 83334, D2_dimension: 64, vals: 5333376 +E1_dimension: 83334, E2_dimension: 64, vals: 5333376 + + +kernel execution time: 19.7265 ms +fused time: 20.2664 + +kernel execution time: 17.1571 ms +sddmm time: 17.8366 + +kernel execution time: 10.5179 ms +sddmm time: 11.1615 + +kernel execution time: 10.7719 ms +spmm time: 11.4141 + +kernel execution time: 186.633 ms +taco reference time: 187.406 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 121192, B2_dimension: 121192, vals: 2624331 +C1_dimension: 121192, C2_dimension: 64, vals: 7756288 +D1_dimension: 121192, D2_dimension: 64, vals: 7756288 +E1_dimension: 121192, E2_dimension: 64, vals: 7756288 + + +kernel execution time: 28.3142 ms +fused time: 28.8151 + +kernel execution time: 20.3455 ms +sddmm time: 21.0059 + +kernel execution time: 12.2316 ms +sddmm time: 12.8542 + +kernel execution time: 13.8246 ms +spmm time: 
14.4268 + +kernel execution time: 100.583 ms +taco reference time: 101.304 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/scircuit/scircuit.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 170998, B2_dimension: 170998, vals: 958936 +C1_dimension: 170998, C2_dimension: 64, vals: 10943872 +D1_dimension: 170998, D2_dimension: 64, vals: 10943872 +E1_dimension: 170998, E2_dimension: 64, vals: 10943872 + + +kernel execution time: 20.038 ms +fused time: 20.555 + +kernel execution time: 11.3385 ms +sddmm time: 11.9822 + +kernel execution time: 8.08082 ms +sddmm time: 8.71341 + +kernel execution time: 10.9562 ms +spmm time: 11.5782 + +kernel execution time: 80.9289 ms +taco reference time: 81.6333 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 206500, B2_dimension: 206500, vals: 1273389 +C1_dimension: 206500, C2_dimension: 64, vals: 13216000 +D1_dimension: 206500, D2_dimension: 64, vals: 13216000 +E1_dimension: 206500, E2_dimension: 64, vals: 13216000 + + +kernel execution time: 25.3126 ms +fused time: 25.8254 + +kernel execution time: 15.9278 ms +sddmm time: 16.6406 + +kernel execution time: 10.5087 ms +sddmm time: 11.2503 + +kernel execution time: 14.3281 ms +spmm time: 14.9822 + +kernel execution time: 98.03 ms +taco reference time: 98.7014 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 1000005, B2_dimension: 1000005, vals: 3105536 +C1_dimension: 1000005, C2_dimension: 64, vals: 64000320 +D1_dimension: 1000005, D2_dimension: 64, vals: 64000320 +E1_dimension: 1000005, E2_dimension: 64, vals: 64000320 + + +kernel execution time: 77.5645 ms +fused time: 78.0892 + +kernel execution 
time: 31.7247 ms +sddmm time: 32.4147 + +kernel execution time: 26.0367 ms +sddmm time: 26.7311 + +kernel execution time: 47.1564 ms +spmm time: 47.8767 + +kernel execution time: 444.658 ms +taco reference time: 445.356 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 5558326, B2_dimension: 5558326, vals: 59524291 +C1_dimension: 5558326, C2_dimension: 64, vals: 355732864 +D1_dimension: 5558326, D2_dimension: 64, vals: 355732864 +E1_dimension: 5558326, E2_dimension: 64, vals: 355732864 + + +kernel execution time: 760.552 ms +fused time: 761.497 + +kernel execution time: 414.806 ms +sddmm time: 415.511 + +kernel execution time: 347.288 ms +sddmm time: 348.046 + +kernel execution time: 493.652 ms +spmm time: 494.215 + +kernel execution time: 7069.3 ms +taco reference time: 7070.64 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/cant/cant.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 62451, B2_dimension: 62451, vals: 4007383 +C1_dimension: 62451, C2_dimension: 64, vals: 3996864 +D1_dimension: 62451, D2_dimension: 64, vals: 3996864 +E1_dimension: 62451, E2_dimension: 64, vals: 3996864 + + +kernel execution time: 14.868 ms +fused time: 15.3593 + +kernel execution time: 12.1237 ms +sddmm time: 12.798 + +kernel execution time: 7.68559 ms +sddmm time: 8.34388 + +kernel execution time: 7.93647 ms +spmm time: 8.56812 + +kernel execution time: 122.125 ms +taco reference time: 122.846 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/shipsec1/shipsec1.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 140874, B2_dimension: 140874, vals: 7813404 +C1_dimension: 140874, C2_dimension: 64, vals: 9015936 +D1_dimension: 140874, D2_dimension: 64, vals: 9015936 +E1_dimension: 
140874, E2_dimension: 64, vals: 9015936 + + +kernel execution time: 28.6635 ms +fused time: 29.1538 + +kernel execution time: 24.0642 ms +sddmm time: 24.694 + +kernel execution time: 15.2 ms +sddmm time: 15.875 + +kernel execution time: 16.0406 ms +spmm time: 16.6827 + +kernel execution time: 242.63 ms +taco reference time: 243.336 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/shipsec1/shipsec1.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 140874, B2_dimension: 140874, vals: 7813404 +C1_dimension: 140874, C2_dimension: 128, vals: 18031872 +D1_dimension: 140874, D2_dimension: 128, vals: 18031872 +E1_dimension: 140874, E2_dimension: 128, vals: 18031872 + + +kernel execution time: 50.9773 ms +fused time: 51.4656 + +kernel execution time: 42.0404 ms +sddmm time: 42.7352 + +kernel execution time: 24.4547 ms +sddmm time: 25.1418 + +kernel execution time: 28.4623 ms +spmm time: 29.1722 + +kernel execution time: 903.853 ms +taco reference time: 904.701 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/shipsec1/shipsec1.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 140874, B2_dimension: 140874, vals: 7813404 +C1_dimension: 140874, C2_dimension: 250, vals: 35218500 +D1_dimension: 140874, D2_dimension: 250, vals: 35218500 +E1_dimension: 140874, E2_dimension: 250, vals: 35218500 + + +kernel execution time: 97.1385 ms +fused time: 97.6193 + +kernel execution time: 87.9795 ms +sddmm time: 88.6535 + +kernel execution time: 41.8878 ms +sddmm time: 42.5463 + +kernel execution time: 54.1433 ms +spmm time: 54.7894 + +kernel execution time: 3669.52 ms +taco reference time: 3670.78 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/shipsec1/shipsec1.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 140874, B2_dimension: 140874, vals: 
7813404 +C1_dimension: 140874, C2_dimension: 512, vals: 72127488 +D1_dimension: 140874, D2_dimension: 512, vals: 72127488 +E1_dimension: 140874, E2_dimension: 512, vals: 72127488 + + +kernel execution time: 200.849 ms +fused time: 201.329 + +kernel execution time: 208.737 ms +sddmm time: 209.393 + +kernel execution time: 81.0923 ms +sddmm time: 81.7181 + +kernel execution time: 106.669 ms +spmm time: 107.272 + +kernel execution time: 15631.7 ms +taco reference time: 15632.4 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/shipsec1/shipsec1.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 140874, B2_dimension: 140874, vals: 7813404 +C1_dimension: 140874, C2_dimension: 32, vals: 4507968 +D1_dimension: 140874, D2_dimension: 32, vals: 4507968 +E1_dimension: 140874, E2_dimension: 32, vals: 4507968 + + +kernel execution time: 16.5631 ms +fused time: 17.0602 + +kernel execution time: 15.2542 ms +sddmm time: 15.8919 + +kernel execution time: 9.9104 ms +sddmm time: 10.5671 + +kernel execution time: 9.61101 ms +spmm time: 10.2251 + +kernel execution time: 68.1735 ms +taco reference time: 68.8921 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/shipsec1/shipsec1.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 140874, B2_dimension: 140874, vals: 7813404 +C1_dimension: 140874, C2_dimension: 256, vals: 36063744 +D1_dimension: 140874, D2_dimension: 256, vals: 36063744 +E1_dimension: 140874, E2_dimension: 256, vals: 36063744 + + +kernel execution time: 98.882 ms +fused time: 99.3547 + +kernel execution time: 90.4755 ms +sddmm time: 91.136 + +kernel execution time: 42.7487 ms +sddmm time: 43.4726 + +kernel execution time: 55.0127 ms +spmm time: 55.731 + +kernel execution time: 3836.15 ms +taco reference time: 3837.42 + + + + + +--------- single threads + + +sddmm-spmm execution + 
+----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 10974, B2_dimension: 10974, vals: 428650 +C1_dimension: 10974, C2_dimension: 64, vals: 702336 +D1_dimension: 10974, D2_dimension: 64, vals: 702336 +E1_dimension: 10974, E2_dimension: 64, vals: 702336 + + +kernel execution time: 22.3045 ms +fused time: 22.7793 + +kernel execution time: 8.91826 ms +sddmm time: 9.46409 + +kernel execution time: 9.62695 ms +sddmm time: 10.1105 + +kernel execution time: 10.8309 ms +spmm time: 11.2862 + +kernel execution time: 554.747 ms +taco reference time: 555.315 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 +C1_dimension: 36417, C2_dimension: 64, vals: 2330688 +D1_dimension: 36417, D2_dimension: 64, vals: 2330688 +E1_dimension: 36417, E2_dimension: 64, vals: 2330688 + + +kernel execution time: 166.569 ms +fused time: 167.058 + +kernel execution time: 83.9979 ms +sddmm time: 84.5309 + +kernel execution time: 88.9971 ms +sddmm time: 89.5559 + +kernel execution time: 68.5334 ms +spmm time: 69.0587 + +kernel execution time: 5562.04 ms +taco reference time: 5563.12 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/rma10/rma10.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 46835, B2_dimension: 46835, vals: 2374001 +C1_dimension: 46835, C2_dimension: 64, vals: 2997440 +D1_dimension: 46835, D2_dimension: 64, vals: 2997440 +E1_dimension: 46835, E2_dimension: 64, vals: 2997440 + + +kernel execution time: 94.7764 ms +fused time: 95.2526 + +kernel execution time: 47.3174 ms +sddmm time: 47.8674 + +kernel execution time: 49.7766 ms +sddmm time: 50.3372 + +kernel execution time: 51.3685 ms +spmm time: 
51.8719 + +kernel execution time: 3073.44 ms +taco reference time: 3074.55 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/cant/cant.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 62451, B2_dimension: 62451, vals: 4007383 +C1_dimension: 62451, C2_dimension: 64, vals: 3996864 +D1_dimension: 62451, D2_dimension: 64, vals: 3996864 +E1_dimension: 62451, E2_dimension: 64, vals: 3996864 + + +kernel execution time: 158.175 ms +fused time: 158.637 + +kernel execution time: 78.3163 ms +sddmm time: 78.8675 + +kernel execution time: 82.3237 ms +sddmm time: 82.8606 + +kernel execution time: 76.2056 ms +spmm time: 76.7067 + +kernel execution time: 5178.46 ms +taco reference time: 5179.53 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/consph/consph.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 83334, B2_dimension: 83334, vals: 6010480 +C1_dimension: 83334, C2_dimension: 64, vals: 5333376 +D1_dimension: 83334, D2_dimension: 64, vals: 5333376 +E1_dimension: 83334, E2_dimension: 64, vals: 5333376 + + +kernel execution time: 241.194 ms +fused time: 241.676 + +kernel execution time: 117.775 ms +sddmm time: 118.325 + +kernel execution time: 124.006 ms +sddmm time: 124.563 + +kernel execution time: 117.052 ms +spmm time: 117.594 + +kernel execution time: 7844.57 ms +taco reference time: 7845.69 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 121192, B2_dimension: 121192, vals: 2624331 +C1_dimension: 121192, C2_dimension: 64, vals: 7756288 +D1_dimension: 121192, D2_dimension: 64, vals: 7756288 +E1_dimension: 121192, E2_dimension: 64, vals: 7756288 + + +kernel execution time: 201.49 ms +fused time: 201.973 + +kernel execution time: 90.6759 ms +sddmm time: 91.2506 + 
+kernel execution time: 93.0462 ms +sddmm time: 93.6053 + +kernel execution time: 119.005 ms +spmm time: 119.547 + +kernel execution time: 3567.55 ms +taco reference time: 3568.67 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/shipsec1/shipsec1.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 140874, B2_dimension: 140874, vals: 7813404 +C1_dimension: 140874, C2_dimension: 64, vals: 9015936 +D1_dimension: 140874, D2_dimension: 64, vals: 9015936 +E1_dimension: 140874, E2_dimension: 64, vals: 9015936 + + +kernel execution time: 315.238 ms +fused time: 315.723 + +kernel execution time: 156.048 ms +sddmm time: 156.588 + +kernel execution time: 164.148 ms +sddmm time: 164.747 + +kernel execution time: 162.502 ms +spmm time: 163.021 + +kernel execution time: 10131.2 ms +taco reference time: 10132.3 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/scircuit/scircuit.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 170998, B2_dimension: 170998, vals: 958936 +C1_dimension: 170998, C2_dimension: 64, vals: 10943872 +D1_dimension: 170998, D2_dimension: 64, vals: 10943872 +E1_dimension: 170998, E2_dimension: 64, vals: 10943872 + + +kernel execution time: 87.9511 ms +fused time: 88.4267 + +kernel execution time: 37.6228 ms +sddmm time: 38.1792 + +kernel execution time: 37.8418 ms +sddmm time: 38.3903 + +kernel execution time: 84.4997 ms +spmm time: 85.037 + +kernel execution time: 1330.01 ms +taco reference time: 1330.63 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 206500, B2_dimension: 206500, vals: 1273389 +C1_dimension: 206500, C2_dimension: 64, vals: 13216000 +D1_dimension: 206500, D2_dimension: 64, vals: 13216000 +E1_dimension: 206500, E2_dimension: 
64, vals: 13216000 + + +kernel execution time: 92.8914 ms +fused time: 93.3697 + +kernel execution time: 39.7714 ms +sddmm time: 40.3051 + +kernel execution time: 40.1835 ms +sddmm time: 40.7458 + +kernel execution time: 98.0818 ms +spmm time: 98.5997 + +kernel execution time: 1721.01 ms +taco reference time: 1721.64 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 1000005, B2_dimension: 1000005, vals: 3105536 +C1_dimension: 1000005, C2_dimension: 64, vals: 64000320 +D1_dimension: 1000005, D2_dimension: 64, vals: 64000320 +E1_dimension: 1000005, E2_dimension: 64, vals: 64000320 + + +kernel execution time: 259.845 ms +fused time: 260.329 + +kernel execution time: 95.8311 ms +sddmm time: 96.3809 + +kernel execution time: 97.6925 ms +sddmm time: 98.2397 + +kernel execution time: 292.415 ms +spmm time: 292.952 + +kernel execution time: 4292.03 ms +taco reference time: 4293.1 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 5558326, B2_dimension: 5558326, vals: 59524291 +C1_dimension: 5558326, C2_dimension: 64, vals: 355732864 +D1_dimension: 5558326, D2_dimension: 64, vals: 355732864 +E1_dimension: 5558326, E2_dimension: 64, vals: 355732864 + + +kernel execution time: 3326.66 ms +fused time: 3327.64 + +kernel execution time: 1617.82 ms +sddmm time: 1618.36 + +kernel execution time: 1672.73 ms +sddmm time: 1673.27 + +kernel execution time: 3199.32 ms +spmm time: 3200.35 + +kernel execution time: 88682 ms +taco reference time: 88683.1 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 5558326, B2_dimension: 5558326, vals: 59524291 
+C1_dimension: 5558326, C2_dimension: 64, vals: 355732864 +D1_dimension: 5558326, D2_dimension: 64, vals: 355732864 +E1_dimension: 5558326, E2_dimension: 64, vals: 355732864 + + +kernel execution time: 722.484 ms +fused time: 723.506 + +kernel execution time: 613.844 ms +sddmm time: 614.401 + +kernel execution time: 331.43 ms +sddmm time: 331.978 + +kernel execution time: 463.752 ms +spmm time: 464.328 + +kernel execution time: 8864.13 ms +taco reference time: 8865.18 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/shipsec1/shipsec1.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 140874, B2_dimension: 140874, vals: 7813404 +C1_dimension: 140874, C2_dimension: 16, vals: 2253984 +D1_dimension: 140874, D2_dimension: 16, vals: 2253984 +E1_dimension: 140874, E2_dimension: 16, vals: 2253984 + + +kernel execution time: 10.0607 ms +fused time: 10.5457 + +kernel execution time: 8.70278 ms +sddmm time: 9.26539 + +kernel execution time: 6.88021 ms +sddmm time: 7.49853 + +kernel execution time: 5.91127 ms +spmm time: 6.50028 + +kernel execution time: 23.776 ms +taco reference time: 24.3947 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 1000005, B2_dimension: 1000005, vals: 3105536 +C1_dimension: 1000005, C2_dimension: 64, vals: 64000320 +D1_dimension: 1000005, D2_dimension: 64, vals: 64000320 +E1_dimension: 1000005, E2_dimension: 64, vals: 64000320 + + +kernel execution time: 179.752 ms +fused time: 180.214 + +kernel execution time: 170.678 ms +sddmm time: 171.224 + +kernel execution time: 67.5166 ms +sddmm time: 68.0688 + +kernel execution time: 168.557 ms +spmm time: 169.083 + +kernel execution time: 2452.7 ms +taco reference time: 2453.34 + +sddmm-spmm execution + +----------------------------------------- 
+/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 1000005, B2_dimension: 1000005, vals: 3105536 +C1_dimension: 1000005, C2_dimension: 64, vals: 64000320 +D1_dimension: 1000005, D2_dimension: 64, vals: 64000320 +E1_dimension: 1000005, E2_dimension: 64, vals: 64000320 + + +kernel execution time: 111.508 ms +fused time: 111.983 + +kernel execution time: 171.316 ms +sddmm time: 171.863 + +kernel execution time: 40.3219 ms +sddmm time: 40.8676 + +kernel execution time: 91.8855 ms +spmm time: 92.3888 + +kernel execution time: 1349.98 ms +taco reference time: 1350.57 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 1000005, B2_dimension: 1000005, vals: 3105536 +C1_dimension: 1000005, C2_dimension: 64, vals: 64000320 +D1_dimension: 1000005, D2_dimension: 64, vals: 64000320 +E1_dimension: 1000005, E2_dimension: 64, vals: 64000320 + + +kernel execution time: 84.4185 ms +fused time: 84.8803 + +kernel execution time: 131.898 ms +sddmm time: 132.465 + +kernel execution time: 27.6062 ms +sddmm time: 28.2117 + +kernel execution time: 59.0816 ms +spmm time: 59.6189 + +kernel execution time: 731.805 ms +taco reference time: 732.441 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 1000005, B2_dimension: 1000005, vals: 3105536 +C1_dimension: 1000005, C2_dimension: 64, vals: 64000320 +D1_dimension: 1000005, D2_dimension: 64, vals: 64000320 +E1_dimension: 1000005, E2_dimension: 64, vals: 64000320 + + +kernel execution time: 76.4489 ms +fused time: 76.9087 + +kernel execution time: 65.9875 ms +sddmm time: 66.5522 + +kernel execution time: 25.2905 ms +sddmm time: 25.8759 + +kernel execution time: 50.1563 ms +spmm time: 50.6842 + 
+kernel execution time: 397.479 ms +taco reference time: 398.109 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 1000005, B2_dimension: 1000005, vals: 3105536 +C1_dimension: 1000005, C2_dimension: 64, vals: 64000320 +D1_dimension: 1000005, D2_dimension: 64, vals: 64000320 +E1_dimension: 1000005, E2_dimension: 64, vals: 64000320 + + +kernel execution time: 74.0227 ms +fused time: 74.5259 + +kernel execution time: 40.2983 ms +sddmm time: 40.889 + +kernel execution time: 25.1349 ms +sddmm time: 25.7522 + +kernel execution time: 46.3853 ms +spmm time: 46.9556 + +kernel execution time: 418.693 ms +taco reference time: 419.345 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 5558326, B2_dimension: 5558326, vals: 59524291 +C1_dimension: 5558326, C2_dimension: 64, vals: 355732864 +D1_dimension: 5558326, D2_dimension: 64, vals: 355732864 +E1_dimension: 5558326, E2_dimension: 64, vals: 355732864 + + +kernel execution time: 1982.06 ms +fused time: 1982.93 + +kernel execution time: 1668.23 ms +sddmm time: 1668.77 + +kernel execution time: 962.046 ms +sddmm time: 962.591 + +kernel execution time: 1821.97 ms +spmm time: 1822.46 + +kernel execution time: 47772.2 ms +taco reference time: 47773.4 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 5558326, B2_dimension: 5558326, vals: 59524291 +C1_dimension: 5558326, C2_dimension: 64, vals: 355732864 +D1_dimension: 5558326, D2_dimension: 64, vals: 355732864 +E1_dimension: 5558326, E2_dimension: 64, vals: 355732864 + + +kernel execution time: 1143.12 ms +fused time: 1144.05 + +kernel execution 
time: 1254.57 ms +sddmm time: 1255.18 + +kernel execution time: 539.54 ms +sddmm time: 540.136 + +kernel execution time: 1005.14 ms +spmm time: 1005.69 + +kernel execution time: 25805.1 ms +taco reference time: 25806.1 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 5558326, B2_dimension: 5558326, vals: 59524291 +C1_dimension: 5558326, C2_dimension: 64, vals: 355732864 +D1_dimension: 5558326, D2_dimension: 64, vals: 355732864 +E1_dimension: 5558326, E2_dimension: 64, vals: 355732864 + + +kernel execution time: 782.496 ms +fused time: 783.574 + +kernel execution time: 872.793 ms +sddmm time: 873.351 + +kernel execution time: 353.256 ms +sddmm time: 353.8 + +kernel execution time: 606.511 ms +spmm time: 607.041 + +kernel execution time: 15198.9 ms +taco reference time: 15199.9 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 5558326, B2_dimension: 5558326, vals: 59524291 +C1_dimension: 5558326, C2_dimension: 64, vals: 355732864 +D1_dimension: 5558326, D2_dimension: 64, vals: 355732864 +E1_dimension: 5558326, E2_dimension: 64, vals: 355732864 + + +kernel execution time: 729.345 ms +fused time: 730.242 + +kernel execution time: 608.324 ms +sddmm time: 608.908 + +kernel execution time: 334.109 ms +sddmm time: 334.653 + +kernel execution time: 471.211 ms +spmm time: 471.77 + +kernel execution time: 8630.19 ms +taco reference time: 8631.29 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 5558326, B2_dimension: 5558326, vals: 59524291 +C1_dimension: 5558326, C2_dimension: 64, vals: 355732864 +D1_dimension: 5558326, 
D2_dimension: 64, vals: 355732864 +E1_dimension: 5558326, E2_dimension: 64, vals: 355732864 + + +kernel execution time: 736.326 ms +fused time: 737.203 + +kernel execution time: 482.639 ms +sddmm time: 483.19 + +kernel execution time: 333.58 ms +sddmm time: 334.131 + +kernel execution time: 478.49 ms +spmm time: 479.051 + +kernel execution time: 7244.99 ms +taco reference time: 7246.13 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/cant/cant.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 62451, B2_dimension: 62451, vals: 4007383 +C1_dimension: 62451, C2_dimension: 64, vals: 3996864 +D1_dimension: 62451, D2_dimension: 64, vals: 3996864 +E1_dimension: 62451, E2_dimension: 64, vals: 3996864 + + +kernel execution time: 13.4143 ms +fused time: 13.9143 + +kernel execution time: 11.2836 ms +sddmm time: 12.0149 + +kernel execution time: 7.35609 ms +sddmm time: 8.06588 + +kernel execution time: 7.36916 ms +spmm time: 7.93476 + +kernel execution time: 120.287 ms +taco reference time: 120.948 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/cant/cant.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 62451, B2_dimension: 62451, vals: 4007383 +C1_dimension: 62451, C2_dimension: 64, vals: 3996864 +D1_dimension: 62451, D2_dimension: 64, vals: 3996864 +E1_dimension: 62451, E2_dimension: 64, vals: 3996864 + + +kernel execution time: 156.322 ms +fused time: 156.802 + +kernel execution time: 77.0794 ms +sddmm time: 77.6574 + +kernel execution time: 81.2772 ms +sddmm time: 81.8141 + +kernel execution time: 74.4419 ms +spmm time: 74.9538 + +kernel execution time: 5091.25 ms +taco reference time: 5092.34 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/cant/cant.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 62451, B2_dimension: 62451, 
vals: 4007383 +C1_dimension: 62451, C2_dimension: 64, vals: 3996864 +D1_dimension: 62451, D2_dimension: 64, vals: 3996864 +E1_dimension: 62451, E2_dimension: 64, vals: 3996864 + + +kernel execution time: 160.868 ms +fused time: 161.347 + +kernel execution time: 78.1223 ms +sddmm time: 78.7031 + +kernel execution time: 82.4929 ms +sddmm time: 83.0729 + +kernel execution time: 77.24 ms +spmm time: 77.7896 + +kernel execution time: 5087.42 ms +taco reference time: 5088.53 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/cant/cant.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 62451, B2_dimension: 62451, vals: 4007383 +C1_dimension: 62451, C2_dimension: 64, vals: 3996864 +D1_dimension: 62451, D2_dimension: 64, vals: 3996864 +E1_dimension: 62451, E2_dimension: 64, vals: 3996864 + + +kernel execution time: 157.627 ms +fused time: 158.106 + +kernel execution time: 76.9497 ms +sddmm time: 77.5265 + +kernel execution time: 81.9491 ms +sddmm time: 82.4945 + +kernel execution time: 81.9841 ms +spmm time: 82.5149 + +kernel execution time: 5084.06 ms +taco reference time: 5085.15 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/cant/cant.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 62451, B2_dimension: 62451, vals: 4007383 +C1_dimension: 62451, C2_dimension: 64, vals: 3996864 +D1_dimension: 62451, D2_dimension: 64, vals: 3996864 +E1_dimension: 62451, E2_dimension: 64, vals: 3996864 + + +kernel execution time: 156.608 ms +fused time: 157.085 + +kernel execution time: 76.6969 ms +sddmm time: 77.2366 + +kernel execution time: 80.7238 ms +sddmm time: 81.2624 + +kernel execution time: 74.4498 ms +spmm time: 74.9694 + +kernel execution time: 5076.16 ms +taco reference time: 5077.28 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/cant/cant.mtx 
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 62451, B2_dimension: 62451, vals: 4007383 +C1_dimension: 62451, C2_dimension: 64, vals: 3996864 +D1_dimension: 62451, D2_dimension: 64, vals: 3996864 +E1_dimension: 62451, E2_dimension: 64, vals: 3996864 + + +kernel execution time: 156.489 ms +fused time: 156.996 + +kernel execution time: 77.2215 ms +sddmm time: 77.7763 + +kernel execution time: 81.2983 ms +sddmm time: 81.8357 + +kernel execution time: 75.4752 ms +spmm time: 76.0191 + +kernel execution time: 5087.37 ms +taco reference time: 5088.51 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/cant/cant.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 62451, B2_dimension: 62451, vals: 4007383 +C1_dimension: 62451, C2_dimension: 64, vals: 3996864 +D1_dimension: 62451, D2_dimension: 64, vals: 3996864 +E1_dimension: 62451, E2_dimension: 64, vals: 3996864 + + +kernel execution time: 156.515 ms +fused time: 156.991 + +kernel execution time: 76.9797 ms +sddmm time: 77.5298 + +kernel execution time: 81.4654 ms +sddmm time: 82.0017 + +kernel execution time: 76.1847 ms +spmm time: 76.693 + +kernel execution time: 5078.68 ms +taco reference time: 5079.85 + +sddmm-spmm execution + +----------------------------------------- +A(i) = B(i,j) * C(j,k) * v(k); +B1_dimension: 5, B2_dimension: 5, vals: 19 +C1_dimension: 5, C2_dimension: 5, vals: 19 +D1_dimension: 5, vals: 5 + + +sddmm-spmm execution + +----------------------------------------- +A(i) = B(i,j) * C(j,k) * v(k); +B1_dimension: 5, B2_dimension: 5, vals: 19 +C1_dimension: 5, C2_dimension: 5, vals: 19 +D1_dimension: 5, vals: 5 + + +sddmm-spmm execution + +----------------------------------------- +A(i) = B(i,j) * C(j,k) * v(k); +B1_dimension: 5, B2_dimension: 5, vals: 19 +C1_dimension: 5, C2_dimension: 5, vals: 19 +D1_dimension: 5, vals: 5 + + +sddmm-spmm execution + +----------------------------------------- +A(i) = B(i,j) * C(j,k) * 
v(k); +B1_dimension: 5, B2_dimension: 5, vals: 19 +C1_dimension: 5, C2_dimension: 5, vals: 19 +D1_dimension: 5, vals: 5 + + +sddmm-spmm execution + +----------------------------------------- +A(i) = B(i,j) * C(j,k) * v(k); +B1_dimension: 5, B2_dimension: 5, vals: 19 +C1_dimension: 5, C2_dimension: 5, vals: 19 +D1_dimension: 5, vals: 5 + + +sddmm-spmm execution + +----------------------------------------- +A(i) = B(i,j) * C(j,k) * v(k); +B1_dimension: 5, B2_dimension: 5, vals: 19 +C1_dimension: 5, C2_dimension: 5, vals: 19 +D1_dimension: 5, vals: 5 + + +sddmm-spmm execution + +----------------------------------------- +A(i) = B(i,j) * C(j,k) * v(k); +B1_dimension: 5, B2_dimension: 5, vals: 19 +C1_dimension: 5, C2_dimension: 5, vals: 19 +D1_dimension: 5, vals: 5 + + +sddmm-spmm execution + +----------------------------------------- +A(i) = B(i,j) * C(j,k) * v(k); +B1_dimension: 5, B2_dimension: 5, vals: 19 +C1_dimension: 5, C2_dimension: 5, vals: 19 +D1_dimension: 5, vals: 5 + + +sddmm-spmm execution + +----------------------------------------- +A(i) = B(i,j) * C(j,k) * v(k); +B1_dimension: 5, B2_dimension: 5, vals: 19 +C1_dimension: 5, C2_dimension: 5, vals: 19 +D1_dimension: 5, vals: 5 + + +sddmm-spmm execution + +----------------------------------------- +A(i) = B(i,j) * C(j,k) * v(k); +B1_dimension: 5, B2_dimension: 5, vals: 19 +C1_dimension: 5, C2_dimension: 5, vals: 19 +D1_dimension: 5, vals: 5 + + +sddmm-spmm execution + +----------------------------------------- +A(i) = B(i,j) * C(j,k) * v(k); +B1_dimension: 5, B2_dimension: 5, vals: 19 +C1_dimension: 5, C2_dimension: 5, vals: 19 +D1_dimension: 5, vals: 5 + + +sddmm-spmm execution + +----------------------------------------- +A(i) = B(i,j) * C(j,k) * v(k); +B1_dimension: 5, B2_dimension: 5, vals: 19 +C1_dimension: 5, C2_dimension: 5, vals: 19 +D1_dimension: 5, vals: 5 + + +sddmm-spmm execution + +----------------------------------------- +A(i) = B(i,j) * C(j,k) * v(k); +B1_dimension: 5, B2_dimension: 5, vals: 
19 +C1_dimension: 5, C2_dimension: 5, vals: 19 +D1_dimension: 5, vals: 5 + + +sddmm-spmm execution + +----------------------------------------- +A(i) = B(i,j) * C(j,k) * v(k); +B1_dimension: 5, B2_dimension: 5, vals: 19 +C1_dimension: 5, C2_dimension: 5, vals: 19 +D1_dimension: 5, vals: 5 + + +sddmm-spmm execution + +----------------------------------------- +A(i) = B(i,j) * C(j,k) * v(k); +B1_dimension: 5, B2_dimension: 5, vals: 19 +C1_dimension: 5, C2_dimension: 5, vals: 19 +D1_dimension: 5, vals: 5 + + +sddmm-spmm execution + +----------------------------------------- +A(i) = B(i,j) * C(j,k) * v(k); +B1_dimension: 5, B2_dimension: 5, vals: 19 +C1_dimension: 5, C2_dimension: 5, vals: 19 +D1_dimension: 5, vals: 5 + + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 5, B2_dimension: 5, vals: 19 +C1_dimension: 5, C2_dimension: 64, vals: 320 +D1_dimension: 5, D2_dimension: 64, vals: 320 +E1_dimension: 64, E2_dimension: 64, vals: 4096 + + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 5, B2_dimension: 5, vals: 19 +C1_dimension: 5, C2_dimension: 64, vals: 320 +D1_dimension: 5, D2_dimension: 64, vals: 320 +E1_dimension: 64, E2_dimension: 64, vals: 4096 + + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 +C1_dimension: 36417, C2_dimension: 64, vals: 2330688 +D1_dimension: 36417, D2_dimension: 64, vals: 2330688 +E1_dimension: 64, E2_dimension: 64, vals: 4096 + + +kernel 
execution time: 115.102 ms +fused time: 115.803 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/consph/consph.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 83334, B2_dimension: 83334, vals: 6010480 +C1_dimension: 83334, C2_dimension: 128, vals: 10666752 +D1_dimension: 83334, D2_dimension: 128, vals: 10666752 +E1_dimension: 83334, E2_dimension: 128, vals: 10666752 + + +kernel execution time: 30.977 ms +fused time: 35.4912 + +separate execution + +kernel execution time: 26.0898 ms +sddmm time: 26.6915 + +kernel execution time: 15.4341 ms +sddmm time: 16.0058 + +kernel execution time: 17.7466 ms +spmm time: 18.2995 + +reference execution + +kernel execution time: 694.171 ms +taco reference time: 694.888 +/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 121192, B2_dimension: 121192, vals: 2624331 +C1_dimension: 121192, C2_dimension: 128, vals: 15512576 +D1_dimension: 121192, D2_dimension: 128, vals: 15512576 +E1_dimension: 121192, E2_dimension: 128, vals: 15512576 + + +kernel execution time: 52.5109 ms +fused time: 56.6803 + +separate execution + +kernel execution time: 41.9638 ms +sddmm time: 42.5925 + +kernel execution time: 21.3537 ms +sddmm time: 21.9855 + +kernel execution time: 25.1185 ms +spmm time: 25.7047 + +reference execution + +kernel execution time: 323.01 ms +taco reference time: 323.699 +/home/min/a/kadhitha/ispc-examples/data/suitesparse/shipsec1/shipsec1.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 140874, B2_dimension: 140874, vals: 7813404 +C1_dimension: 140874, C2_dimension: 128, vals: 18031872 +D1_dimension: 140874, D2_dimension: 128, vals: 18031872 +E1_dimension: 140874, E2_dimension: 128, vals: 18031872 + + +kernel execution time: 45.3128 ms +fused time: 48.4929 + +separate execution + +kernel execution time: 39.7986 ms +sddmm time: 40.3901 + +kernel execution time: 
20.8296 ms +sddmm time: 21.432 + +kernel execution time: 25.0308 ms +spmm time: 25.5726 + +reference execution + +kernel execution time: 867.794 ms +taco reference time: 868.418 +/home/min/a/kadhitha/ispc-examples/data/suitesparse/scircuit/scircuit.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 170998, B2_dimension: 170998, vals: 958936 +C1_dimension: 170998, C2_dimension: 128, vals: 21887744 +D1_dimension: 170998, D2_dimension: 128, vals: 21887744 +E1_dimension: 170998, E2_dimension: 128, vals: 21887744 + + +kernel execution time: 34.2915 ms +fused time: 38.221 + +separate execution + +kernel execution time: 18.8777 ms +sddmm time: 19.4859 + +kernel execution time: 12.8794 ms +sddmm time: 16.5695 + +kernel execution time: 19.7876 ms +spmm time: 23.5933 + +reference execution + +kernel execution time: 114.374 ms +taco reference time: 115.03 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/consph/consph.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 83334, B2_dimension: 83334, vals: 6010480 +C1_dimension: 83334, C2_dimension: 128, vals: 10666752 +D1_dimension: 83334, D2_dimension: 128, vals: 10666752 +E1_dimension: 83334, E2_dimension: 128, vals: 10666752 + + +kernel execution time: 77.2194 ms +fused time: 78.1408 + +separate execution + +kernel execution time: 28.0545 ms +sddmm time: 28.625 + +kernel execution time: 15.7941 ms +sddmm time: 16.3986 + +kernel execution time: 18.1167 ms +spmm time: 18.7055 + +reference execution + +kernel execution time: 652.088 ms +taco reference time: 652.794 +/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 121192, B2_dimension: 121192, vals: 2624331 +C1_dimension: 121192, C2_dimension: 128, vals: 15512576 +D1_dimension: 121192, D2_dimension: 128, vals: 15512576 +E1_dimension: 121192, E2_dimension: 128, vals: 15512576 + + +kernel execution time: 100.999 ms 
+fused time: 104.98 + +separate execution + +kernel execution time: 42.4345 ms +sddmm time: 43.0804 + +kernel execution time: 21.5005 ms +sddmm time: 22.1326 + +kernel execution time: 25.1479 ms +spmm time: 25.7284 + +reference execution + +kernel execution time: 303.541 ms +taco reference time: 304.249 +/home/min/a/kadhitha/ispc-examples/data/suitesparse/shipsec1/shipsec1.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 140874, B2_dimension: 140874, vals: 7813404 +C1_dimension: 140874, C2_dimension: 128, vals: 18031872 +D1_dimension: 140874, D2_dimension: 128, vals: 18031872 +E1_dimension: 140874, E2_dimension: 128, vals: 18031872 + + +kernel execution time: 121.702 ms +fused time: 122.44 + +separate execution + +kernel execution time: 41.1645 ms +sddmm time: 41.7679 + +kernel execution time: 21.4454 ms +sddmm time: 22.062 + +kernel execution time: 25.7274 ms +spmm time: 26.3069 + +reference execution + +kernel execution time: 838.679 ms +taco reference time: 839.358 +/home/min/a/kadhitha/ispc-examples/data/suitesparse/scircuit/scircuit.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 170998, B2_dimension: 170998, vals: 958936 +C1_dimension: 170998, C2_dimension: 128, vals: 21887744 +D1_dimension: 170998, D2_dimension: 128, vals: 21887744 +E1_dimension: 170998, E2_dimension: 128, vals: 21887744 + + +kernel execution time: 49.6789 ms +fused time: 53.8345 + +separate execution + +kernel execution time: 19.3289 ms +sddmm time: 19.9476 + +kernel execution time: 12.9298 ms +sddmm time: 16.5522 + +kernel execution time: 19.7859 ms +spmm time: 23.3756 + +reference execution + +kernel execution time: 114.935 ms +taco reference time: 115.594 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/consph/consph.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 83334, B2_dimension: 83334, vals: 6010480 +C1_dimension: 83334, C2_dimension: 128, vals: 10666752 +D1_dimension: 83334, 
D2_dimension: 128, vals: 10666752 +E1_dimension: 83334, E2_dimension: 128, vals: 10666752 + + +kernel execution time: 29.3495 ms +fused time: 32.2304 + +separate execution + +kernel execution time: 23.942 ms +sddmm time: 24.54 + +kernel execution time: 14.4886 ms +sddmm time: 16.5358 + +kernel execution time: 16.8516 ms +spmm time: 20.2626 + +reference execution + +kernel execution time: 709.96 ms +taco reference time: 710.774 +/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 121192, B2_dimension: 121192, vals: 2624331 +C1_dimension: 121192, C2_dimension: 128, vals: 15512576 +D1_dimension: 121192, D2_dimension: 128, vals: 15512576 +E1_dimension: 121192, E2_dimension: 128, vals: 15512576 + + +kernel execution time: 58.2762 ms +fused time: 62.5278 + +separate execution + +kernel execution time: 42.1594 ms +sddmm time: 42.7262 + +kernel execution time: 22.1442 ms +sddmm time: 23.0064 + +kernel execution time: 25.7924 ms +spmm time: 26.3623 + +reference execution + +kernel execution time: 329.572 ms +taco reference time: 330.27 +/home/min/a/kadhitha/ispc-examples/data/suitesparse/shipsec1/shipsec1.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 140874, B2_dimension: 140874, vals: 7813404 +C1_dimension: 140874, C2_dimension: 128, vals: 18031872 +D1_dimension: 140874, D2_dimension: 128, vals: 18031872 +E1_dimension: 140874, E2_dimension: 128, vals: 18031872 + + +kernel execution time: 46.007 ms +fused time: 50.2274 + +separate execution + +kernel execution time: 41.4699 ms +sddmm time: 42.0415 + +kernel execution time: 21.559 ms +sddmm time: 22.136 + +kernel execution time: 25.525 ms +spmm time: 26.0801 + +reference execution + +kernel execution time: 869.823 ms +taco reference time: 873.823 +/home/min/a/kadhitha/ispc-examples/data/suitesparse/scircuit/scircuit.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 170998, B2_dimension: 170998, vals: 958936 +C1_dimension: 170998, 
C2_dimension: 128, vals: 21887744 +D1_dimension: 170998, D2_dimension: 128, vals: 21887744 +E1_dimension: 170998, E2_dimension: 128, vals: 21887744 + + +kernel execution time: 33.3907 ms +fused time: 37.2851 + +separate execution + +kernel execution time: 19.369 ms +sddmm time: 19.9378 + +kernel execution time: 12.956 ms +sddmm time: 15.1889 + +kernel execution time: 19.8054 ms +spmm time: 23.5126 + +reference execution + +kernel execution time: 115.104 ms +taco reference time: 115.684 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 206500, B2_dimension: 206500, vals: 1273389 +C1_dimension: 206500, C2_dimension: 128, vals: 26432000 +D1_dimension: 206500, D2_dimension: 128, vals: 26432000 +E1_dimension: 206500, E2_dimension: 128, vals: 26432000 + + +kernel execution time: 45.2869 ms +fused time: 49.074 + +separate execution + +kernel execution time: 20.8037 ms +sddmm time: 21.3769 + +kernel execution time: 18.6117 ms +sddmm time: 19.1765 + +kernel execution time: 27.6368 ms +spmm time: 28.2194 + +reference execution + +kernel execution time: 157.83 ms +taco reference time: 158.458 +/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 1000005, B2_dimension: 1000005, vals: 3105536 +C1_dimension: 1000005, C2_dimension: 128, vals: 128000640 +D1_dimension: 1000005, D2_dimension: 128, vals: 128000640 +E1_dimension: 1000005, E2_dimension: 128, vals: 128000640 + + +kernel execution time: 133.416 ms +fused time: 137.603 + +separate execution + +kernel execution time: 50.8463 ms +sddmm time: 51.4255 + +kernel execution time: 41.2442 ms +sddmm time: 41.8788 + +kernel execution time: 83.4032 ms +spmm time: 84.052 + +reference execution + +kernel execution time: 569.216 ms +taco reference time: 570.035 
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 5558326, B2_dimension: 5558326, vals: 59524291 +C1_dimension: 5558326, C2_dimension: 128, vals: 711465728 +D1_dimension: 5558326, D2_dimension: 128, vals: 711465728 +E1_dimension: 5558326, E2_dimension: 128, vals: 711465728 + + +kernel execution time: 1282.76 ms +fused time: 1287.59 + +separate execution + +kernel execution time: 606.985 ms +sddmm time: 607.616 + +kernel execution time: 561.224 ms +sddmm time: 561.958 + +kernel execution time: 874.527 ms +spmm time: 875.232 + +reference execution + +kernel execution time: 21707 ms +taco reference time: 21710.6 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 5, B2_dimension: 5, vals: 19 +C1_dimension: 5, C2_dimension: 128, vals: 640 +D1_dimension: 5, D2_dimension: 128, vals: 640 +E1_dimension: 5, E2_dimension: 128, vals: 640 + + +kernel execution time: 3.43602 ms +fused time: 27.8707 + +separate execution + +kernel execution time: 4107.02 ms +sddmm time: 4122.77 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 5, B2_dimension: 5, vals: 19 +C1_dimension: 5, C2_dimension: 128, vals: 640 +D1_dimension: 5, D2_dimension: 128, vals: 640 +E1_dimension: 5, E2_dimension: 128, vals: 640 + + +kernel execution time: 0.115981 ms +fused time: 0.499507 + +separate execution + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 5, B2_dimension: 
5, vals: 19 +C1_dimension: 5, C2_dimension: 128, vals: 640 +D1_dimension: 5, D2_dimension: 128, vals: 640 +E1_dimension: 5, E2_dimension: 128, vals: 640 + + +kernel execution time: 0.133052 ms +fused time: 3.69599 + +separate execution + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 2700, B2_dimension: 2700, vals: 5400 +C1_dimension: 2700, C2_dimension: 128, vals: 345600 +D1_dimension: 2700, D2_dimension: 128, vals: 345600 +E1_dimension: 2700, E2_dimension: 128, vals: 345600 + + +kernel execution time: 0.606469 ms +fused time: 4.32552 + +separate execution + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 2700, B2_dimension: 2700, vals: 5400 +C1_dimension: 2700, C2_dimension: 128, vals: 345600 +D1_dimension: 2700, D2_dimension: 128, vals: 345600 +E1_dimension: 2700, E2_dimension: 128, vals: 345600 + + +kernel execution time: 0.650529 ms +fused time: 1.40893 + +separate execution + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 2708, B2_dimension: 2708, vals: 5400 +C1_dimension: 2708, C2_dimension: 128, vals: 346624 +D1_dimension: 2708, D2_dimension: 128, vals: 346624 +E1_dimension: 2708, E2_dimension: 128, vals: 346624 + + +kernel execution time: 0.620999 ms +fused time: 1.38301 + +separate execution + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 2708, B2_dimension: 2708, vals: 5400 +C1_dimension: 2708, C2_dimension: 128, vals: 346624 +D1_dimension: 2708, D2_dimension: 128, vals: 346624 +E1_dimension: 
2708, E2_dimension: 128, vals: 346624 + + +kernel execution time: 0.652959 ms +fused time: 3.94184 + +separate execution + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 2708, B2_dimension: 2708, vals: 5429 +C1_dimension: 2708, C2_dimension: 128, vals: 346624 +D1_dimension: 2708, D2_dimension: 128, vals: 346624 +E1_dimension: 2708, E2_dimension: 128, vals: 346624 + + +kernel execution time: 0.597158 ms +fused time: 4.27836 + +separate execution + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 2708, B2_dimension: 2708, vals: 5429 +C1_dimension: 2708, C2_dimension: 128, vals: 346624 +D1_dimension: 2708, D2_dimension: 128, vals: 346624 +E1_dimension: 2708, E2_dimension: 128, vals: 346624 + + +kernel execution time: 0.659809 ms +fused time: 4.6484 + +separate execution + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 2708, B2_dimension: 2708, vals: 5429 +C1_dimension: 2708, C2_dimension: 128, vals: 346624 +D1_dimension: 2708, D2_dimension: 128, vals: 346624 +E1_dimension: 2708, E2_dimension: 128, vals: 346624 + + +kernel execution time: 0.591018 ms +fused time: 2.44084 + +separate execution + +kernel execution time: 0.607388 ms +sddmm time: 0.891202 + +kernel execution time: 0.857981 ms +sddmm time: 1.16087 + +kernel execution time: 0.922992 ms +spmm time: 1.60378 + +reference execution + +kernel execution time: 4.47191 ms +taco reference time: 5.26226 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 2708, 
B2_dimension: 2708, vals: 5429 +C1_dimension: 2708, C2_dimension: 128, vals: 346624 +D1_dimension: 2708, D2_dimension: 128, vals: 346624 +E1_dimension: 2708, E2_dimension: 128, vals: 346624 + + +kernel execution time: 0.658879 ms +fused time: 4.15402 + +separate execution + +kernel execution time: 0.70888 ms +sddmm time: 1.21343 + +kernel execution time: 0.531398 ms +sddmm time: 1.30729 + +kernel execution time: 0.965464 ms +spmm time: 2.35378 + +reference execution + +kernel execution time: 3.48771 ms +taco reference time: 7.55141 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 2708, B2_dimension: 2708, vals: 5429 +C1_dimension: 2708, C2_dimension: 128, vals: 346624 +D1_dimension: 2708, D2_dimension: 128, vals: 346624 +E1_dimension: 2708, E2_dimension: 128, vals: 346624 + + +kernel execution time: 0.616739 ms +fused time: 4.4146 + +separate execution + +kernel execution time: 0.556318 ms +sddmm time: 3.03196 + +kernel execution time: 0.945623 ms +sddmm time: 1.89019 + +kernel execution time: 0.777471 ms +spmm time: 3.57728 + +reference execution + +kernel execution time: 3.22827 ms +taco reference time: 7.39799 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 2708, B2_dimension: 2708, vals: 5429 +C1_dimension: 2708, C2_dimension: 128, vals: 346624 +D1_dimension: 2708, D2_dimension: 128, vals: 346624 +E1_dimension: 2708, E2_dimension: 128, vals: 346624 + + +kernel execution time: 0.65531 ms +fused time: 4.08374 + +separate execution + +kernel execution time: 0.666219 ms +sddmm time: 1.20641 + +kernel execution time: 0.941573 ms +sddmm time: 1.73185 + +kernel execution time: 1.01493 ms +spmm time: 1.75608 + +reference execution + +kernel execution time: 5.25507 ms +taco reference 
time: 6.04624 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 2708, B2_dimension: 2708, vals: 5429 +C1_dimension: 2708, C2_dimension: 128, vals: 346624 +D1_dimension: 2708, D2_dimension: 128, vals: 346624 +E1_dimension: 2708, E2_dimension: 128, vals: 346624 + + +kernel execution time: 0.670959 ms +fused time: 1.50328 + +separate execution + +kernel execution time: 0.600268 ms +sddmm time: 1.32833 + +kernel execution time: 0.476237 ms +sddmm time: 0.792151 + +kernel execution time: 0.781091 ms +spmm time: 1.10271 + +reference execution + +kernel execution time: 3.07623 ms +taco reference time: 3.53829 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 2708, B2_dimension: 2708, vals: 5429 +C1_dimension: 2708, C2_dimension: 128, vals: 346624 +D1_dimension: 2708, D2_dimension: 128, vals: 346624 +E1_dimension: 2708, E2_dimension: 128, vals: 346624 + + +kernel execution time: 0.760541 ms +fused time: 1.49073 + +separate execution + +kernel execution time: 0.639829 ms +sddmm time: 1.21327 + +kernel execution time: 0.576218 ms +sddmm time: 1.14083 + +kernel execution time: 0.829512 ms +spmm time: 1.33624 + +reference execution + +kernel execution time: 4.14591 ms +taco reference time: 4.82508 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 2708, B2_dimension: 2708, vals: 5429 +C1_dimension: 2708, C2_dimension: 128, vals: 346624 +D1_dimension: 2708, D2_dimension: 128, vals: 346624 +E1_dimension: 2708, E2_dimension: 128, vals: 346624 + + +kernel execution time: 0.638949 ms +fused time: 1.02277 + +separate execution + +kernel execution time: 0.945034 ms 
+sddmm time: 1.20456 + +kernel execution time: 0.6772 ms +sddmm time: 0.943263 + +kernel execution time: 0.888033 ms +spmm time: 1.133 + +reference execution + +kernel execution time: 3.82989 ms +taco reference time: 4.18452 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 2708, B2_dimension: 2708, vals: 5429 +C1_dimension: 2708, C2_dimension: 128, vals: 346624 +D1_dimension: 2708, D2_dimension: 128, vals: 346624 +E1_dimension: 2708, E2_dimension: 128, vals: 346624 + + +kernel execution time: 0.7361 ms +fused time: 1.45315 + +separate execution + +kernel execution time: 0.7335 ms +sddmm time: 1.25184 + +kernel execution time: 0.642509 ms +sddmm time: 1.16064 + +kernel execution time: 1.02361 ms +spmm time: 1.48614 + +reference execution + +kernel execution time: 4.12035 ms +taco reference time: 4.75857 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/amazon.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 334863, B2_dimension: 334863, vals: 777323 +C1_dimension: 334863, C2_dimension: 128, vals: 42862464 +D1_dimension: 334863, D2_dimension: 128, vals: 42862464 +E1_dimension: 334863, E2_dimension: 128, vals: 42862464 + + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/amazon.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 548551, B2_dimension: 548551, vals: 925872 +C1_dimension: 548551, C2_dimension: 128, vals: 70214528 +D1_dimension: 548551, D2_dimension: 128, vals: 70214528 +E1_dimension: 548551, E2_dimension: 128, vals: 70214528 + + +kernel execution time: 66.4595 ms +fused time: 66.9196 + +separate execution + +kernel execution time: 22.9317 ms +sddmm time: 23.4738 + +kernel execution time: 22.4453 ms +sddmm time: 23.0045 + +kernel execution time: 
44.2796 ms +spmm time: 44.8052 + +reference execution + +kernel execution time: 187.6 ms +taco reference time: 188.247 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/amazon.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 548551, B2_dimension: 548551, vals: 1851744 +C1_dimension: 548551, C2_dimension: 128, vals: 70214528 +D1_dimension: 548551, D2_dimension: 128, vals: 70214528 +E1_dimension: 548551, E2_dimension: 128, vals: 70214528 + + +kernel execution time: 103.551 ms +fused time: 104.018 + +separate execution + +kernel execution time: 39.9535 ms +sddmm time: 40.5639 + +kernel execution time: 39.2683 ms +sddmm time: 39.8581 + +kernel execution time: 65.8336 ms +spmm time: 66.417 + +reference execution + +kernel execution time: 306.901 ms +taco reference time: 307.61 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/amazon.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 548551, B2_dimension: 548551, vals: 1851744 +C1_dimension: 548551, C2_dimension: 128, vals: 70214528 +D1_dimension: 548551, D2_dimension: 128, vals: 70214528 +E1_dimension: 548551, E2_dimension: 128, vals: 70214528 + + +kernel execution time: 106.782 ms +fused time: 107.261 + +separate execution + +kernel execution time: 40.7961 ms +sddmm time: 41.3604 + +kernel execution time: 39.8676 ms +sddmm time: 40.4959 + +kernel execution time: 66.2656 ms +spmm time: 66.8105 + +reference execution + +kernel execution time: 367.416 ms +taco reference time: 368.086 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/amazon.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 548551, B2_dimension: 548551, vals: 1851744 +C1_dimension: 548551, C2_dimension: 128, vals: 70214528 +D1_dimension: 548551, D2_dimension: 128, vals: 70214528 +E1_dimension: 548551, 
E2_dimension: 128, vals: 70214528 + + +kernel execution time: 108.809 ms +fused time: 109.274 + +separate execution + +kernel execution time: 42.2311 ms +sddmm time: 42.826 + +kernel execution time: 41.711 ms +sddmm time: 42.3721 + +kernel execution time: 65.9512 ms +spmm time: 66.5647 + +reference execution + +kernel execution time: 360.581 ms +taco reference time: 361.225 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/amazon.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 548551, B2_dimension: 548551, vals: 1851744 +C1_dimension: 548551, C2_dimension: 128, vals: 70214528 +D1_dimension: 548551, D2_dimension: 128, vals: 70214528 +E1_dimension: 548551, E2_dimension: 128, vals: 70214528 + + +kernel execution time: 922.149 ms +fused time: 922.605 + +separate execution + +kernel execution time: 392.18 ms +sddmm time: 392.716 + +kernel execution time: 393.251 ms +sddmm time: 393.777 + +kernel execution time: 520.496 ms +spmm time: 521.007 + +reference execution + +kernel execution time: 9912.29 ms +taco reference time: 9913.37 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 2708, B2_dimension: 2708, vals: 5429 +C1_dimension: 2708, C2_dimension: 128, vals: 346624 +D1_dimension: 2708, D2_dimension: 128, vals: 346624 +E1_dimension: 2708, E2_dimension: 128, vals: 346624 + + +kernel execution time: 2.15935 ms +fused time: 2.88765 + +separate execution + +kernel execution time: 1.09729 ms +sddmm time: 1.64867 + +kernel execution time: 0.987463 ms +sddmm time: 1.50853 + +kernel execution time: 2.22996 ms +spmm time: 2.71273 + +reference execution + +kernel execution time: 29.4617 ms +taco reference time: 29.8511 + +sddmm-spmm execution + +----------------------------------------- 
+/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 2708, B2_dimension: 2708, vals: 5429 +C1_dimension: 2708, C2_dimension: 128, vals: 346624 +D1_dimension: 2708, D2_dimension: 128, vals: 346624 +E1_dimension: 2708, E2_dimension: 128, vals: 346624 + + +kernel execution time: 0.667108 ms +fused time: 1.05163 + +separate execution + +kernel execution time: 0.680159 ms +sddmm time: 0.994963 + +kernel execution time: 0.611478 ms +sddmm time: 1.1057 + +kernel execution time: 0.988313 ms +spmm time: 1.4939 + +reference execution + +kernel execution time: 3.64386 ms +taco reference time: 4.33446 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 2708, B2_dimension: 2708, vals: 5429 +C1_dimension: 2708, C2_dimension: 128, vals: 346624 +D1_dimension: 2708, D2_dimension: 128, vals: 346624 +E1_dimension: 2708, E2_dimension: 128, vals: 346624 + + +kernel execution time: 0.691709 ms +fused time: 1.07767 + +separate execution + +kernel execution time: 0.516997 ms +sddmm time: 0.77957 + +kernel execution time: 0.458366 ms +sddmm time: 0.73026 + +kernel execution time: 0.777811 ms +spmm time: 1.01678 + +reference execution + +kernel execution time: 3.47463 ms +taco reference time: 3.82426 +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/amazon.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 548551, B2_dimension: 548551, vals: 1851744 +C1_dimension: 548551, C2_dimension: 128, vals: 70214528 +D1_dimension: 548551, D2_dimension: 128, vals: 70214528 +E1_dimension: 548551, E2_dimension: 128, vals: 70214528 + + +kernel execution time: 104.681 ms +fused time: 105.128 + +separate execution + +kernel execution time: 39.5478 ms +sddmm time: 40.1164 + +kernel execution time: 40.2068 ms +sddmm time: 40.7802 + +kernel execution time: 67.2769 ms +spmm time: 67.8666 
+ +reference execution + +kernel execution time: 378.806 ms +taco reference time: 379.526 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 2708, B2_dimension: 2708, vals: 5429 +C1_dimension: 2708, C2_dimension: 128, vals: 346624 +D1_dimension: 2708, D2_dimension: 128, vals: 346624 +E1_dimension: 2708, E2_dimension: 128, vals: 346624 + + +kernel execution time: 2.0421 ms +fused time: 2.77318 + +separate execution + +kernel execution time: 0.890922 ms +sddmm time: 1.4406 + +kernel execution time: 0.673509 ms +sddmm time: 0.955103 + +kernel execution time: 1.93153 ms +spmm time: 2.18341 + +reference execution + +kernel execution time: 33.2851 ms +taco reference time: 33.6343 +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/amazon.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 548551, B2_dimension: 548551, vals: 1851744 +C1_dimension: 548551, C2_dimension: 128, vals: 70214528 +D1_dimension: 548551, D2_dimension: 128, vals: 70214528 +E1_dimension: 548551, E2_dimension: 128, vals: 70214528 + + +kernel execution time: 913.728 ms +fused time: 914.178 + +separate execution + +kernel execution time: 389.744 ms +sddmm time: 390.317 + +kernel execution time: 389.105 ms +sddmm time: 389.68 + +kernel execution time: 520.43 ms +spmm time: 520.979 + +reference execution + +kernel execution time: 9970.19 ms +taco reference time: 9971.18 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 2708, B2_dimension: 2708, vals: 5429 +C1_dimension: 2708, C2_dimension: 128, vals: 346624 +D1_dimension: 2708, D2_dimension: 128, vals: 346624 +E1_dimension: 2708, E2_dimension: 128, vals: 346624 + + +kernel execution time: 1.81249 ms +fused time: 2.53831 + +separate execution + +kernel execution 
time: 1.41327 ms +sddmm time: 1.9866 + +kernel execution time: 0.687839 ms +sddmm time: 0.957583 + +kernel execution time: 1.99132 ms +spmm time: 2.2301 + +reference execution + +kernel execution time: 33.8389 ms +taco reference time: 34.1855 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 2708, B2_dimension: 2708, vals: 5429 +C1_dimension: 2708, C2_dimension: 128, vals: 346624 +D1_dimension: 2708, D2_dimension: 128, vals: 346624 +E1_dimension: 2708, E2_dimension: 128, vals: 346624 + + +kernel execution time: 2.08639 ms +fused time: 2.81403 + +separate execution + +kernel execution time: 0.75901 ms +sddmm time: 1.27309 + +kernel execution time: 0.72208 ms +sddmm time: 1.00494 + +kernel execution time: 1.95748 ms +spmm time: 2.20503 + +reference execution + +kernel execution time: 33.4827 ms +taco reference time: 33.8347 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 2708, B2_dimension: 2708, vals: 5429 +C1_dimension: 2708, C2_dimension: 128, vals: 346624 +D1_dimension: 2708, D2_dimension: 128, vals: 346624 +E1_dimension: 2708, E2_dimension: 128, vals: 346624 + + +kernel execution time: 2.09414 ms +fused time: 2.82691 + +separate execution + +kernel execution time: 1.03623 ms +sddmm time: 1.58316 + +kernel execution time: 0.653819 ms +sddmm time: 0.926463 + +kernel execution time: 1.88145 ms +spmm time: 2.12517 + +reference execution + +kernel execution time: 33.3395 ms +taco reference time: 33.6915 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 2708, B2_dimension: 2708, vals: 5429 +C1_dimension: 2708, C2_dimension: 128, vals: 346624 
+D1_dimension: 2708, D2_dimension: 128, vals: 346624 +E1_dimension: 2708, E2_dimension: 128, vals: 346624 + + +kernel execution time: 1.70968 ms +fused time: 2.43176 + +separate execution + +kernel execution time: 0.76455 ms +sddmm time: 1.31209 + +kernel execution time: 0.664099 ms +sddmm time: 0.932353 + +kernel execution time: 1.92536 ms +spmm time: 2.17072 + +reference execution + +kernel execution time: 32.5601 ms +taco reference time: 32.9017 diff --git a/test/stats/spmm-spmm.txt b/test/stats/spmm-spmm.txt new file mode 100644 index 000000000..329aacd65 --- /dev/null +++ b/test/stats/spmm-spmm.txt @@ -0,0 +1,3604 @@ + +spmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 121192, B2_dimension: 121192, vals: 2624331 +C1_dimension: 121192, C2_dimension: 121192, vals: 2624331 +D1_dimension: 121192, D2_dimension: 64, vals: 7756288 + + +spmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 121192, B2_dimension: 121192, vals: 2624331 +C1_dimension: 121192, C2_dimension: 121192, vals: 2624331 +D1_dimension: 121192, D2_dimension: 64, vals: 7756288 + + +spmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 121192, B2_dimension: 121192, vals: 2624331 +C1_dimension: 121192, C2_dimension: 121192, vals: 2624331 +D1_dimension: 121192, D2_dimension: 64, vals: 7756288 + + +spmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 121192, B2_dimension: 121192, vals: 2624331 +C1_dimension: 121192, C2_dimension: 64, vals: 7756288 +D1_dimension: 64, 
D2_dimension: 64, vals: 4096 + + +kernel execution time: 303.084 ms +fused time: 303.842 + +kernel execution time: 8140.55 ms +taco reference time: 8141.59 + +spmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 121192, B2_dimension: 121192, vals: 2624331 +C1_dimension: 121192, C2_dimension: 64, vals: 7756288 +D1_dimension: 64, D2_dimension: 64, vals: 4096 + + +kernel execution time: 269.44 ms +fused time: 270.181 + +kernel execution time: 1612.62 ms +taco reference time: 1613.21 + +spmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 121192, B2_dimension: 121192, vals: 2624331 +C1_dimension: 121192, C2_dimension: 121192, vals: 2624331 +D1_dimension: 121192, D2_dimension: 64, vals: 7756288 + + +spmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 121192, B2_dimension: 121192, vals: 2624331 +C1_dimension: 121192, C2_dimension: 121192, vals: 2624331 +D1_dimension: 121192, D2_dimension: 64, vals: 7756288 + + +spmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 121192, B2_dimension: 121192, vals: 2624331 +C1_dimension: 121192, C2_dimension: 121192, vals: 2624331 +D1_dimension: 121192, D2_dimension: 64, vals: 7756288 + + +spmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 121192, B2_dimension: 121192, vals: 2624331 +C1_dimension: 121192, C2_dimension: 121192, vals: 2624331 +D1_dimension: 121192, D2_dimension: 
64, vals: 7756288 + + +spmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 5, B2_dimension: 5, vals: 19 +C1_dimension: 5, C2_dimension: 5, vals: 19 +D1_dimension: 5, D2_dimension: 64, vals: 320 + + +kernel execution time: 0.125431 ms +fused time: 0.815671 + +kernel execution time: 0.03254 ms +taco reference time: 0.828291 + +spmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 10974, B2_dimension: 10974, vals: 428650 +C1_dimension: 10974, C2_dimension: 10974, vals: 428650 +D1_dimension: 10974, D2_dimension: 64, vals: 702336 + + +spmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 10974, B2_dimension: 10974, vals: 428650 +C1_dimension: 10974, C2_dimension: 10974, vals: 428650 +D1_dimension: 10974, D2_dimension: 8, vals: 87792 + + +kernel execution time: 783.639 ms +fused time: 784.413 + +kernel execution time: 25.6025 ms +taco reference time: 25.9422 + +spmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 10974, B2_dimension: 10974, vals: 428650 +C1_dimension: 10974, C2_dimension: 10974, vals: 428650 +D1_dimension: 10974, D2_dimension: 8, vals: 87792 + + +kernel execution time: 3538.49 ms +fused time: 3539.6 + +kernel execution time: 544.057 ms +taco reference time: 544.496 + +spmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 10974, B2_dimension: 10974, vals: 428650 +C1_dimension: 10974, C2_dimension: 10974, 
vals: 428650 +D1_dimension: 10974, D2_dimension: 8, vals: 87792 + + +kernel execution time: 3451.46 ms +fused time: 3452.59 + +kernel execution time: 540.889 ms +taco reference time: 541.34 + +spmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 10974, B2_dimension: 10974, vals: 428650 +C1_dimension: 10974, C2_dimension: 8, vals: 87792 +D1_dimension: 8, D2_dimension: 8, vals: 64 + + +kernel execution time: 23.9997 ms +fused time: 24.715 + +kernel execution time: 116.717 ms +taco reference time: 117.038 + +spmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 10974, B2_dimension: 10974, vals: 428650 +C1_dimension: 10974, C2_dimension: 8, vals: 87792 +D1_dimension: 8, D2_dimension: 8, vals: 64 + + +kernel execution time: 2.19466 ms +fused time: 2.91615 + +kernel execution time: 9.4728 ms +taco reference time: 10.0292 + +spmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 121192, B2_dimension: 121192, vals: 2624331 +C1_dimension: 121192, C2_dimension: 8, vals: 969536 +D1_dimension: 8, D2_dimension: 8, vals: 64 + + +kernel execution time: 30.5327 ms +fused time: 31.2749 + +kernel execution time: 35.9838 ms +taco reference time: 36.52 + +spmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 1000005, B2_dimension: 1000005, vals: 3105536 +C1_dimension: 1000005, C2_dimension: 64, vals: 64000320 +D1_dimension: 64, D2_dimension: 64, vals: 4096 + + +kernel execution time: 1803.51 ms +fused time: 1804.27 + +kernel execution time: 1976.12 ms +taco reference 
time: 1976.69 + +spmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 206500, B2_dimension: 206500, vals: 1273389 +C1_dimension: 206500, C2_dimension: 64, vals: 13216000 +D1_dimension: 64, D2_dimension: 128, vals: 8192 + + +kernel execution time: 484.907 ms +fused time: 485.835 + +kernel execution time: 1567.31 ms +taco reference time: 1567.89 + +spmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 1000005, B2_dimension: 1000005, vals: 3105536 +C1_dimension: 1000005, C2_dimension: 64, vals: 64000320 +D1_dimension: 64, D2_dimension: 128, vals: 8192 + + +kernel execution time: 2301.83 ms +fused time: 2302.58 + +kernel execution time: 3904.01 ms +taco reference time: 3905 + +spmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 10974, B2_dimension: 10974, vals: 428650 +C1_dimension: 10974, C2_dimension: 64, vals: 702336 +D1_dimension: 64, D2_dimension: 128, vals: 8192 + + +spmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 10974, B2_dimension: 10974, vals: 428650 +C1_dimension: 10974, C2_dimension: 64, vals: 702336 +D1_dimension: 64, D2_dimension: 128, vals: 8192 + + +spmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 10974, B2_dimension: 10974, vals: 428650 +C1_dimension: 10974, C2_dimension: 64, vals: 702336 +D1_dimension: 64, D2_dimension: 128, vals: 8192 + + +spmm-spmm execution + 
+----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 10974, B2_dimension: 10974, vals: 428650 +C1_dimension: 10974, C2_dimension: 64, vals: 702336 +D1_dimension: 64, D2_dimension: 128, vals: 8192 + + +kernel execution time: 11.7415 ms +fused time: 12.4648 + +kernel execution time: 155.192 ms +taco reference time: 155.893 + +spmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 10974, B2_dimension: 10974, vals: 428650 +C1_dimension: 10974, C2_dimension: 64, vals: 702336 +D1_dimension: 64, D2_dimension: 128, vals: 8192 + + +kernel execution time: 6.56465 ms +fused time: 7.31046 + +kernel execution time: 1.17042 ms +sddmm time: 1.68226 + +kernel execution time: 5.08948 ms +spmm time: 5.36855 + +kernel execution time: 124.176 ms +taco reference time: 124.551 + +spmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 +C1_dimension: 36417, C2_dimension: 64, vals: 2330688 +D1_dimension: 64, D2_dimension: 128, vals: 8192 + + +kernel execution time: 25.3076 ms +fused time: 25.7407 + +kernel execution time: 14.1922 ms +sddmm time: 14.7097 + +kernel execution time: 16.8223 ms +spmm time: 17.3081 + +kernel execution time: 1299.07 ms +taco reference time: 1299.47 + +spmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/rma10/rma10.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 46835, B2_dimension: 46835, vals: 2374001 +C1_dimension: 46835, C2_dimension: 64, vals: 2997440 +D1_dimension: 64, D2_dimension: 128, vals: 8192 + + +kernel execution time: 27.1044 ms +fused time: 27.5788 + +kernel execution time: 
9.05436 ms +sddmm time: 9.61561 + +kernel execution time: 21.401 ms +spmm time: 21.9403 + +kernel execution time: 695.617 ms +taco reference time: 696.166 + +spmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/cant/cant.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 62451, B2_dimension: 62451, vals: 4007383 +C1_dimension: 62451, C2_dimension: 64, vals: 3996864 +D1_dimension: 64, D2_dimension: 128, vals: 8192 + + +kernel execution time: 33.1726 ms +fused time: 33.5921 + +kernel execution time: 14.8585 ms +sddmm time: 15.3574 + +kernel execution time: 28.8622 ms +spmm time: 29.3477 + +kernel execution time: 1179.24 ms +taco reference time: 1179.66 + +spmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/consph/consph.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 83334, B2_dimension: 83334, vals: 6010480 +C1_dimension: 83334, C2_dimension: 64, vals: 5333376 +D1_dimension: 64, D2_dimension: 128, vals: 8192 + + +kernel execution time: 50.933 ms +fused time: 51.3664 + +kernel execution time: 22.1051 ms +sddmm time: 22.6231 + +kernel execution time: 37.9487 ms +spmm time: 38.4594 + +kernel execution time: 1793.69 ms +taco reference time: 1794.18 + +spmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 121192, B2_dimension: 121192, vals: 2624331 +C1_dimension: 121192, C2_dimension: 64, vals: 7756288 +D1_dimension: 64, D2_dimension: 128, vals: 8192 + + +kernel execution time: 77.6403 ms +fused time: 78.0713 + +kernel execution time: 19.9996 ms +sddmm time: 20.5235 + +kernel execution time: 55.1072 ms +spmm time: 55.6382 + +kernel execution time: 757.71 ms +taco reference time: 758.251 + +spmm-spmm execution + +----------------------------------------- 
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/shipsec1/shipsec1.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 140874, B2_dimension: 140874, vals: 7813404 +C1_dimension: 140874, C2_dimension: 64, vals: 9015936 +D1_dimension: 64, D2_dimension: 128, vals: 8192 + + +kernel execution time: 74.448 ms +fused time: 74.8977 + +kernel execution time: 28.5447 ms +sddmm time: 29.0628 + +kernel execution time: 64.5939 ms +spmm time: 65.3752 + +kernel execution time: 2277.84 ms +taco reference time: 2278.26 + +spmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/scircuit/scircuit.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 170998, B2_dimension: 170998, vals: 958936 +C1_dimension: 170998, C2_dimension: 64, vals: 10943872 +D1_dimension: 64, D2_dimension: 128, vals: 8192 + + +kernel execution time: 103.993 ms +fused time: 104.417 + +kernel execution time: 13.9953 ms +sddmm time: 14.4722 + +kernel execution time: 77.1505 ms +spmm time: 77.6507 + +kernel execution time: 277.888 ms +taco reference time: 278.424 + +spmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 206500, B2_dimension: 206500, vals: 1273389 +C1_dimension: 206500, C2_dimension: 64, vals: 13216000 +D1_dimension: 64, D2_dimension: 128, vals: 8192 + + +kernel execution time: 122.094 ms +fused time: 122.526 + +kernel execution time: 16.3934 ms +sddmm time: 16.9174 + +kernel execution time: 93.4293 ms +spmm time: 93.9709 + +kernel execution time: 368.185 ms +taco reference time: 368.744 + +spmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 1000005, B2_dimension: 1000005, vals: 3105536 +C1_dimension: 1000005, C2_dimension: 64, vals: 64000320 +D1_dimension: 64, 
D2_dimension: 128, vals: 8192 + + +kernel execution time: 594.481 ms +fused time: 594.903 + +kernel execution time: 68.7062 ms +sddmm time: 69.19 + +kernel execution time: 456.966 ms +spmm time: 457.476 + +kernel execution time: 939.672 ms +taco reference time: 940.234 + +spmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 5558326, B2_dimension: 5558326, vals: 59524291 +C1_dimension: 5558326, C2_dimension: 64, vals: 355732864 +D1_dimension: 64, D2_dimension: 128, vals: 8192 + + +kernel execution time: 3572.47 ms +fused time: 3573.32 + +kernel execution time: 1088.24 ms +sddmm time: 1088.74 + +kernel execution time: 2533.08 ms +spmm time: 2533.64 + +kernel execution time: 19935.1 ms +taco reference time: 19936.1 + +spmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx + +spmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 10974, B2_dimension: 10974, vals: 428650 +C1_dimension: 10974, C2_dimension: 128, vals: 1404672 +D1_dimension: 128, D2_dimension: 64, vals: 8192 + + +kernel execution time: 28.4416 ms +fused time: 28.8482 + +kernel execution time: 58.9151 ms +sddmm time: 59.3822 + +kernel execution time: 85.1524 ms +spmm time: 85.6136 + +kernel execution time: 3443.24 ms +taco reference time: 3444.27 + +spmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 10974, B2_dimension: 10974, vals: 428650 +C1_dimension: 10974, C2_dimension: 128, vals: 1404672 +D1_dimension: 128, D2_dimension: 64, vals: 8192 + + +kernel execution time: 28.4398 ms +fused time: 28.9133 + +kernel execution 
time: 59.5781 ms +SpMM time: 60.0552 + +kernel execution time: 85.038 ms +GeMM time: 85.49 + +kernel execution time: 83.589 ms +Optimized GeMM time: 83.939 + +kernel execution time: 3425.66 ms +taco reference time: 3426.56 + +spmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 10974, B2_dimension: 10974, vals: 428650 +C1_dimension: 10974, C2_dimension: 128, vals: 1404672 +D1_dimension: 128, D2_dimension: 64, vals: 8192 + + +kernel execution time: 28.1949 ms +fused time: 28.6047 + +kernel execution time: 58.8056 ms +SpMM time: 59.2739 + +kernel execution time: 85.098 ms +GeMM time: 85.5677 + +spmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 10974, B2_dimension: 10974, vals: 428650 +C1_dimension: 10974, C2_dimension: 128, vals: 1404672 +D1_dimension: 128, D2_dimension: 64, vals: 8192 + + +spmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 10974, B2_dimension: 10974, vals: 428650 +C1_dimension: 10974, C2_dimension: 128, vals: 1404672 +D1_dimension: 128, D2_dimension: 64, vals: 8192 + + +kernel execution time: 34.4562 ms +fused time: 35.1247 + +kernel execution time: 57.8421 ms +SpMM time: 58.3206 + +kernel execution time: 84.8243 ms +GeMM time: 85.2948 + +kernel execution time: 84.2094 ms +Optimized GeMM template time: 84.5715 + +kernel execution time: 3423.26 ms +taco reference time: 3424.18 + +spmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 10974, B2_dimension: 10974, vals: 428650 +C1_dimension: 10974, C2_dimension: 128, vals: 
1404672 +D1_dimension: 128, D2_dimension: 64, vals: 8192 + + +kernel execution time: 34.1982 ms +fused time: 34.9007 + +kernel execution time: 58.2208 ms +SpMM time: 58.708 + +kernel execution time: 85.2639 ms +GeMM time: 85.7329 + +kernel execution time: 84.6708 ms +Optimized GeMM template time: 85.0447 + +kernel execution time: 3448.38 ms +taco reference time: 3449.25 + +spmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 10974, B2_dimension: 10974, vals: 428650 +C1_dimension: 10974, C2_dimension: 128, vals: 1404672 +D1_dimension: 128, D2_dimension: 64, vals: 8192 + + +kernel execution time: 3.98391 ms +fused time: 4.78728 + +kernel execution time: 3.85974 ms +SpMM time: 4.41484 + +kernel execution time: 5.20996 ms +GeMM time: 5.78292 + +kernel execution time: 85.5005 ms +Optimized GeMM template time: 85.8224 + +kernel execution time: 68.5977 ms +taco reference time: 69.0953 + +spmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 10974, B2_dimension: 10974, vals: 428650 +C1_dimension: 10974, C2_dimension: 128, vals: 1404672 +D1_dimension: 128, D2_dimension: 64, vals: 8192 + + +kernel execution time: 35.477 ms +fused time: 36.1715 + +kernel execution time: 57.2092 ms +SpMM time: 57.6862 + +kernel execution time: 84.9251 ms +GeMM time: 85.3862 + +kernel execution time: 84.8529 ms +Optimized GeMM template time: 85.2333 + +kernel execution time: 3425.71 ms +taco reference time: 3426.59 + +spmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 10974, B2_dimension: 10974, vals: 428650 +C1_dimension: 10974, C2_dimension: 128, vals: 1404672 +D1_dimension: 128, D2_dimension: 64, 
vals: 8192 + + +kernel execution time: 35.2755 ms +fused time: 35.9965 + +kernel execution time: 57.3952 ms +SpMM time: 57.8851 + +kernel execution time: 85.2686 ms +GeMM time: 85.7356 + +kernel execution time: 84.5744 ms +Optimized GeMM template time: 84.9512 + +kernel execution time: 3429.7 ms +taco reference time: 3430.52 + +spmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 10974, B2_dimension: 10974, vals: 428650 +C1_dimension: 10974, C2_dimension: 128, vals: 1404672 +D1_dimension: 128, D2_dimension: 64, vals: 8192 + + +kernel execution time: 3.98364 ms +fused time: 4.61817 + +kernel execution time: 3.85737 ms +SpMM time: 4.28322 + +kernel execution time: 5.15902 ms +GeMM time: 5.6055 + +kernel execution time: 87.1601 ms +Optimized GeMM template time: 87.4622 + +kernel execution time: 69.0316 ms +taco reference time: 69.4576 + +spmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 10974, B2_dimension: 10974, vals: 428650 +C1_dimension: 10974, C2_dimension: 128, vals: 1404672 +D1_dimension: 128, D2_dimension: 64, vals: 8192 + + +spmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 10974, B2_dimension: 10974, vals: 428650 +C1_dimension: 10974, C2_dimension: 128, vals: 1404672 +D1_dimension: 128, D2_dimension: 64, vals: 8192 + + +kernel execution time: 4.62195 ms +fused time: 5.02884 + +kernel execution time: 4.03094 ms +SpMM time: 4.41592 + +kernel execution time: 5.10184 ms +GeMM time: 5.44766 + +kernel execution time: 83.6233 ms +Optimized GeMM template time: 83.895 + +kernel execution time: 5.3188 ms +Optimized GeMM template time: 5.65673 + +kernel execution time: 
69.2656 ms +taco reference time: 69.6404 + +spmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 10974, B2_dimension: 10974, vals: 428650 +C1_dimension: 10974, C2_dimension: 128, vals: 1404672 +D1_dimension: 128, D2_dimension: 64, vals: 8192 + + +kernel execution time: 4.03732 ms +fused time: 4.69314 + +kernel execution time: 3.72378 ms +SpMM time: 4.02627 + +kernel execution time: 2.04995 ms +GeMM time: 2.33804 + +kernel execution time: 2.25997 ms +Optimized GeMM template time: 2.50901 + +kernel execution time: 5.18509 ms +Optimized GeMM template time: 5.46269 + +kernel execution time: 68.4415 ms +taco reference time: 68.78 + +spmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 10974, B2_dimension: 10974, vals: 428650 +C1_dimension: 10974, C2_dimension: 128, vals: 1404672 +D1_dimension: 128, D2_dimension: 64, vals: 8192 + + +kernel execution time: 3.95981 ms +fused time: 4.3754 + +kernel execution time: 3.78475 ms +SpMM time: 4.19686 + +kernel execution time: 2.00709 ms +GeMM time: 2.38028 + +spmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 10974, B2_dimension: 10974, vals: 428650 +C1_dimension: 10974, C2_dimension: 128, vals: 1404672 +D1_dimension: 128, D2_dimension: 64, vals: 8192 + + +kernel execution time: 4.05057 ms +fused time: 4.40773 + +kernel execution time: 3.75306 ms +SpMM time: 4.08598 + +kernel execution time: 2.05899 ms +GeMM time: 2.36596 + +kernel execution time: 2.12928 ms +Optimized GeMM template time: 2.36493 + +kernel execution time: 5.14712 ms +Optimized GeMM template time: 5.41248 + +kernel execution time: 68.075 ms +taco reference time: 
68.3835 + +spmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 10974, B2_dimension: 10974, vals: 428650 +C1_dimension: 10974, C2_dimension: 128, vals: 1404672 +D1_dimension: 128, D2_dimension: 64, vals: 8192 + + +kernel execution time: 3.88934 ms +fused time: 4.25328 + +kernel execution time: 3.82407 ms +SpMM time: 4.19446 + +spmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 10974, B2_dimension: 10974, vals: 428650 +C1_dimension: 10974, C2_dimension: 128, vals: 1404672 +D1_dimension: 128, D2_dimension: 64, vals: 8192 + + +kernel execution time: 4.28741 ms +fused time: 4.98944 + +kernel execution time: 3.79765 ms +SpMM time: 4.16417 + +kernel execution time: 1.4265 ms +SpMM template time: 1.74127 + +kernel execution time: 2.10898 ms +GeMM time: 2.39285 + +kernel execution time: 2.34628 ms +Optimized GeMM template time: 2.61728 + +kernel execution time: 5.31869 ms +Optimized GeMM template time: 5.60267 + +kernel execution time: 69.5098 ms +taco reference time: 69.8708 + +spmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 10974, B2_dimension: 10974, vals: 428650 +C1_dimension: 10974, C2_dimension: 128, vals: 1404672 +D1_dimension: 128, D2_dimension: 64, vals: 8192 + + +kernel execution time: 4.01588 ms +fused time: 4.65051 + +kernel execution time: 3.86258 ms +SpMM time: 4.2125 + +kernel execution time: 1.43425 ms +SpMM template time: 1.72825 + +kernel execution time: 2.09177 ms +GeMM time: 2.35741 + +kernel execution time: 2.03779 ms +GeMM time: 2.26668 + +kernel execution time: 2.18152 ms +Optimized GeMM template time: 2.45788 + +kernel execution time: 0.974804 ms 
+Optimized GeMM template time: 1.25462 + +kernel execution time: 67.9024 ms +taco reference time: 68.2452 + +spmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 10974, B2_dimension: 10974, vals: 428650 +C1_dimension: 10974, C2_dimension: 128, vals: 1404672 +D1_dimension: 128, D2_dimension: 64, vals: 8192 + + +kernel execution time: 4.0224 ms +fused time: 4.44033 + +kernel execution time: 3.84077 ms +SpMM time: 4.2196 + +kernel execution time: 1.57684 ms +SpMM template time: 1.93604 + +kernel execution time: 2.00289 ms +GeMM time: 2.38135 + +kernel execution time: 1.93219 ms +ref 2 GeMM time: 2.16952 + +kernel execution time: 1.9562 ms +ref3 GeMM template time: 2.22014 + +kernel execution time: 1.02843 ms +SpMM template time: 1.3134 + +kernel execution time: 68.6937 ms +taco reference time: 69.0531 + +spmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 10974, B2_dimension: 10974, vals: 428650 +C1_dimension: 10974, C2_dimension: 128, vals: 1404672 +D1_dimension: 128, D2_dimension: 64, vals: 8192 + + +spmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 10974, B2_dimension: 10974, vals: 428650 +C1_dimension: 10974, C2_dimension: 128, vals: 1404672 +D1_dimension: 128, D2_dimension: 64, vals: 8192 + + +kernel execution time: 4.70723 ms +fused time: 5.10663 + +kernel execution time: 3.86475 ms +SpMM time: 4.22896 + +kernel execution time: 1.5696 ms +SpMM template time: 1.91027 + +kernel execution time: 2.06463 ms +GeMM time: 2.35063 + +kernel execution time: 1.93837 ms +ref 2 GeMM time: 2.18475 + +kernel execution time: 1.93808 ms +ref3 GeMM template time: 2.21134 + +kernel 
execution time: 1.00393 ms +SpMM template time: 1.28759 + +kernel execution time: 65.6539 ms +taco reference time: 66.0123 + +spmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 10974, B2_dimension: 10974, vals: 428650 +C1_dimension: 10974, C2_dimension: 128, vals: 1404672 +D1_dimension: 128, D2_dimension: 64, vals: 8192 + + +kernel execution time: 4.41073 ms +fused time: 4.81175 + +kernel execution time: 3.96438 ms +SpMM time: 4.33792 + +kernel execution time: 1.48077 ms +SpMM template time: 1.84634 + +kernel execution time: 2.06276 ms +GeMM time: 2.52122 + +kernel execution time: 2.4643 ms +ref 2 GeMM template time: 3.77443 + +kernel execution time: 2.21292 ms +ref3 GeMM template time: 2.48374 + +kernel execution time: 1.02386 ms +SpMM template time ref4: 5.63941 + +kernel execution time: 73.0137 ms +taco reference time: 73.4188 + +spmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 10974, B2_dimension: 10974, vals: 428650 +C1_dimension: 10974, C2_dimension: 128, vals: 1404672 +D1_dimension: 128, D2_dimension: 64, vals: 8192 + + +kernel execution time: 2.81946 ms +fused time: 3.44515 + +kernel execution time: 3.93379 ms +SpMM time: 4.19505 + +kernel execution time: 1.46537 ms +SpMM template time: 1.77106 + +kernel execution time: 2.48839 ms +GeMM time: 2.75159 + +kernel execution time: 2.57119 ms +ref 2 GeMM template time: 2.83288 + +kernel execution time: 2.19579 ms +ref3 GeMM template time: 2.44668 + +kernel execution time: 1.08977 ms +SpMM template time ref4: 1.3527 + +kernel execution time: 72.5212 ms +taco reference time: 72.8405 + +spmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx 
+ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 10974, B2_dimension: 10974, vals: 428650 +C1_dimension: 10974, C2_dimension: 128, vals: 1404672 +D1_dimension: 128, D2_dimension: 64, vals: 8192 + + +kernel execution time: 2.34088 ms +fused time: 2.99398 + +kernel execution time: 3.80606 ms +SpMM time: 4.36154 + +kernel execution time: 1.58906 ms +SpMM template time: 1.95568 + +kernel execution time: 2.25455 ms +GeMM time: 2.5356 + +kernel execution time: 2.3975 ms +ref 2 GeMM template time: 2.66963 + +kernel execution time: 2.10202 ms +ref3 GeMM template time: 2.40392 + +kernel execution time: 1.02333 ms +SpMM template time ref4: 1.30975 + +kernel execution time: 72.6994 ms +taco reference time: 73.0145 + + + + + + +--------------------------------------------------------------------------------------------------------------- +--------------------------------------------------------------------------------------------------------------- +--------------------------------------------------------------------------------------------------------------- + + +with 64 threads + + + + + +spmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 10974, B2_dimension: 10974, vals: 428650 +C1_dimension: 10974, C2_dimension: 128, vals: 1404672 +D1_dimension: 128, D2_dimension: 64, vals: 8192 + + +kernel execution time: 2.36795 ms +fused time: 2.78304 + +kernel execution time: 3.8721 ms +SpMM time: 4.20057 + +kernel execution time: 1.52637 ms +SpMM template time: 1.85784 + +kernel execution time: 2.03318 ms +GeMM time: 2.31935 + +kernel execution time: 2.39998 ms +ref 2 GeMM template time: 2.68836 + +kernel execution time: 1.94819 ms +ref3 GeMM template time: 2.2353 + +kernel execution time: 1.06049 ms +SpMM template time ref4: 1.35755 + +kernel execution time: 68.6851 ms +taco reference time: 69.0188 + +spmm-spmm execution + 
+----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 +C1_dimension: 36417, C2_dimension: 128, vals: 4661376 +D1_dimension: 128, D2_dimension: 64, vals: 8192 + + +kernel execution time: 8.41302 ms +fused time: 8.85733 + +kernel execution time: 17.639 ms +SpMM time: 18.2378 + +kernel execution time: 7.98654 ms +SpMM template time: 8.57087 + +kernel execution time: 6.34574 ms +GeMM time: 6.8938 + +kernel execution time: 6.10335 ms +ref 2 GeMM template time: 6.39173 + +kernel execution time: 5.82956 ms +ref3 GeMM template time: 6.11877 + +kernel execution time: 4.70653 ms +SpMM template time ref4: 5.04278 + +kernel execution time: 671.833 ms +taco reference time: 672.353 + +spmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/rma10/rma10.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 46835, B2_dimension: 46835, vals: 2374001 +C1_dimension: 46835, C2_dimension: 128, vals: 5994880 +D1_dimension: 128, D2_dimension: 64, vals: 8192 + + +kernel execution time: 7.27388 ms +fused time: 7.73945 + +kernel execution time: 17.7256 ms +SpMM time: 18.3199 + +kernel execution time: 7.35832 ms +SpMM template time: 7.9109 + +kernel execution time: 8.33036 ms +GeMM time: 8.86966 + +kernel execution time: 7.86963 ms +ref 2 GeMM template time: 8.15124 + +kernel execution time: 7.7866 ms +ref3 GeMM template time: 8.07407 + +kernel execution time: 4.49305 ms +SpMM template time ref4: 4.80781 + +kernel execution time: 398.926 ms +taco reference time: 399.478 + +spmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/cant/cant.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 62451, B2_dimension: 62451, vals: 4007383 +C1_dimension: 62451, C2_dimension: 128, vals: 7993728 +D1_dimension: 128, D2_dimension: 64, vals: 
8192 + + +kernel execution time: 11.3443 ms +fused time: 11.8147 + +kernel execution time: 22.2928 ms +SpMM time: 22.924 + +kernel execution time: 12.4461 ms +SpMM template time: 13.0043 + +kernel execution time: 10.9317 ms +GeMM time: 11.5006 + +kernel execution time: 10.7585 ms +ref 2 GeMM template time: 11.0658 + +kernel execution time: 11.0196 ms +ref3 GeMM template time: 11.3149 + +kernel execution time: 6.90358 ms +SpMM template time ref4: 7.24984 + +kernel execution time: 657.038 ms +taco reference time: 657.641 + +spmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/consph/consph.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 83334, B2_dimension: 83334, vals: 6010480 +C1_dimension: 83334, C2_dimension: 128, vals: 10666752 +D1_dimension: 128, D2_dimension: 64, vals: 8192 + + +kernel execution time: 15.2657 ms +fused time: 15.7013 + +kernel execution time: 31.6235 ms +SpMM time: 32.1905 + +kernel execution time: 16.8006 ms +SpMM template time: 17.332 + +kernel execution time: 14.3795 ms +GeMM time: 14.9199 + +kernel execution time: 14.4997 ms +ref 2 GeMM template time: 14.8349 + +kernel execution time: 14.0983 ms +ref3 GeMM template time: 14.393 + +kernel execution time: 9.33791 ms +SpMM template time ref4: 9.73698 + +kernel execution time: 903.295 ms +taco reference time: 903.924 + +spmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 121192, B2_dimension: 121192, vals: 2624331 +C1_dimension: 121192, C2_dimension: 128, vals: 15512576 +D1_dimension: 128, D2_dimension: 64, vals: 8192 + + +kernel execution time: 27.1267 ms +fused time: 27.6407 + +kernel execution time: 52.874 ms +SpMM time: 53.49 + +kernel execution time: 25.9708 ms +SpMM template time: 26.5475 + +kernel execution time: 20.1295 ms +GeMM time: 20.7267 + +kernel execution time: 21.2549 ms +ref 2 
GeMM template time: 21.7256 + +kernel execution time: 20.7262 ms +ref3 GeMM template time: 21.1848 + +kernel execution time: 12.5379 ms +SpMM template time ref4: 13.0829 + +kernel execution time: 405.376 ms +taco reference time: 406.043 + +spmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/shipsec1/shipsec1.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 140874, B2_dimension: 140874, vals: 7813404 +C1_dimension: 140874, C2_dimension: 128, vals: 18031872 +D1_dimension: 128, D2_dimension: 64, vals: 8192 + + +kernel execution time: 22.7136 ms +fused time: 23.1625 + +kernel execution time: 49.1418 ms +SpMM time: 49.7343 + +kernel execution time: 25.0936 ms +SpMM template time: 25.604 + +kernel execution time: 23.6444 ms +GeMM time: 24.1812 + +kernel execution time: 24.348 ms +ref 2 GeMM template time: 24.6837 + +kernel execution time: 23.9836 ms +ref3 GeMM template time: 24.2972 + +kernel execution time: 14.4884 ms +SpMM template time ref4: 14.8698 + +kernel execution time: 1154.44 ms +taco reference time: 1155.04 + +spmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/scircuit/scircuit.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 170998, B2_dimension: 170998, vals: 958936 +C1_dimension: 170998, C2_dimension: 128, vals: 21887744 +D1_dimension: 128, D2_dimension: 64, vals: 8192 + + +kernel execution time: 18.015 ms +fused time: 18.4775 + +kernel execution time: 56.1907 ms +SpMM time: 56.8126 + +kernel execution time: 20.0375 ms +SpMM template time: 20.5913 + +kernel execution time: 28.1716 ms +GeMM time: 28.7647 + +kernel execution time: 30.484 ms +ref 2 GeMM template time: 30.9681 + +kernel execution time: 30.0422 ms +ref3 GeMM template time: 30.5496 + +kernel execution time: 10.8925 ms +SpMM template time ref4: 11.4401 + +kernel execution time: 162.277 ms +taco reference time: 162.908 + +spmm-spmm execution + 
+----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 206500, B2_dimension: 206500, vals: 1273389 +C1_dimension: 206500, C2_dimension: 128, vals: 26432000 +D1_dimension: 128, D2_dimension: 64, vals: 8192 + + +kernel execution time: 23.8637 ms +fused time: 24.4029 + +kernel execution time: 69.8832 ms +SpMM time: 70.504 + +kernel execution time: 26.8086 ms +SpMM template time: 27.6336 + +kernel execution time: 34.2049 ms +GeMM time: 34.8056 + +kernel execution time: 34.6783 ms +ref 2 GeMM template time: 35.183 + +kernel execution time: 33.8854 ms +ref3 GeMM template time: 34.3954 + +kernel execution time: 13.9069 ms +SpMM template time ref4: 14.4251 + +kernel execution time: 189.271 ms +taco reference time: 189.95 + +spmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 1000005, B2_dimension: 1000005, vals: 3105536 +C1_dimension: 1000005, C2_dimension: 128, vals: 128000640 +D1_dimension: 128, D2_dimension: 64, vals: 8192 + + +kernel execution time: 66.2912 ms +fused time: 66.8207 + +kernel execution time: 335.04 ms +SpMM time: 335.699 + +kernel execution time: 83.9137 ms +SpMM template time: 84.5618 + +kernel execution time: 157.411 ms +GeMM time: 158.061 + +kernel execution time: 169.35 ms +ref 2 GeMM template time: 169.938 + +kernel execution time: 168.201 ms +ref3 GeMM template time: 168.762 + +kernel execution time: 44.531 ms +SpMM template time ref4: 45.176 + +kernel execution time: 458.322 ms +taco reference time: 458.992 + +spmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 5558326, B2_dimension: 5558326, vals: 59524291 +C1_dimension: 5558326, C2_dimension: 128, vals: 
711465728 +D1_dimension: 128, D2_dimension: 64, vals: 8192 + + +kernel execution time: 629.911 ms +fused time: 630.89 + +kernel execution time: 2385.92 ms +SpMM time: 2386.45 + +kernel execution time: 904.117 ms +SpMM template time: 904.66 + +kernel execution time: 867.356 ms +GeMM time: 867.943 + +kernel execution time: 946.344 ms +ref 2 GeMM template time: 946.912 + +kernel execution time: 951.944 ms +ref3 GeMM template time: 952.496 + +kernel execution time: 464.289 ms +SpMM template time ref4: 464.847 + +kernel execution time: 19646 ms +taco reference time: 19647.2 + +spmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 1000005, B2_dimension: 1000005, vals: 3105536 +C1_dimension: 1000005, C2_dimension: 128, vals: 128000640 +D1_dimension: 128, D2_dimension: 64, vals: 8192 + + +kernel execution time: 65.749 ms +fused time: 66.2393 + +kernel execution time: 334.436 ms +SpMM time: 335.114 + +kernel execution time: 85.6378 ms +SpMM template time: 86.2216 + +kernel execution time: 156.716 ms +GeMM time: 157.281 + +kernel execution time: 169.383 ms +ref 2 GeMM template time: 169.948 + +kernel execution time: 168.128 ms +ref3 GeMM template time: 168.722 + +kernel execution time: 44.3902 ms +SpMM template time ref4: 44.9859 + +kernel execution time: 462.089 ms +taco reference time: 462.747 + +kernel execution time: 472.176 ms +taco reference new time: 472.868 + +spmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 206500, B2_dimension: 206500, vals: 1273389 +C1_dimension: 206500, C2_dimension: 128, vals: 26432000 +D1_dimension: 128, D2_dimension: 64, vals: 8192 + + +kernel execution time: 22.9203 ms +fused time: 23.382 + +kernel execution time: 69.0678 ms +SpMM time: 69.6771 + +kernel execution 
time: 25.7576 ms +SpMM template time: 26.2883 + +kernel execution time: 33.838 ms +GeMM time: 34.3893 + +kernel execution time: 36.2223 ms +ref 2 GeMM template time: 36.7099 + +kernel execution time: 35.9919 ms +ref3 GeMM template time: 36.5181 + +kernel execution time: 13.5094 ms +SpMM template time ref4: 14.0411 + +kernel execution time: 209.225 ms +taco reference time: 209.806 + +kernel execution time: 195.258 ms +taco reference new time: 195.862 + +spmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 206500, B2_dimension: 206500, vals: 1273389 +C1_dimension: 206500, C2_dimension: 128, vals: 26432000 +D1_dimension: 128, D2_dimension: 64, vals: 8192 + + +kernel execution time: 23.9941 ms +fused time: 24.5306 + +kernel execution time: 70.3118 ms +SpMM time: 70.9711 + +kernel execution time: 26.7754 ms +SpMM template time: 27.3965 + +kernel execution time: 34.3488 ms +GeMM time: 34.9449 + +kernel execution time: 34.9754 ms +ref 2 GeMM template time: 35.5492 + +kernel execution time: 34.4524 ms +ref3 GeMM template time: 35.0358 + +kernel execution time: 13.8295 ms +SpMM template time ref4: 14.4023 + +kernel execution time: 195.316 ms +taco reference time: 195.985 + +kernel execution time: 194.321 ms +taco reference new time: 194.959 + +spmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/shipsec1/shipsec1.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 140874, B2_dimension: 140874, vals: 7813404 +C1_dimension: 140874, C2_dimension: 128, vals: 18031872 +D1_dimension: 128, D2_dimension: 64, vals: 8192 + + +kernel execution time: 499.31 ms +fused time: 500.253 + +kernel execution time: 1127.92 ms +SpMM time: 1128.46 + +kernel execution time: 314.563 ms +SpMM template time: 315.094 + +kernel execution time: 1071.42 ms +GeMM time: 1071.96 + +kernel execution 
time: 772.255 ms +ref 2 GeMM template time: 772.765 + +kernel execution time: 768.478 ms +ref3 GeMM template time: 768.998 + +kernel execution time: 162.934 ms +SpMM template time ref4: 163.456 + +kernel execution time: 51182.8 ms +taco reference time: 51183.7 + +kernel execution time: 62360.6 ms +taco reference new time: 62361.5 + +spmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/scircuit/scircuit.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 170998, B2_dimension: 170998, vals: 958936 +C1_dimension: 170998, C2_dimension: 128, vals: 21887744 +D1_dimension: 128, D2_dimension: 64, vals: 8192 + + +kernel execution time: 343.987 ms +fused time: 344.403 + +kernel execution time: 127.278 ms +SpMM time: 127.803 + +kernel execution time: 139.755 ms +SpMM template time: 140.297 + +kernel execution time: 1308.19 ms +GeMM time: 1308.77 + +kernel execution time: 930.985 ms +ref 2 GeMM template time: 931.498 + +kernel execution time: 924.636 ms +ref3 GeMM template time: 925.164 + +kernel execution time: 83.9238 ms +SpMM template time ref4: 84.4508 + +kernel execution time: 6298.13 ms +taco reference time: 6299.21 + +kernel execution time: 7357.04 ms +taco reference new time: 7358.09 + +spmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 206500, B2_dimension: 206500, vals: 1273389 +C1_dimension: 206500, C2_dimension: 128, vals: 26432000 +D1_dimension: 128, D2_dimension: 64, vals: 8192 + + +kernel execution time: 404.825 ms +fused time: 405.271 + +kernel execution time: 142.933 ms +SpMM time: 143.48 + +kernel execution time: 155.193 ms +SpMM template time: 155.761 + +kernel execution time: 1572.88 ms +GeMM time: 1573.41 + +kernel execution time: 1132.63 ms +ref 2 GeMM template time: 1133.13 + +kernel execution time: 1126.54 ms +ref3 GeMM template time: 1127.06 
+ +kernel execution time: 96.7404 ms +SpMM template time ref4: 97.2437 + +kernel execution time: 8321.2 ms +taco reference time: 8322.27 + +kernel execution time: 9774.76 ms +taco reference new time: 9775.82 + +spmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 1000005, B2_dimension: 1000005, vals: 3105536 +C1_dimension: 1000005, C2_dimension: 128, vals: 128000640 +D1_dimension: 128, D2_dimension: 64, vals: 8192 + + +kernel execution time: 1796.56 ms +fused time: 1797.34 + +kernel execution time: 429.623 ms +SpMM time: 430.127 + +kernel execution time: 406.352 ms +SpMM template time: 406.855 + +kernel execution time: 7603.48 ms +GeMM time: 7604.4 + +kernel execution time: 5458.44 ms +ref 2 GeMM template time: 5459.36 + +kernel execution time: 5413.18 ms +ref3 GeMM template time: 5414.05 + +kernel execution time: 266.783 ms +SpMM template time ref4: 267.276 + +kernel execution time: 20481.5 ms +taco reference time: 20482.6 + +kernel execution time: 23942.3 ms +taco reference new time: 23943.8 + +spmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 5558326, B2_dimension: 5558326, vals: 59524291 +C1_dimension: 5558326, C2_dimension: 128, vals: 711465728 +D1_dimension: 128, D2_dimension: 64, vals: 8192 + + +kernel execution time: 11983.1 ms +fused time: 11984.1 + +kernel execution time: 14647.3 ms +SpMM time: 14648.4 + +kernel execution time: 5779.35 ms +SpMM template time: 5780.3 + +kernel execution time: 42156 ms +GeMM time: 42156.9 + +kernel execution time: 30315.6 ms +ref 2 GeMM template time: 30316.6 + +kernel execution time: 30070.9 ms +ref3 GeMM template time: 30071.9 + +kernel execution time: 3196.34 ms +SpMM template time ref4: 3197.36 + +kernel execution time: 387963 ms +taco reference time: 
387964 + +kernel execution time: 481094 ms +taco reference new time: 481095 + +spmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 10974, B2_dimension: 10974, vals: 428650 +C1_dimension: 10974, C2_dimension: 128, vals: 1404672 +D1_dimension: 128, D2_dimension: 64, vals: 8192 + + +kernel execution time: 32.8596 ms +fused time: 33.2745 + +kernel execution time: 57.4073 ms +SpMM time: 57.9242 + +kernel execution time: 18.9092 ms +SpMM template time: 19.4238 + +kernel execution time: 84.8547 ms +GeMM time: 85.3549 + +kernel execution time: 60.5468 ms +ref 2 GeMM template time: 60.9429 + +kernel execution time: 60.3303 ms +ref3 GeMM template time: 60.7269 + +kernel execution time: 9.95693 ms +SpMM template time ref4: 10.3864 + +kernel execution time: 2808.32 ms +taco reference time: 2808.79 + +kernel execution time: 3456.32 ms +taco reference new time: 3457.29 + +spmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 +C1_dimension: 36417, C2_dimension: 128, vals: 4661376 +D1_dimension: 128, D2_dimension: 64, vals: 8192 + + +kernel execution time: 203.078 ms +fused time: 203.513 + +kernel execution time: 594.431 ms +SpMM time: 594.968 + +kernel execution time: 135.247 ms +SpMM template time: 135.774 + +kernel execution time: 277.557 ms +GeMM time: 278.077 + +kernel execution time: 201.246 ms +ref 2 GeMM template time: 201.741 + +kernel execution time: 200.173 ms +ref3 GeMM template time: 200.697 + +kernel execution time: 67.3815 ms +SpMM template time ref4: 67.9079 + +kernel execution time: 28413.2 ms +taco reference time: 28414.2 + +kernel execution time: 34685.2 ms +taco reference new time: 34687 + +spmm-spmm execution + +----------------------------------------- 
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/rma10/rma10.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 46835, B2_dimension: 46835, vals: 2374001 +C1_dimension: 46835, C2_dimension: 128, vals: 5994880 +D1_dimension: 128, D2_dimension: 64, vals: 8192 + + +kernel execution time: 156.103 ms +fused time: 156.534 + +kernel execution time: 313.946 ms +SpMM time: 314.545 + +kernel execution time: 95.9908 ms +SpMM template time: 96.5235 + +kernel execution time: 355.516 ms +GeMM time: 356.043 + +kernel execution time: 257.486 ms +ref 2 GeMM template time: 258 + +kernel execution time: 255.966 ms +ref3 GeMM template time: 256.498 + +kernel execution time: 50.7943 ms +SpMM template time ref4: 51.3121 + +kernel execution time: 15474.9 ms +taco reference time: 15476 + +kernel execution time: 19054.1 ms +taco reference new time: 19055.3 + +spmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/cant/cant.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 62451, B2_dimension: 62451, vals: 4007383 +C1_dimension: 62451, C2_dimension: 128, vals: 7993728 +D1_dimension: 128, D2_dimension: 64, vals: 8192 + + +kernel execution time: 233.01 ms +fused time: 233.435 + +kernel execution time: 583.856 ms +SpMM time: 584.39 + +kernel execution time: 148.111 ms +SpMM template time: 148.649 + +kernel execution time: 474.209 ms +GeMM time: 474.735 + +kernel execution time: 343.934 ms +ref 2 GeMM template time: 344.44 + +kernel execution time: 342.778 ms +ref3 GeMM template time: 343.3 + +kernel execution time: 74.5241 ms +SpMM template time ref4: 75.0386 + +kernel execution time: 26129.8 ms +taco reference time: 26130.9 + +kernel execution time: 32058.9 ms +taco reference new time: 32059.8 + +spmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/consph/consph.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 83334, B2_dimension: 83334, vals: 6010480 
+C1_dimension: 83334, C2_dimension: 128, vals: 10666752 +D1_dimension: 128, D2_dimension: 64, vals: 8192 + + +kernel execution time: 332.296 ms +fused time: 332.73 + +kernel execution time: 871.053 ms +SpMM time: 871.586 + +kernel execution time: 217.386 ms +SpMM template time: 217.911 + +kernel execution time: 636.82 ms +GeMM time: 637.357 + +kernel execution time: 461.8 ms +ref 2 GeMM template time: 462.325 + +kernel execution time: 458.184 ms +ref3 GeMM template time: 458.738 + +kernel execution time: 114.816 ms +SpMM template time ref4: 115.341 + +kernel execution time: 39240.9 ms +taco reference time: 39242 + +kernel execution time: 48108.4 ms +taco reference new time: 48109.4 + +spmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 121192, B2_dimension: 121192, vals: 2624331 +C1_dimension: 121192, C2_dimension: 128, vals: 15512576 +D1_dimension: 128, D2_dimension: 64, vals: 8192 + + +kernel execution time: 351.775 ms +fused time: 352.201 + +kernel execution time: 317.447 ms +SpMM time: 317.983 + +kernel execution time: 217.205 ms +SpMM template time: 217.733 + +kernel execution time: 921.754 ms +GeMM time: 922.288 + +kernel execution time: 667.69 ms +ref 2 GeMM template time: 668.21 + +kernel execution time: 655.357 ms +ref3 GeMM template time: 655.888 + +kernel execution time: 118.018 ms +SpMM template time ref4: 118.546 + +kernel execution time: 17243.9 ms +taco reference time: 17245 + +kernel execution time: 21353.4 ms +taco reference new time: 21354.7 + +spmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 121192, B2_dimension: 121192, vals: 2624331 +C1_dimension: 121192, C2_dimension: 128, vals: 15512576 +D1_dimension: 128, D2_dimension: 64, vals: 8192 + + +kernel execution time: 27502 ms 
+fused time: 27581.4 + +kernel execution time: 19193.1 ms +SpMM time: 19304.1 + +kernel execution time: 8528.83 ms +SpMM template time: 8571.46 + +kernel execution time: 33685.2 ms +GeMM time: 33768.7 + +kernel execution time: 32503 ms +ref 2 GeMM template time: 32589.2 + +kernel execution time: 32859.6 ms +ref3 GeMM template time: 32952.9 + +kernel execution time: 4862.19 ms +SpMM template time ref4: 4917.41 + +kernel execution time: 891084 ms +taco reference time: 891170 + + + + + + +---------------------------------------------------------------------------------------------------- +---------------------------------------------------------------------------------------------------- +---------------------------------------------------------------------------------------------------- + + + + +spmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 10974, B2_dimension: 10974, vals: 428650 +C1_dimension: 10974, C2_dimension: 128, vals: 1404672 +D1_dimension: 128, D2_dimension: 64, vals: 8192 + + +kernel execution time: 2.69255 ms +fused time: 5.71229 + +kernel execution time: 3.93158 ms +SpMM time: 4.42244 + +1st pattern computation + +kernel execution time: 1.69479 ms +SpMM template time: 2.18137 + +kernel execution time: 2.53215 ms +GeMM time: 2.92698 + +kernel execution time: 82.7455 ms +ref 2 GeMM template time: 83.6829 + +2nd pattern computation + +kernel execution time: 2.52512 ms +ref3 GeMM template time: 2.90403 + +kernel execution time: 1.07835 ms +SpMM template time ref4: 1.34312 + +reference pattern computation + +kernel execution time: 66.8405 ms +taco reference time: 67.1485 + +kernel execution time: 71.5847 ms +taco reference new time: 71.9261 + +spmm-spmm execution + +----------------------------------------- +filenum: 2 +--------------------------------- 
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 10974, B2_dimension: 10974, vals: 428650 +C1_dimension: 10974, C2_dimension: 128, vals: 1404672 +D1_dimension: 128, D2_dimension: 64, vals: 8192 + + +kernel execution time: 2.77205 ms +fused time: 6.22498 + +kernel execution time: 3.70735 ms +SpMM time: 4.15143 + +1st pattern computation + +kernel execution time: 1.68777 ms +SpMM template time: 2.37238 + +kernel execution time: 2.64104 ms +GeMM time: 5.76589 + +kernel execution time: 81.9899 ms +ref 2 GeMM template time: 82.2704 + +2nd pattern computation + +kernel execution time: 2.45488 ms +ref3 GeMM template time: 2.8586 + +kernel execution time: 1.12289 ms +SpMM template time ref4: 1.39155 + +reference pattern computation + +kernel execution time: 76.3877 ms +taco reference time: 78.7939 + +kernel execution time: 72.755 ms +taco reference new time: 73.1269 +filenum: 3 +--------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 +C1_dimension: 36417, C2_dimension: 128, vals: 4661376 +D1_dimension: 128, D2_dimension: 64, vals: 8192 + + +kernel execution time: 7.80932 ms +fused time: 11.2518 + +kernel execution time: 16.5944 ms +SpMM time: 17.886 + +1st pattern computation + +kernel execution time: 7.11089 ms +SpMM template time: 7.68253 + +kernel execution time: 6.4731 ms +GeMM time: 9.33681 + +kernel execution time: 275.759 ms +ref 2 GeMM template time: 276.631 + +2nd pattern computation + +kernel execution time: 6.3356 ms +ref3 GeMM template time: 6.81471 + +kernel execution time: 4.47152 ms +SpMM template time ref4: 4.76175 + +reference pattern computation + +kernel execution time: 658.29 ms +taco reference time: 658.76 + +kernel execution time: 687.782 ms +taco reference new time: 688.49 +filenum: 4 +--------------------------------- 
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/rma10/rma10.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 46835, B2_dimension: 46835, vals: 2374001 +C1_dimension: 46835, C2_dimension: 128, vals: 5994880 +D1_dimension: 128, D2_dimension: 64, vals: 8192 + + +kernel execution time: 6.78576 ms +fused time: 8.17823 + +kernel execution time: 18.7121 ms +SpMM time: 20.1397 + +1st pattern computation + +kernel execution time: 6.53343 ms +SpMM template time: 7.11366 + +kernel execution time: 8.13131 ms +GeMM time: 10.4823 + +kernel execution time: 341.676 ms +ref 2 GeMM template time: 341.986 + +2nd pattern computation + +kernel execution time: 7.69804 ms +ref3 GeMM template time: 8.15483 + +kernel execution time: 4.61245 ms +SpMM template time ref4: 4.90988 + +reference pattern computation + +kernel execution time: 343.367 ms +taco reference time: 343.755 + +kernel execution time: 374.197 ms +taco reference new time: 374.704 +filenum: 5 +--------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/cant/cant.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 62451, B2_dimension: 62451, vals: 4007383 +C1_dimension: 62451, C2_dimension: 128, vals: 7993728 +D1_dimension: 128, D2_dimension: 64, vals: 8192 + + +kernel execution time: 11.6176 ms +fused time: 15.1115 + +kernel execution time: 22.6994 ms +SpMM time: 23.3508 + +1st pattern computation + +kernel execution time: 11.9033 ms +SpMM template time: 12.4284 + +kernel execution time: 10.4635 ms +GeMM time: 10.9336 + +kernel execution time: 452.62 ms +ref 2 GeMM template time: 452.931 + +2nd pattern computation + +kernel execution time: 9.29193 ms +ref3 GeMM template time: 9.74228 + +kernel execution time: 7.21434 ms +SpMM template time ref4: 7.5664 + +reference pattern computation + +kernel execution time: 570.857 ms +taco reference time: 571.396 + +kernel execution time: 623.78 ms +taco reference new time: 624.325 +filenum: 6 +--------------------------------- 
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/consph/consph.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 83334, B2_dimension: 83334, vals: 6010480 +C1_dimension: 83334, C2_dimension: 128, vals: 10666752 +D1_dimension: 128, D2_dimension: 64, vals: 8192 + + +kernel execution time: 15.2241 ms +fused time: 17.4586 + +kernel execution time: 31.7064 ms +SpMM time: 32.3582 + +1st pattern computation + +kernel execution time: 16.5454 ms +SpMM template time: 17.0802 + +kernel execution time: 13.8741 ms +GeMM time: 14.3707 + +kernel execution time: 604.662 ms +ref 2 GeMM template time: 605.002 + +2nd pattern computation + +kernel execution time: 11.9433 ms +ref3 GeMM template time: 12.403 + +kernel execution time: 9.77169 ms +SpMM template time ref4: 10.1324 + +reference pattern computation + +kernel execution time: 841.646 ms +taco reference time: 842.221 + +kernel execution time: 932.828 ms +taco reference new time: 933.378 +filenum: 7 +--------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 121192, B2_dimension: 121192, vals: 2624331 +C1_dimension: 121192, C2_dimension: 128, vals: 15512576 +D1_dimension: 128, D2_dimension: 64, vals: 8192 + + +kernel execution time: 25.1981 ms +fused time: 28.3453 + +kernel execution time: 51.7019 ms +SpMM time: 52.3269 + +1st pattern computation + +kernel execution time: 24.2567 ms +SpMM template time: 24.8204 + +kernel execution time: 19.9687 ms +GeMM time: 20.5536 + +kernel execution time: 874.389 ms +ref 2 GeMM template time: 874.8 + +2nd pattern computation + +kernel execution time: 17.1428 ms +ref3 GeMM template time: 17.605 + +kernel execution time: 12.4989 ms +SpMM template time ref4: 12.9327 + +reference pattern computation + +kernel execution time: 374.424 ms +taco reference time: 375.053 + +kernel execution time: 412.224 ms +taco reference new time: 412.828 +filenum: 8 +--------------------------------- 
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/shipsec1/shipsec1.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 140874, B2_dimension: 140874, vals: 7813404 +C1_dimension: 140874, C2_dimension: 128, vals: 18031872 +D1_dimension: 128, D2_dimension: 64, vals: 8192 + + +kernel execution time: 22.3642 ms +fused time: 22.9541 + +kernel execution time: 48.8361 ms +SpMM time: 49.478 + +1st pattern computation + +kernel execution time: 24.4919 ms +SpMM template time: 25.0744 + +kernel execution time: 23.1278 ms +GeMM time: 23.714 + +kernel execution time: 1021.89 ms +ref 2 GeMM template time: 1022.32 + +2nd pattern computation + +kernel execution time: 19.872 ms +ref3 GeMM template time: 20.3315 + +kernel execution time: 14.608 ms +SpMM template time ref4: 15.077 + +reference pattern computation + +kernel execution time: 1080.68 ms +taco reference time: 1081.32 + +kernel execution time: 1211.77 ms +taco reference new time: 1212.36 +filenum: 9 +--------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/scircuit/scircuit.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 170998, B2_dimension: 170998, vals: 958936 +C1_dimension: 170998, C2_dimension: 128, vals: 21887744 +D1_dimension: 128, D2_dimension: 64, vals: 8192 + + +kernel execution time: 16.318 ms +fused time: 18.887 + +kernel execution time: 56.5258 ms +SpMM time: 57.1171 + +1st pattern computation + +kernel execution time: 18.2007 ms +SpMM template time: 18.7215 + +kernel execution time: 28.1041 ms +GeMM time: 28.6173 + +kernel execution time: 1232.84 ms +ref 2 GeMM template time: 1233.26 + +2nd pattern computation + +kernel execution time: 23.6402 ms +ref3 GeMM template time: 24.1216 + +kernel execution time: 10.6221 ms +SpMM template time ref4: 11.1278 + +reference pattern computation + +kernel execution time: 136.61 ms +taco reference time: 137.191 + +kernel execution time: 143.222 ms +taco reference new time: 143.823 +filenum: 10 +--------------------------------- 
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 206500, B2_dimension: 206500, vals: 1273389 +C1_dimension: 206500, C2_dimension: 128, vals: 26432000 +D1_dimension: 128, D2_dimension: 64, vals: 8192 + + +kernel execution time: 22.1951 ms +fused time: 25.4707 + +kernel execution time: 69.5817 ms +SpMM time: 70.2133 + +1st pattern computation + +kernel execution time: 25.2229 ms +SpMM template time: 25.818 + +kernel execution time: 34.0166 ms +GeMM time: 34.5719 + +kernel execution time: 1506.8 ms +ref 2 GeMM template time: 1507.32 + +2nd pattern computation + +kernel execution time: 27.9513 ms +ref3 GeMM template time: 28.4381 + +kernel execution time: 13.4585 ms +SpMM template time ref4: 14.0168 + +reference pattern computation + +kernel execution time: 182.244 ms +taco reference time: 182.878 + +kernel execution time: 191.621 ms +taco reference new time: 192.28 +filenum: 12 +--------------------------------- +/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 1000005, B2_dimension: 1000005, vals: 3105536 +C1_dimension: 1000005, C2_dimension: 128, vals: 128000640 +D1_dimension: 128, D2_dimension: 64, vals: 8192 + + +kernel execution time: 62.6358 ms +fused time: 66.0562 + +kernel execution time: 331.995 ms +SpMM time: 332.669 + +1st pattern computation + +kernel execution time: 81.0262 ms +SpMM template time: 81.6316 + +kernel execution time: 155.308 ms +GeMM time: 155.913 + +kernel execution time: 7174.32 ms +ref 2 GeMM template time: 7175.38 + +2nd pattern computation + +kernel execution time: 131.848 ms +ref3 GeMM template time: 132.36 + +kernel execution time: 43.681 ms +SpMM template time ref4: 44.293 + +reference pattern computation + +kernel execution time: 444.857 ms +taco reference time: 445.492 + +kernel execution time: 467.509 ms +taco reference new time: 468.15 +filenum: 15 
+--------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 5558326, B2_dimension: 5558326, vals: 59524291 +C1_dimension: 5558326, C2_dimension: 128, vals: 711465728 +D1_dimension: 128, D2_dimension: 64, vals: 8192 + + +kernel execution time: 621.338 ms +fused time: 625.05 + +kernel execution time: 2276.7 ms +SpMM time: 2277.28 + +1st pattern computation + +kernel execution time: 881.7 ms +SpMM template time: 882.296 + +kernel execution time: 859.785 ms +GeMM time: 860.272 + +kernel execution time: 39771.6 ms +ref 2 GeMM template time: 39772.6 + +2nd pattern computation + +kernel execution time: 748.251 ms +ref3 GeMM template time: 748.758 + +kernel execution time: 452.61 ms +SpMM template time ref4: 453.163 + +reference pattern computation + +kernel execution time: 19528.6 ms +taco reference time: 19529.7 + +kernel execution time: 26715.2 ms +taco reference new time: 26716.6 + +spmm-spmm execution + +----------------------------------------- +filenum: 2 +--------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 10974, B2_dimension: 10974, vals: 428650 +C1_dimension: 10974, C2_dimension: 128, vals: 1404672 +D1_dimension: 128, D2_dimension: 64, vals: 8192 + + +kernel execution time: 2.64213 ms +fused time: 6.13507 + +--------- 1st pattern computation TTM, GEMM + +kernel execution time: 3.84056 ms +SpMM time: 4.24008 + +kernel execution time: 1.61274 ms +SpMM template time: 2.04575 + +kernel execution time: 2.33971 ms +GeMM time: 2.69705 + +kernel execution time: 85.2544 ms +ref 2 GeMM template time: 86.1514 + +--------- 2nd pattern computation GEMM, SpMM + +kernel execution time: 2.2757 ms +ref3 GeMM template time: 2.64863 + +kernel execution time: 1.04819 ms +SpMM template time ref4: 1.27491 + +-------- reference pattern computation + +kernel execution time: 69.4126 ms 
+taco reference time: 71.9418 + +kernel execution time: 71.8522 ms +taco reference new time: 72.137 +filenum: 3 +--------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 +C1_dimension: 36417, C2_dimension: 128, vals: 4661376 +D1_dimension: 128, D2_dimension: 64, vals: 8192 + + +kernel execution time: 7.47716 ms +fused time: 11.1061 + +--------- 1st pattern computation TTM, GEMM + +kernel execution time: 16.7215 ms +SpMM time: 17.3352 + +kernel execution time: 7.10234 ms +SpMM template time: 7.68864 + +kernel execution time: 6.44691 ms +GeMM time: 9.89357 + +kernel execution time: 275.868 ms +ref 2 GeMM template time: 276.795 + +--------- 2nd pattern computation GEMM, SpMM + +kernel execution time: 6.21948 ms +ref3 GeMM template time: 6.86379 + +kernel execution time: 4.55999 ms +SpMM template time ref4: 4.85255 + +-------- reference pattern computation + +kernel execution time: 643.662 ms +taco reference time: 644.221 + +kernel execution time: 682.88 ms +taco reference new time: 683.468 +filenum: 4 +--------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/rma10/rma10.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 46835, B2_dimension: 46835, vals: 2374001 +C1_dimension: 46835, C2_dimension: 128, vals: 5994880 +D1_dimension: 128, D2_dimension: 64, vals: 8192 + + +kernel execution time: 7.25024 ms +fused time: 11.0411 + +--------- 1st pattern computation TTM, GEMM + +kernel execution time: 18.4386 ms +SpMM time: 18.956 + +kernel execution time: 6.48062 ms +SpMM template time: 7.03658 + +kernel execution time: 7.9428 ms +GeMM time: 9.42206 + +kernel execution time: 343.414 ms +ref 2 GeMM template time: 343.746 + +--------- 2nd pattern computation GEMM, SpMM + +kernel execution time: 6.9495 ms +ref3 GeMM template time: 7.40299 + +kernel execution time: 4.95305 ms +SpMM template time ref4: 5.26981 + 
+-------- reference pattern computation + +kernel execution time: 338.889 ms +taco reference time: 339.74 + +kernel execution time: 373.621 ms +taco reference new time: 374.075 +filenum: 5 +--------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/cant/cant.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 62451, B2_dimension: 62451, vals: 4007383 +C1_dimension: 62451, C2_dimension: 128, vals: 7993728 +D1_dimension: 128, D2_dimension: 64, vals: 8192 + + +kernel execution time: 11.3714 ms +fused time: 15.0722 + +--------- 1st pattern computation TTM, GEMM + +kernel execution time: 22.4213 ms +SpMM time: 22.9773 + +kernel execution time: 11.8747 ms +SpMM template time: 12.4314 + +kernel execution time: 10.2572 ms +GeMM time: 12.818 + +kernel execution time: 451.818 ms +ref 2 GeMM template time: 452.131 + +--------- 2nd pattern computation GEMM, SpMM + +kernel execution time: 9.4658 ms +ref3 GeMM template time: 9.90856 + +kernel execution time: 6.97316 ms +SpMM template time ref4: 7.30846 + +-------- reference pattern computation + +kernel execution time: 543.932 ms +taco reference time: 544.422 + +kernel execution time: 623.419 ms +taco reference new time: 623.935 +filenum: 6 +--------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/consph/consph.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 83334, B2_dimension: 83334, vals: 6010480 +C1_dimension: 83334, C2_dimension: 128, vals: 10666752 +D1_dimension: 128, D2_dimension: 64, vals: 8192 + + +kernel execution time: 15.18 ms +fused time: 18.5471 + +--------- 1st pattern computation TTM, GEMM + +kernel execution time: 31.3038 ms +SpMM time: 31.9251 + +kernel execution time: 16.4816 ms +SpMM template time: 17.0655 + +kernel execution time: 13.7454 ms +GeMM time: 14.2668 + +kernel execution time: 601.657 ms +ref 2 GeMM template time: 602.024 + +--------- 2nd pattern computation GEMM, SpMM + +kernel execution time: 14.354 ms +ref3 GeMM template time: 14.8072 + 
+kernel execution time: 9.41569 ms +SpMM template time ref4: 9.77992 + +-------- reference pattern computation + +kernel execution time: 805.535 ms +taco reference time: 806.106 + +kernel execution time: 928.447 ms +taco reference new time: 928.999 +filenum: 7 +--------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 121192, B2_dimension: 121192, vals: 2624331 +C1_dimension: 121192, C2_dimension: 128, vals: 15512576 +D1_dimension: 128, D2_dimension: 64, vals: 8192 + + +kernel execution time: 25.2666 ms +fused time: 27.8771 + +--------- 1st pattern computation TTM, GEMM + +kernel execution time: 51.9104 ms +SpMM time: 52.5127 + +kernel execution time: 23.9709 ms +SpMM template time: 24.5371 + +kernel execution time: 19.8979 ms +GeMM time: 20.5052 + +kernel execution time: 878.762 ms +ref 2 GeMM template time: 879.166 + +--------- 2nd pattern computation GEMM, SpMM + +kernel execution time: 16.9454 ms +ref3 GeMM template time: 17.4072 + +kernel execution time: 12.6943 ms +SpMM template time ref4: 13.1204 + +-------- reference pattern computation + +kernel execution time: 356.591 ms +taco reference time: 357.146 + +kernel execution time: 408.529 ms +taco reference new time: 409.172 +filenum: 8 +--------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/shipsec1/shipsec1.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 140874, B2_dimension: 140874, vals: 7813404 +C1_dimension: 140874, C2_dimension: 128, vals: 18031872 +D1_dimension: 128, D2_dimension: 64, vals: 8192 + + +kernel execution time: 22.2469 ms +fused time: 22.8567 + +--------- 1st pattern computation TTM, GEMM + +kernel execution time: 49.6959 ms +SpMM time: 50.3273 + +kernel execution time: 24.2333 ms +SpMM template time: 24.8116 + +kernel execution time: 23.0719 ms +GeMM time: 23.6169 + +kernel execution time: 1017.55 ms +ref 2 GeMM template time: 1018 + +--------- 2nd pattern 
computation GEMM, SpMM + +kernel execution time: 19.3601 ms +ref3 GeMM template time: 19.8249 + +kernel execution time: 14.2804 ms +SpMM template time ref4: 14.7665 + +-------- reference pattern computation + +kernel execution time: 1048.84 ms +taco reference time: 1049.44 + +kernel execution time: 1209.88 ms +taco reference new time: 1210.47 +filenum: 9 +--------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/scircuit/scircuit.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 170998, B2_dimension: 170998, vals: 958936 +C1_dimension: 170998, C2_dimension: 128, vals: 21887744 +D1_dimension: 128, D2_dimension: 64, vals: 8192 + + +kernel execution time: 15.8746 ms +fused time: 19.813 + +--------- 1st pattern computation TTM, GEMM + +kernel execution time: 55.9723 ms +SpMM time: 56.6152 + +kernel execution time: 17.9806 ms +SpMM template time: 18.623 + +kernel execution time: 27.7406 ms +GeMM time: 28.4557 + +kernel execution time: 1236.24 ms +ref 2 GeMM template time: 1236.69 + +--------- 2nd pattern computation GEMM, SpMM + +kernel execution time: 23.8143 ms +ref3 GeMM template time: 24.2887 + +kernel execution time: 10.5388 ms +SpMM template time ref4: 11.0342 + +-------- reference pattern computation + +kernel execution time: 131.162 ms +taco reference time: 131.729 + +kernel execution time: 142.639 ms +taco reference new time: 143.262 +filenum: 10 +--------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 206500, B2_dimension: 206500, vals: 1273389 +C1_dimension: 206500, C2_dimension: 128, vals: 26432000 +D1_dimension: 128, D2_dimension: 64, vals: 8192 + + +kernel execution time: 22.0414 ms +fused time: 24.5186 + +--------- 1st pattern computation TTM, GEMM + +kernel execution time: 69.6038 ms +SpMM time: 70.136 + +kernel execution time: 24.6489 ms +SpMM template time: 25.1488 + +kernel execution time: 33.413 ms +GeMM 
time: 33.9108 + +kernel execution time: 1497.05 ms +ref 2 GeMM template time: 1497.51 + +--------- 2nd pattern computation GEMM, SpMM + +kernel execution time: 29.3442 ms +ref3 GeMM template time: 29.8157 + +kernel execution time: 12.9244 ms +SpMM template time ref4: 13.3503 + +-------- reference pattern computation + +kernel execution time: 174.347 ms +taco reference time: 174.811 + +kernel execution time: 190.408 ms +taco reference new time: 190.973 +filenum: 12 +--------------------------------- +/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 1000005, B2_dimension: 1000005, vals: 3105536 +C1_dimension: 1000005, C2_dimension: 128, vals: 128000640 +D1_dimension: 128, D2_dimension: 64, vals: 8192 + + +kernel execution time: 61.219 ms +fused time: 65.9604 + +--------- 1st pattern computation TTM, GEMM + +kernel execution time: 329.098 ms +SpMM time: 329.782 + +kernel execution time: 80.1902 ms +SpMM template time: 80.758 + +kernel execution time: 154.474 ms +GeMM time: 155.08 + +kernel execution time: 7192.75 ms +ref 2 GeMM template time: 7193.76 + +--------- 2nd pattern computation GEMM, SpMM + +kernel execution time: 132.057 ms +ref3 GeMM template time: 132.561 + +kernel execution time: 43.0394 ms +SpMM template time ref4: 43.6558 + +-------- reference pattern computation + +kernel execution time: 430.157 ms +taco reference time: 430.825 + +kernel execution time: 463.848 ms +taco reference new time: 464.498 +filenum: 15 +--------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 5558326, B2_dimension: 5558326, vals: 59524291 +C1_dimension: 5558326, C2_dimension: 128, vals: 711465728 +D1_dimension: 128, D2_dimension: 64, vals: 8192 + + +kernel execution time: 602.9 ms +fused time: 606.764 + +--------- 1st pattern computation TTM, GEMM + +kernel execution time: 2126.86 ms +SpMM time: 2127.49 + 
+kernel execution time: 871.892 ms +SpMM template time: 872.491 + +kernel execution time: 845.837 ms +GeMM time: 846.363 + +kernel execution time: 39844.5 ms +ref 2 GeMM template time: 39845.6 + +--------- 2nd pattern computation GEMM, SpMM + +kernel execution time: 740.208 ms +ref3 GeMM template time: 740.701 + +kernel execution time: 447.66 ms +SpMM template time ref4: 448.268 + +-------- reference pattern computation + +kernel execution time: 18669.7 ms +taco reference time: 18671 + +kernel execution time: 26729.8 ms +taco reference new time: 26731.1 + +spmm-spmm execution + +----------------------------------------- +filenum: 1 +--------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 5, B2_dimension: 5, vals: 19 +C1_dimension: 5, C2_dimension: 128, vals: 640 +D1_dimension: 128, D2_dimension: 64, vals: 8192 + + +kernel execution time: 0.044111 ms +fused time: 0.69912 + +--------- 1st pattern computation TTM, GEMM + +kernel execution time: 0.019191 ms +SpMM time: 1.30214 + +kernel execution time: 0.499717 ms +SpMM template time: 1.01315 + +kernel execution time: 0.096371 ms +GeMM time: 0.631739 + +kernel execution time: 0.070191 ms +ref 2 GeMM template time: 0.560537 + +--------- 2nd pattern computation GEMM, SpMM + +kernel execution time: 0.070901 ms +ref3 GeMM template time: 0.579358 + +kernel execution time: 0.02984 ms +SpMM template time ref4: 0.851161 + +-------- reference pattern computation + +kernel execution time: 0.194393 ms +taco reference time: 0.628889 + +kernel execution time: 0.242974 ms +taco reference new time: 0.667439 + +spmm-spmm execution + +----------------------------------------- +filenum: 1 +--------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 5, B2_dimension: 5, vals: 19 +C1_dimension: 5, C2_dimension: 128, vals: 640 +D1_dimension: 128, D2_dimension: 
64, vals: 8192 + + +kernel execution time: 0.043801 ms +fused time: 0.685989 + +--------- 1st pattern computation TTM, GEMM + +kernel execution time: 0.01878 ms +SpMM time: 0.861191 + +kernel execution time: 0.503617 ms +SpMM template time: 1.00581 + +kernel execution time: 0.095292 ms +GeMM time: 0.583898 + +kernel execution time: 0.070121 ms +ref 2 GeMM template time: 0.520137 + +--------- 2nd pattern computation GEMM, SpMM + +kernel execution time: 0.070641 ms +ref3 GeMM template time: 0.537688 + +kernel execution time: 0.035491 ms +SpMM template time ref4: 0.514717 + +-------- reference pattern computation + +kernel execution time: 0.194192 ms +taco reference time: 0.618658 + +kernel execution time: 0.239543 ms +taco reference new time: 0.655149 + +spmm-spmm execution + +----------------------------------------- +filenum: 1 +--------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 5, B2_dimension: 5, vals: 19 +C1_dimension: 5, C2_dimension: 128, vals: 640 +D1_dimension: 128, D2_dimension: 64, vals: 8192 + + +kernel execution time: 0.04383 ms +fused time: 0.680319 + +--------- 1st pattern computation TTM, GEMM + +kernel execution time: 0.019891 ms +SpMM time: 0.72453 + +kernel execution time: 0.515237 ms +SpMM template time: 0.995294 + +kernel execution time: 0.095731 ms +GeMM time: 0.628018 + +kernel execution time: 0.071101 ms +ref 2 GeMM template time: 0.539967 + +--------- 2nd pattern computation GEMM, SpMM + +kernel execution time: 0.071171 ms +ref3 GeMM template time: 0.592848 + +kernel execution time: 0.029131 ms +SpMM template time ref4: 0.582288 + +-------- reference pattern computation + +kernel execution time: 0.254484 ms +taco reference time: 0.768111 + +kernel execution time: 0.273853 ms +taco reference new time: 0.781751 + +spmm-spmm execution + +----------------------------------------- +filenum: 1 +--------------------------------- 
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 5, B2_dimension: 5, vals: 19 +C1_dimension: 5, C2_dimension: 128, vals: 640 +D1_dimension: 128, D2_dimension: 64, vals: 8192 + + +kernel execution time: 0.043111 ms +fused time: 0.676409 + +--------- 1st pattern computation TTM, GEMM + +kernel execution time: 0.01898 ms +SpMM time: 0.836491 + +kernel execution time: 0.489586 ms +SpMM template time: 0.969303 + +kernel execution time: 0.094641 ms +GeMM time: 0.561697 + +kernel execution time: 0.070251 ms +ref 2 GeMM template time: 0.545778 + +--------- 2nd pattern computation GEMM, SpMM + +kernel execution time: 0.07045 ms +ref3 GeMM template time: 0.550897 + +kernel execution time: 0.0282 ms +SpMM template time ref4: 0.463227 + +-------- reference pattern computation + +kernel execution time: 0.245783 ms +taco reference time: 0.761711 + +kernel execution time: 0.304314 ms +taco reference new time: 0.834081 + +spmm-spmm execution + +----------------------------------------- +filenum: 1 +--------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 5, B2_dimension: 5, vals: 19 +C1_dimension: 5, C2_dimension: 128, vals: 640 +D1_dimension: 128, D2_dimension: 64, vals: 8192 + + +kernel execution time: 0.03874 ms +fused time: 0.669969 + +--------- 1st pattern computation TTM, GEMM + +kernel execution time: 0.019931 ms +SpMM time: 0.857531 + +kernel execution time: 0.507936 ms +SpMM template time: 1.00321 + +kernel execution time: 0.093961 ms +GeMM time: 0.727229 + +kernel execution time: 0.070371 ms +ref 2 GeMM template time: 0.867451 + +--------- 2nd pattern computation GEMM, SpMM + +kernel execution time: 0.069541 ms +ref3 GeMM template time: 0.546687 + +kernel execution time: 0.02565 ms +SpMM template time ref4: 0.541707 + +-------- reference pattern computation + +kernel execution time: 0.195092 ms +taco reference 
time: 0.615338 + +kernel execution time: 0.239653 ms +taco reference new time: 0.657449 + +spmm-spmm execution + +----------------------------------------- +filenum: 3 +--------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 +C1_dimension: 36417, C2_dimension: 128, vals: 4661376 +D1_dimension: 128, D2_dimension: 64, vals: 8192 + + +kernel execution time: 202.946 ms +fused time: 203.369 + +--------- 1st pattern computation TTM, GEMM + +kernel execution time: 604.532 ms +SpMM time: 605.081 + +kernel execution time: 137.88 ms +SpMM template time: 138.397 + +kernel execution time: 281.01 ms +GeMM time: 281.522 + +kernel execution time: 267.152 ms +ref 2 GeMM template time: 267.64 + +--------- 2nd pattern computation GEMM, SpMM + +kernel execution time: 202.612 ms +ref3 GeMM template time: 203.13 + +kernel execution time: 72.1263 ms +SpMM template time ref4: 72.634 + +-------- reference pattern computation + +kernel execution time: 26464.3 ms +taco reference time: 26465.4 + +kernel execution time: 34639.1 ms +taco reference new time: 34640.2 + +spmm-spmm execution + +----------------------------------------- +filenum: 3 +--------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 +C1_dimension: 36417, C2_dimension: 128, vals: 4661376 +D1_dimension: 128, D2_dimension: 64, vals: 8192 + + +--------- 1st pattern computation TTM, GEMM + +kernel execution time: 601.783 ms +SpMM time: 602.149 + +kernel execution time: 135.443 ms +SpMM template time: 135.968 + +kernel execution time: 277.027 ms +GeMM time: 277.575 + +kernel execution time: 262.418 ms +ref 2 GeMM template time: 262.884 + +--------- 2nd pattern computation GEMM, SpMM + +kernel execution time: 200.17 ms +ref3 GeMM template time: 200.726 + 
+kernel execution time: 71.523 ms +SpMM template time ref4: 72.0077 + +-------- reference pattern computation + +kernel execution time: 26468.2 ms +taco reference time: 26469.2 + +spmm-spmm execution + +----------------------------------------- +filenum: 3 +--------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 +C1_dimension: 36417, C2_dimension: 128, vals: 4661376 +D1_dimension: 128, D2_dimension: 64, vals: 8192 + + +--------- 1st pattern computation TTM, GEMM + +kernel execution time: 600.837 ms +SpMM time: 601.215 + +kernel execution time: 137.481 ms +SpMM template time: 138.009 + +kernel execution time: 280.631 ms +GeMM time: 281.208 + +kernel execution time: 266.073 ms +ref 2 GeMM template time: 266.549 + +--------- 2nd pattern computation GEMM, SpMM + +kernel execution time: 200.674 ms +ref3 GeMM template time: 201.238 + +kernel execution time: 72.8548 ms +SpMM template time ref4: 73.3562 + +-------- reference pattern computation + +kernel execution time: 26717.7 ms +taco reference time: 26718.7 + +kernel execution time: 34613.6 ms +taco reference new time: 34614.6 + +kernel execution time: 202.425 ms +fused time: 203.027 + +spmm-spmm execution + +----------------------------------------- +filenum: 3 +--------------------------------- + +spmm-spmm execution + +----------------------------------------- +filenum: 0 +--------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 2708, B2_dimension: 2708, vals: 5429 +C1_dimension: 2708, C2_dimension: 128, vals: 346624 +D1_dimension: 128, D2_dimension: 64, vals: 8192 + + +--------- 1st pattern computation TTM, GEMM + +kernel execution time: 0.924512 ms +SpMM time: 1.22967 + +kernel execution time: 1.23287 ms +SpMM template time: 1.51353 + +kernel execution time: 20.7805 ms +GeMM time: 
21.0769 + +kernel execution time: 19.6116 ms +ref 2 GeMM template time: 19.8379 + +--------- 2nd pattern computation GEMM, SpMM + +kernel execution time: 14.7563 ms +ref3 GeMM template time: 15.0245 + +kernel execution time: 0.823641 ms +SpMM template time ref4: 1.05233 + +-------- reference pattern computation + +kernel execution time: 34.1041 ms +taco reference time: 34.4607 + +kernel execution time: 41.9195 ms +taco reference new time: 42.2061 + +kernel execution time: 4.76242 ms +fused time: 5.04101 +filenum: 1 +--------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/amazon.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 548551, B2_dimension: 548551, vals: 1851744 +C1_dimension: 548551, C2_dimension: 128, vals: 70214528 +D1_dimension: 128, D2_dimension: 64, vals: 8192 + + +--------- 1st pattern computation TTM, GEMM + +kernel execution time: 394.8 ms +SpMM time: 395.503 + +kernel execution time: 473.148 ms +SpMM template time: 473.684 + +kernel execution time: 4117.68 ms +GeMM time: 4118.6 + +kernel execution time: 3957.31 ms +ref 2 GeMM template time: 3958.16 + +--------- 2nd pattern computation GEMM, SpMM + +kernel execution time: 3017.13 ms +ref3 GeMM template time: 3017.67 + +kernel execution time: 314.652 ms +SpMM template time ref4: 315.164 + +-------- reference pattern computation + +kernel execution time: 11644.6 ms +taco reference time: 11645.6 + +kernel execution time: 14402.6 ms +taco reference new time: 14403.6 + +kernel execution time: 1261.33 ms +fused time: 1261.88 + +spmm-spmm execution + +----------------------------------------- +filenum: 0 +--------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 2708, B2_dimension: 2708, vals: 5429 +C1_dimension: 2708, C2_dimension: 128, vals: 346624 +D1_dimension: 128, D2_dimension: 64, vals: 8192 + + +--------- 1st pattern computation TTM, GEMM + +kernel execution time: 
0.209133 ms +SpMM time: 0.517016 + +kernel execution time: 0.579748 ms +SpMM template time: 0.864251 + +kernel execution time: 1.0574 ms +GeMM time: 1.37727 + +kernel execution time: 19.621 ms +ref 2 GeMM template time: 19.8504 + +--------- 2nd pattern computation GEMM, SpMM + +kernel execution time: 1.44618 ms +ref3 GeMM template time: 1.72243 + +kernel execution time: 0.384425 ms +SpMM template time ref4: 0.610708 + +-------- reference pattern computation + +kernel execution time: 3.59893 ms +taco reference time: 3.95508 + +kernel execution time: 4.81855 ms +taco reference new time: 5.10349 + +kernel execution time: 1.47107 ms +fused time: 1.90463 +filenum: 1 +--------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/amazon.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 548551, B2_dimension: 548551, vals: 1851744 +C1_dimension: 548551, C2_dimension: 128, vals: 70214528 +D1_dimension: 128, D2_dimension: 64, vals: 8192 + + +--------- 1st pattern computation TTM, GEMM + +kernel execution time: 50.1795 ms +SpMM time: 50.5567 + +kernel execution time: 64.2504 ms +SpMM template time: 64.8179 + +kernel execution time: 96.8464 ms +GeMM time: 97.4123 + +kernel execution time: 3949.87 ms +ref 2 GeMM template time: 3950.93 + +--------- 2nd pattern computation GEMM, SpMM + +kernel execution time: 123.802 ms +ref3 GeMM template time: 124.342 + +kernel execution time: 39.2723 ms +SpMM template time ref4: 39.8322 + +-------- reference pattern computation + +kernel execution time: 457.271 ms +taco reference time: 457.979 + +kernel execution time: 427.194 ms +taco reference new time: 427.789 + +kernel execution time: 93.1417 ms +fused time: 93.7188 diff --git a/test/stats/spmv-spmv.txt b/test/stats/spmv-spmv.txt new file mode 100644 index 000000000..90b7482e7 --- /dev/null +++ b/test/stats/spmv-spmv.txt @@ -0,0 +1,81 @@ + +spmv-spmv execution + +----------------------------------------- +A(i) = B(i,j) * C(j,k) * v(k); +B1_dimension: 5, 
B2_dimension: 5, vals: 19 +C1_dimension: 5, C2_dimension: 5, vals: 19 +D1_dimension: 5, vals: 5 + + +spmv-spmv execution + +----------------------------------------- +A(i) = B(i,j) * C(j,k) * v(k); +B1_dimension: 5, B2_dimension: 5, vals: 19 +C1_dimension: 5, C2_dimension: 5, vals: 19 +D1_dimension: 5, vals: 5 + + +spmv-spmv execution + +----------------------------------------- +A(i) = B(i,j) * C(j,k) * v(k); +B1_dimension: 5, B2_dimension: 5, vals: 19 +C1_dimension: 5, C2_dimension: 5, vals: 19 +D1_dimension: 5, vals: 5 + + +spmv-spmv execution + +----------------------------------------- +A(i) = B(i,j) * C(j,k) * v(k); +B1_dimension: 5, B2_dimension: 5, vals: 19 +C1_dimension: 5, C2_dimension: 5, vals: 19 +D1_dimension: 5, vals: 5 + + +spmv-spmv execution + +----------------------------------------- +A(i) = B(i,j) * C(j,k) * v(k); +B1_dimension: 5, B2_dimension: 5, vals: 19 +C1_dimension: 5, C2_dimension: 5, vals: 19 +D1_dimension: 5, vals: 5 + + +spmv-spmv execution + +----------------------------------------- +A(i) = B(i,j) * C(j,k) * v(k); +B1_dimension: 5, B2_dimension: 5, vals: 19 +C1_dimension: 5, C2_dimension: 5, vals: 19 +D1_dimension: 5, vals: 5 + + +spmv-spmv execution + +----------------------------------------- +A(i) = B(i,j) * C(j,k) * v(k); +B1_dimension: 5, B2_dimension: 5, vals: 19 +C1_dimension: 5, C2_dimension: 5, vals: 19 +D1_dimension: 5, vals: 5 + + +spmv-spmv execution + +----------------------------------------- +A(i) = B(i,j) * C(j,k) * v(k); +B1_dimension: 5, B2_dimension: 5, vals: 19 +C1_dimension: 5, C2_dimension: 5, vals: 19 +D1_dimension: 5, vals: 5 + + +spmv-spmv execution + +----------------------------------------- +A(i) = B(i,j) * C(j,k) * v(k); +B1_dimension: 5, B2_dimension: 5, vals: 19 +C1_dimension: 5, C2_dimension: 5, vals: 19 +D1_dimension: 5, vals: 5 + diff --git a/test/stats/ttm-ttm.txt b/test/stats/ttm-ttm.txt new file mode 100644 index 000000000..7080af67b --- /dev/null +++ b/test/stats/ttm-ttm.txt @@ -0,0 +1,2924 @@ 
+ttm-ttm execution + +----------------------------------------- +A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) +/home/min/a/kadhitha/ispc-examples/data/tns/delicious-3d.tns +B1_dimension: 532924, B2_dimension: 17262471, B3_dimension: 532924, vals: 140126181 +C1_dimension: 2480308, C2_dimension: 16, vals: 39684928 +D1_dimension: 16, D2_dimension: 16, vals: 256 + + +kernel execution time: 6299.03 ms +fused time: 6300.12 + +kernel execution time: 21080.2 ms +reference time: 21081.3 + +kernel execution time: 2757.48 ms +reference time: 2757.94 + +kernel execution time: 5064.84 ms +reference time: 5065.87 + +ttm-ttm execution + +----------------------------------------- +A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) +/home/min/a/kadhitha/ispc-examples/data/tns/flickr-3d.tns +B1_dimension: 319686, B2_dimension: 28153045, B3_dimension: 319686, vals: 112890310 +C1_dimension: 1607191, C2_dimension: 16, vals: 25715056 +D1_dimension: 16, D2_dimension: 16, vals: 256 + + +kernel execution time: 3709.97 ms +fused time: 3711.05 + +kernel execution time: 16159.4 ms +reference time: 16160.5 + +kernel execution time: 1773.12 ms +reference time: 1773.58 + +kernel execution time: 3030.89 ms +reference time: 3031.42 + +ttm-ttm execution + +------------------------------------------ +A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) +/home/min/a/kadhitha/ispc-examples/data/tns/nell-2.tns +B1_dimension: 12092, B2_dimension: 9184, B3_dimension: 12092, vals: 76879419 +C1_dimension: 28818, C2_dimension: 16, vals: 461088 +D1_dimension: 16, D2_dimension: 16, vals: 256 + + +kernel execution time: 487.016 ms +fused time: 487.513 + +kernel execution time: 11041.9 ms +reference time: 11043 + +kernel execution time: 1009.63 ms +reference time: 1010.12 + +kernel execution time: 37.1546 ms +reference time: 37.757 + + +ttm-ttm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/nell-1.tns +A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) +B1_dimension: 2902330, B2_dimension: 2143368, 
B3_dimension: 2902330, vals: 143599552 +C1_dimension: 25495389, C2_dimension: 16, vals: 407926224 +D1_dimension: 16, D2_dimension: 16, vals: 256 + + +kernel execution time: 11984.9 ms +fused time: 11985.9 + +kernel execution time: 34959 ms +reference time: 34960.1 + +kernel execution time: 8476.95 ms +reference time: 8477.9 + +kernel execution time: 1869.85 ms +reference time: 1870.39 + +ttm-ttm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/vast-2015-mc1-3d.tns +A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) +B1_dimension: 165427, B2_dimension: 11374, B3_dimension: 165427, vals: 26021854 +C1_dimension: 2, C2_dimension: 16, vals: 32 +D1_dimension: 16, D2_dimension: 16, vals: 256 + + +kernel execution time: 2730.05 ms +fused time: 2731.15 + +kernel execution time: 4167.74 ms +reference time: 4168.86 + +kernel execution time: 550.937 ms +reference time: 551.395 + +kernel execution time: 2788.55 ms +reference time: 2789.07 + +ttm-ttm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/1998DARPA.tns +A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) +B1_dimension: 22476, B2_dimension: 22476, B3_dimension: 22476, vals: 28421307 +C1_dimension: 23776223, C2_dimension: 16, vals: 380419568 +D1_dimension: 16, D2_dimension: 16, vals: 256 + + +ttm-ttm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/freebase_music.tns +A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) +B1_dimension: 23343790, B2_dimension: 23344784, B3_dimension: 23343790, vals: 99546550 +C1_dimension: 166, C2_dimension: 16, vals: 2656 +D1_dimension: 16, D2_dimension: 16, vals: 256 + + +kernel execution time: 10491.6 ms +fused time: 10492.7 + +kernel execution time: 15968 ms +reference time: 15969.1 + +kernel execution time: 1886.09 ms +reference time: 1886.55 + +kernel execution time: 10763.7 ms +reference time: 10765 + +ttm-ttm execution + +----------------------------------------- 
+/home/min/a/kadhitha/ispc-examples/data/tns/darpa1998.tns +A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) +B1_dimension: 22476, B2_dimension: 22476, B3_dimension: 22476, vals: 28421307 +C1_dimension: 23776223, C2_dimension: 16, vals: 380419568 +D1_dimension: 16, D2_dimension: 16, vals: 256 + + +kernel execution time: 847.087 ms +fused time: 847.588 + +kernel execution time: 7136.54 ms +reference time: 7137.57 + +kernel execution time: 1340.45 ms +reference time: 1340.91 + +kernel execution time: 8.28247 ms +reference time: 8.80899 + +ttm-ttm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/freebase_sampled.tns +A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) +B1_dimension: 38954435, B2_dimension: 38955429, B3_dimension: 38954435, vals: 139920770 +C1_dimension: 532, C2_dimension: 16, vals: 8512 +D1_dimension: 16, D2_dimension: 16, vals: 256 + + +ttm-ttm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/freebase_music.tns +A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) +B1_dimension: 23343790, B2_dimension: 23344784, B3_dimension: 23343790, vals: 99546550 +C1_dimension: 166, C2_dimension: 16, vals: 2656 +D1_dimension: 16, D2_dimension: 16, vals: 256 + + +kernel execution time: 10540.6 ms +fused time: 10541.6 + +kernel execution time: 16072 ms +reference time: 16073 + +kernel execution time: 1900.39 ms +reference time: 1900.89 + +kernel execution time: 10819.5 ms +reference time: 10820.5 + +ttm-ttm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/freebase_sampled.tns +A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) +B1_dimension: 38954435, B2_dimension: 38955429, B3_dimension: 38954435, vals: 139920770 +C1_dimension: 532, C2_dimension: 16, vals: 8512 +D1_dimension: 16, D2_dimension: 16, vals: 256 + + +ttm-ttm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/flickr-3d.tns +A(i,j,m) = B(i,j,k) * C(k,l) * 
D(l,m) +B1_dimension: 319686, B2_dimension: 28153045, B3_dimension: 319686, vals: 112890310 +C1_dimension: 1607191, C2_dimension: 16, vals: 25715056 +D1_dimension: 16, D2_dimension: 16, vals: 256 + + +kernel execution time: 3689.85 ms +fused time: 3690.99 + +kernel execution time: 16162.6 ms +reference time: 16163.7 + +kernel execution time: 2035.42 ms +TTM1: 2035.96 + +kernel execution time: 3004.2 ms +TTM2: 3004.74 + +kernel execution time: 147.233 ms +dense: 147.648 + +kernel execution time: 2240.45 ms +TTM after dense: 2240.96 + +ttm-ttm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/flickr-3d.tns +A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) +B1_dimension: 319686, B2_dimension: 28153045, B3_dimension: 319686, vals: 112890310 +C1_dimension: 1607191, C2_dimension: 16, vals: 25715056 +D1_dimension: 16, D2_dimension: 1024, vals: 16384 + + +ttm-ttm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/nell-2.tns +A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) +B1_dimension: 12092, B2_dimension: 9184, B3_dimension: 12092, vals: 76879419 +C1_dimension: 28818, C2_dimension: 16, vals: 461088 +D1_dimension: 16, D2_dimension: 32, vals: 512 + + +kernel execution time: 542.361 ms +fused time: 542.813 + +kernel execution time: 22547.6 ms +reference time: 22548.6 + +kernel execution time: 1008.25 ms +TTM1: 1008.82 + +kernel execution time: 70.7434 ms +TTM2: 71.2926 + +kernel execution time: 5.2174 ms +dense: 5.58699 + +kernel execution time: 2086.85 ms +TTM after dense: 2087.25 + +ttm-ttm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/nell-2.tns +A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) +B1_dimension: 12092, B2_dimension: 9184, B3_dimension: 12092, vals: 76879419 +C1_dimension: 28818, C2_dimension: 16, vals: 461088 +D1_dimension: 16, D2_dimension: 16, vals: 256 + + +kernel execution time: 531.924 ms +fused time: 532.696 + +kernel execution 
time: 11314 ms +reference time: 11315.1 + +kernel execution time: 1009.54 ms +TTM1: 1010.08 + +kernel execution time: 37.5466 ms +TTM2: 38.0867 + +kernel execution time: 2.77519 ms +dense: 3.13589 + +kernel execution time: 1014.37 ms +TTM after dense: 1014.74 + +ttm-ttm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/nell-2.tns +A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) +B1_dimension: 12092, B2_dimension: 9184, B3_dimension: 12092, vals: 76879419 +C1_dimension: 28818, C2_dimension: 16, vals: 461088 +D1_dimension: 16, D2_dimension: 64, vals: 1024 + + +kernel execution time: 604.787 ms +fused time: 605.25 + +kernel execution time: 45011.1 ms +reference time: 45012.2 + +kernel execution time: 1008.41 ms +TTM1: 1008.97 + +kernel execution time: 137.791 ms +TTM2: 138.316 + +kernel execution time: 10.0591 ms +dense: 10.4452 + +kernel execution time: 5120.5 ms +TTM after dense: 5121.57 + +ttm-ttm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/matmul_5-5-5.tns +A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) +B1_dimension: 25, B2_dimension: 25, B3_dimension: 25, vals: 125 +C1_dimension: 25, C2_dimension: 32, vals: 800 +D1_dimension: 32, D2_dimension: 64, vals: 2048 + + +kernel execution time: 0.129572 ms +fused time: 0.560598 + +kernel execution time: 0.151942 ms +reference time: 0.999013 + +kernel execution time: 0.01803 ms +TTM1: 0.310364 + +kernel execution time: 0.119052 ms +TTM2: 0.897713 + +kernel execution time: 0.093421 ms +dense: 0.284444 + +kernel execution time: 0.032111 ms +TTM after dense: 0.662509 + +ttm-ttm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/matmul_5-5-5.tns +A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) +B1_dimension: 25, B2_dimension: 25, B3_dimension: 25, vals: 125 +C1_dimension: 25, C2_dimension: 32, vals: 800 +D1_dimension: 32, D2_dimension: 64, vals: 2048 + + +kernel execution time: 0.136562 ms +fused time: 
0.555088 + +kernel execution time: 0.155282 ms +reference time: 1.02811 + +kernel execution time: 0.01913 ms +TTM1: 0.293014 + +kernel execution time: 0.148032 ms +TTM2: 1.08159 + +kernel execution time: 0.093351 ms +dense: 0.282434 + +kernel execution time: 0.03336 ms +TTM after dense: 0.309775 + +ttm-ttm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/matmul_5-5-5.tns +A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) +B1_dimension: 25, B2_dimension: 25, B3_dimension: 25, vals: 125 +C1_dimension: 25, C2_dimension: 32, vals: 800 +D1_dimension: 32, D2_dimension: 64, vals: 2048 + + +kernel execution time: 0.133302 ms +fused time: 0.590248 + +kernel execution time: 0.154633 ms +reference time: 0.976683 + +kernel execution time: 0.032061 ms +TTM1: 0.554668 + +kernel execution time: 0.231943 ms +TTM2: 0.790901 + +kernel execution time: 0.093152 ms +dense: 0.456727 + +kernel execution time: 0.168413 ms +TTM after dense: 0.866702 + +ttm-ttm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/matmul_5-5-5.tns +A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) +B1_dimension: 25, B2_dimension: 25, B3_dimension: 25, vals: 125 +C1_dimension: 25, C2_dimension: 32, vals: 800 +D1_dimension: 32, D2_dimension: 64, vals: 2048 + + +kernel execution time: 0.211383 ms +fused time: 0.979204 + +kernel execution time: 0.300854 ms +reference time: 0.976764 + +kernel execution time: 0.03182 ms +TTM1: 0.986423 + +kernel execution time: 0.223513 ms +TTM2: 1.25582 + +kernel execution time: 0.140142 ms +dense: 0.491247 + +kernel execution time: 0.057651 ms +TTM after dense: 0.632639 + +ttm-ttm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/matmul_5-5-5.tns +A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) +B1_dimension: 25, B2_dimension: 25, B3_dimension: 25, vals: 125 +C1_dimension: 25, C2_dimension: 32, vals: 800 +D1_dimension: 32, D2_dimension: 64, vals: 2048 + + +kernel 
execution time: 0.226813 ms +fused time: 0.981434 + +kernel execution time: 0.299435 ms +reference time: 0.980784 + +kernel execution time: 0.03171 ms +TTM1: 1.17345 + +kernel execution time: 0.236723 ms +TTM2: 1.08452 + +kernel execution time: 0.099581 ms +dense: 0.448246 + +kernel execution time: 0.055691 ms +TTM after dense: 0.595948 + +ttm-ttm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/matmul_5-5-5.tns +A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) +B1_dimension: 25, B2_dimension: 25, B3_dimension: 25, vals: 125 +C1_dimension: 25, C2_dimension: 32, vals: 800 +D1_dimension: 32, D2_dimension: 64, vals: 2048 + + +kernel execution time: 0.183452 ms +fused time: 0.934223 + +kernel execution time: 0.258304 ms +reference time: 1.14423 + +kernel execution time: 0.028031 ms +TTM1: 0.530247 + +kernel execution time: 0.192393 ms +TTM2: 0.865752 + +kernel execution time: 0.104401 ms +dense: 0.458676 + +kernel execution time: 0.058181 ms +TTM after dense: 0.641949 + +ttm-ttm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/matmul_5-5-5.tns +A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) +B1_dimension: 25, B2_dimension: 25, B3_dimension: 25, vals: 125 +C1_dimension: 25, C2_dimension: 32, vals: 800 +D1_dimension: 32, D2_dimension: 64, vals: 2048 + + +kernel execution time: 0.212263 ms +fused time: 1.00447 + +kernel execution time: 0.293174 ms +reference time: 1.00466 + +kernel execution time: 0.03429 ms +TTM1: 1.06194 + +kernel execution time: 0.227643 ms +TTM2: 0.77555 + +kernel execution time: 0.093021 ms +dense: 0.615169 + +kernel execution time: 0.111302 ms +TTM after dense: 1.19147 + +ttm-ttm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/matmul_5-5-5.tns +A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) +B1_dimension: 25, B2_dimension: 25, B3_dimension: 25, vals: 125 +C1_dimension: 25, C2_dimension: 32, vals: 800 +D1_dimension: 32, 
D2_dimension: 64, vals: 2048 + + +kernel execution time: 0.126042 ms +fused time: 0.542138 + +kernel execution time: 0.170263 ms +reference time: 0.974603 + +kernel execution time: 0.01972 ms +TTM1: 0.286434 + +kernel execution time: 0.125282 ms +TTM2: 0.402736 + +kernel execution time: 0.103582 ms +dense: 0.7661 + +kernel execution time: 0.04149 ms +TTM after dense: 0.320775 + +ttm-ttm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/matmul_5-5-5.tns +A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) +B1_dimension: 25, B2_dimension: 25, B3_dimension: 25, vals: 125 +C1_dimension: 25, C2_dimension: 32, vals: 800 +D1_dimension: 32, D2_dimension: 64, vals: 2048 + + +kernel execution time: 0.193463 ms +fused time: 0.831391 + +kernel execution time: 0.347254 ms +reference time: 1.12168 + +kernel execution time: 0.03811 ms +TTM1: 1.19729 + +kernel execution time: 0.334915 ms +TTM2: 1.14708 + +kernel execution time: 0.109681 ms +dense: 0.526707 + +kernel execution time: 0.140412 ms +TTM after dense: 0.76001 + +ttm-ttm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/matmul_5-5-5.tns +A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) +B1_dimension: 25, B2_dimension: 25, B3_dimension: 25, vals: 125 +C1_dimension: 25, C2_dimension: 32, vals: 800 +D1_dimension: 32, D2_dimension: 64, vals: 2048 + + +kernel execution time: 0.147722 ms +fused time: 0.7865 + +kernel execution time: 0.237434 ms +reference time: 1.01788 + +kernel execution time: 0.020341 ms +TTM1: 0.330005 + +kernel execution time: 0.201823 ms +TTM2: 1.01705 + +kernel execution time: 0.069931 ms +dense: 0.261943 + +kernel execution time: 0.032231 ms +TTM after dense: 0.314845 + +ttm-ttm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/matmul_5-5-5.tns +A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) +B1_dimension: 25, B2_dimension: 25, B3_dimension: 25, vals: 125 +C1_dimension: 25, C2_dimension: 
32, vals: 800 +D1_dimension: 32, D2_dimension: 64, vals: 2048 + + +kernel execution time: 0.210293 ms +fused time: 0.999243 + +kernel execution time: 0.577188 ms +reference time: 1.23453 + +kernel execution time: 0.032071 ms +TTM1: 0.965223 + +kernel execution time: 0.227183 ms +TTM2: 1.25077 + +kernel execution time: 0.091622 ms +dense: 0.449416 + +kernel execution time: 0.04494 ms +TTM after dense: 0.73161 + +ttm-ttm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/matmul_5-5-5.tns +A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) +B1_dimension: 25, B2_dimension: 25, B3_dimension: 25, vals: 125 +C1_dimension: 25, C2_dimension: 32, vals: 800 +D1_dimension: 32, D2_dimension: 64, vals: 2048 + + +kernel execution time: 0.109392 ms +fused time: 0.481746 + +kernel execution time: 0.242474 ms +reference time: 0.72963 + +kernel execution time: 0.01624 ms +TTM1: 0.257934 + +kernel execution time: 0.089982 ms +TTM2: 0.341365 + +kernel execution time: 0.106392 ms +dense: 0.74066 + +kernel execution time: 0.027241 ms +TTM after dense: 0.277864 + +ttm-ttm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/matmul_5-5-5.tns +A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) +B1_dimension: 25, B2_dimension: 25, B3_dimension: 25, vals: 125 +C1_dimension: 25, C2_dimension: 32, vals: 800 +D1_dimension: 32, D2_dimension: 64, vals: 2048 + + +ttm-ttm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/matmul_5-5-5.tns +A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) +B1_dimension: 25, B2_dimension: 25, B3_dimension: 25, vals: 125 +C1_dimension: 25, C2_dimension: 32, vals: 800 +D1_dimension: 32, D2_dimension: 64, vals: 2048 + + +kernel execution time: 0.938612 ms +fused time: 1.66032 + +kernel execution time: 0.598878 ms +reference time: 1.2444 + +kernel execution time: 0.027881 ms +TTM1: 0.664309 + +kernel execution time: 0.172162 ms +TTM2: 1.0861 + +kernel execution 
time: 0.087052 ms +dense: 0.420256 + +kernel execution time: 0.044921 ms +TTM after dense: 0.669959 + +ttm-ttm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/matmul_5-5-5.tns +A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) +B1_dimension: 25, B2_dimension: 25, B3_dimension: 25, vals: 125 +C1_dimension: 25, C2_dimension: 32, vals: 800 +D1_dimension: 32, D2_dimension: 64, vals: 2048 + + +kernel execution time: 0.723749 ms +fused time: 1.52668 + +kernel execution time: 1.33287 ms +reference time: 2.02148 + +kernel execution time: 0.03285 ms +TTM1: 1.06994 + +kernel execution time: 0.227263 ms +TTM2: 1.00641 + +kernel execution time: 0.121451 ms +dense: 0.410656 + +kernel execution time: 0.046891 ms +TTM after dense: 0.612258 + +ttm-ttm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/matmul_5-5-5.tns +A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) +B1_dimension: 25, B2_dimension: 25, B3_dimension: 25, vals: 125 +C1_dimension: 25, C2_dimension: 32, vals: 800 +D1_dimension: 32, D2_dimension: 64, vals: 2048 + + +kernel execution time: 0.654879 ms +fused time: 1.0716 + +kernel execution time: 1.24327 ms +reference time: 1.59976 + +kernel execution time: 0.691129 ms +TTM1: 1.0059 + +kernel execution time: 0.859771 ms +TTM2: 1.1516 + +kernel execution time: 0.136762 ms +dense: 0.334665 + +kernel execution time: 0.524517 ms +TTM after dense: 0.806231 + +ttm-ttm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/flickr-3d.tns +A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) +B1_dimension: 319686, B2_dimension: 28153045, B3_dimension: 319686, vals: 112890310 +C1_dimension: 1607191, C2_dimension: 32, vals: 51430112 +D1_dimension: 32, D2_dimension: 64, vals: 2048 + + +kernel execution time: 891.501 ms +fused time: 892.508 + +kernel execution time: 6378.22 ms +reference time: 6379.42 + +kernel execution time: 265.033 ms +TTM1: 265.676 + +kernel execution 
time: 514.397 ms +TTM2: 515.1 + +kernel execution time: 70.5991 ms +dense: 71.0624 + +kernel execution time: 541.878 ms +TTM after dense: 542.548 + +ttm-ttm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/vast-2015-mc1-3d.tns +A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) +B1_dimension: 165427, B2_dimension: 11374, B3_dimension: 165427, vals: 26021854 +C1_dimension: 2, C2_dimension: 32, vals: 64 +D1_dimension: 32, D2_dimension: 64, vals: 2048 + + +kernel execution time: 753.49 ms +fused time: 754.615 + +kernel execution time: 1394.55 ms +reference time: 1395.28 + +kernel execution time: 197.246 ms +TTM1: 197.894 + +kernel execution time: 503.301 ms +TTM2: 503.886 + +kernel execution time: 0.0622 ms +dense: 1.00584 + +kernel execution time: 380.931 ms +TTM after dense: 381.331 + +ttm-ttm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/flickr-3d.tns +A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) +B1_dimension: 319686, B2_dimension: 28153045, B3_dimension: 319686, vals: 112890310 +C1_dimension: 1607191, C2_dimension: 32, vals: 51430112 +D1_dimension: 32, D2_dimension: 64, vals: 2048 + + +kernel execution time: 894.532 ms +fused time: 895.512 + +kernel execution time: 6345.62 ms +reference time: 6346.77 + +kernel execution time: 266.55 ms +TTM1: 267.22 + +kernel execution time: 515.257 ms +TTM2: 515.893 + +kernel execution time: 70.7658 ms +dense: 71.2374 + +kernel execution time: 542.175 ms +TTM after dense: 542.864 + +ttm-ttm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/nell-2.tns +A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) +B1_dimension: 12092, B2_dimension: 9184, B3_dimension: 12092, vals: 76879419 +C1_dimension: 28818, C2_dimension: 32, vals: 922176 +D1_dimension: 32, D2_dimension: 64, vals: 2048 + + +kernel execution time: 49.8694 ms +fused time: 50.6512 + +ttm-ttm execution + +----------------------------------------- 
+/home/min/a/kadhitha/ispc-examples/data/tns/nell-1.tns +A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) +B1_dimension: 2902330, B2_dimension: 2143368, B3_dimension: 2902330, vals: 143599552 +C1_dimension: 25495389, C2_dimension: 32, vals: 815852448 +D1_dimension: 32, D2_dimension: 64, vals: 2048 + + +kernel execution time: 1309.77 ms +fused time: 1310.84 + +kernel execution time: 8179.4 ms +reference time: 8180.68 + +kernel execution time: 805.812 ms +TTM1: 806.562 + +kernel execution time: 314.204 ms +TTM2: 314.751 + +kernel execution time: 1134.47 ms +dense: 1134.93 + +kernel execution time: 1621.3 ms +TTM after dense: 1621.92 + +ttm-ttm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/vast-2015-mc1-3d.tns +A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) +B1_dimension: 165427, B2_dimension: 11374, B3_dimension: 165427, vals: 26021854 +C1_dimension: 2, C2_dimension: 32, vals: 64 +D1_dimension: 32, D2_dimension: 64, vals: 2048 + + +kernel execution time: 749.757 ms +fused time: 750.843 + +kernel execution time: 1391.56 ms +reference time: 1392.35 + +kernel execution time: 196.711 ms +TTM1: 197.347 + +kernel execution time: 502.61 ms +TTM2: 503.193 + +kernel execution time: 0.063271 ms +dense: 0.948892 + +kernel execution time: 381.132 ms +TTM after dense: 381.508 + +ttm-ttm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/darpa1998.tns +A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) +B1_dimension: 22476, B2_dimension: 22476, B3_dimension: 22476, vals: 28421307 +C1_dimension: 23776223, C2_dimension: 32, vals: 760839136 +D1_dimension: 32, D2_dimension: 64, vals: 2048 + + +kernel execution time: 230.973 ms +fused time: 231.921 + +ttm-ttm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/matmul_5-5-5.tns +A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) +B1_dimension: 25, B2_dimension: 25, B3_dimension: 25, vals: 125 +C1_dimension: 25, C2_dimension: 32, 
vals: 800 +D1_dimension: 32, D2_dimension: 64, vals: 2048 + + +kernel execution time: 0.72187 ms +fused time: 1.46707 + +kernel execution time: 0.842291 ms +reference time: 1.52295 + +kernel execution time: 0.490417 ms +TTM1: 1.08223 + +kernel execution time: 0.653919 ms +TTM2: 1.17803 + +kernel execution time: 0.115332 ms +dense: 0.889372 + +kernel execution time: 0.446076 ms +TTM after dense: 1.05921 + +ttm-ttm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/matmul_5-5-5.tns +A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) +B1_dimension: 25, B2_dimension: 25, B3_dimension: 25, vals: 125 +C1_dimension: 25, C2_dimension: 16, vals: 400 +D1_dimension: 16, D2_dimension: 32, vals: 512 + + +kernel execution time: 1.29819 ms +fused time: 2.11481 + +kernel execution time: 0.560877 ms +reference time: 1.26788 + +kernel execution time: 0.506967 ms +TTM1: 1.14189 + +kernel execution time: 0.547697 ms +TTM2: 1.24278 + +kernel execution time: 0.075421 ms +dense: 0.508546 + +kernel execution time: 0.464356 ms +TTM after dense: 1.09434 + +ttm-ttm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/darpa1998.tns +A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) +B1_dimension: 22476, B2_dimension: 22476, B3_dimension: 22476, vals: 28421307 +C1_dimension: 23776223, C2_dimension: 16, vals: 380419568 +D1_dimension: 16, D2_dimension: 32, vals: 512 + + +kernel execution time: 126.199 ms +fused time: 126.724 + +ttm-ttm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/darpa1998.tns +A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) +B1_dimension: 22476, B2_dimension: 22476, B3_dimension: 22476, vals: 28421307 +C1_dimension: 23776223, C2_dimension: 16, vals: 380419568 +D1_dimension: 16, D2_dimension: 32, vals: 512 + + +kernel execution time: 132.543 ms +fused time: 133.165 + +kernel execution time: 2405.44 ms +reference time: 2406.19 + +kernel execution time: 331.61 ms 
+TTM1: 332.199 + +kernel execution time: 2.26417 ms +TTM2: 3.02615 + +kernel execution time: 400.791 ms +dense: 401.064 + +kernel execution time: 620.74 ms +TTM after dense: 621.389 + +ttm-ttm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/flickr-3d.tns +A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) +B1_dimension: 319686, B2_dimension: 28153045, B3_dimension: 319686, vals: 112890310 +C1_dimension: 1607191, C2_dimension: 16, vals: 25715056 +D1_dimension: 16, D2_dimension: 32, vals: 512 + + +kernel execution time: 455.645 ms +fused time: 456.696 + +kernel execution time: 718.699 ms +reference time: 719.384 + +kernel execution time: 142.557 ms +TTM1: 143.105 + +kernel execution time: 256.179 ms +TTM2: 256.785 + +kernel execution time: 29.5586 ms +dense: 30.0451 + +kernel execution time: 269.529 ms +TTM after dense: 270.186 + +ttm-ttm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/flickr-3d.tns +A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) +B1_dimension: 319686, B2_dimension: 28153045, B3_dimension: 319686, vals: 112890310 +C1_dimension: 1607191, C2_dimension: 32, vals: 51430112 +D1_dimension: 32, D2_dimension: 64, vals: 2048 + + +kernel execution time: 890.318 ms +fused time: 891.345 + +kernel execution time: 2038.26 ms +reference time: 2038.96 + +kernel execution time: 265.076 ms +TTM1: 265.783 + +kernel execution time: 544.765 ms +TTM2: 545.423 + +kernel execution time: 70.9058 ms +dense: 71.4509 + +kernel execution time: 541.442 ms +TTM after dense: 542.115 + +ttm-ttm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/vast-2015-mc1-3d.tns +A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) +B1_dimension: 165427, B2_dimension: 11374, B3_dimension: 165427, vals: 26021854 +C1_dimension: 2, C2_dimension: 64, vals: 128 +D1_dimension: 64, D2_dimension: 64, vals: 4096 + + +kernel execution time: 902.466 ms +fused time: 903.626 + +kernel execution 
time: 1051.52 ms +reference time: 1052.27 + +kernel execution time: 385.619 ms +TTM1: 386.243 + +kernel execution time: 937.648 ms +TTM2: 938.212 + +kernel execution time: 0.067901 ms +dense: 1.00372 + +kernel execution time: 380.193 ms +TTM after dense: 380.613 + +ttm-ttm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/vast-2015-mc1-3d.tns +A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) +B1_dimension: 165427, B2_dimension: 11374, B3_dimension: 165427, vals: 26021854 +C1_dimension: 2, C2_dimension: 64, vals: 128 +D1_dimension: 64, D2_dimension: 128, vals: 8192 + + +ttm-ttm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/vast-2015-mc1-3d.tns +A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) +B1_dimension: 165427, B2_dimension: 11374, B3_dimension: 165427, vals: 26021854 +C1_dimension: 2, C2_dimension: 64, vals: 128 +D1_dimension: 64, D2_dimension: 64, vals: 4096 + + +kernel execution time: 898.295 ms +fused time: 899.297 + +kernel execution time: 1037.66 ms +reference time: 1038.39 + +kernel execution time: 385.768 ms +TTM1: 386.452 + +kernel execution time: 939.137 ms +TTM2: 939.74 + +kernel execution time: 0.073171 ms +dense: 1.20129 + +kernel execution time: 383.479 ms +TTM after dense: 384.01 + +ttm-ttm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/flickr-3d.tns +A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) +B1_dimension: 319686, B2_dimension: 28153045, B3_dimension: 319686, vals: 112890310 +C1_dimension: 1607191, C2_dimension: 64, vals: 102860224 +D1_dimension: 64, D2_dimension: 64, vals: 4096 + + +kernel execution time: 1034.06 ms +fused time: 1035.05 + +kernel execution time: 4275.39 ms +reference time: 4276.62 + +kernel execution time: 516.765 ms +TTM1: 517.518 + +kernel execution time: 1048.69 ms +TTM2: 1049.32 + +kernel execution time: 119.233 ms +dense: 119.711 + +kernel execution time: 546.744 ms +TTM after dense: 547.412 + 
+ttm-ttm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/flickr-3d.tns +A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) +B1_dimension: 319686, B2_dimension: 28153045, B3_dimension: 319686, vals: 112890310 +C1_dimension: 1607191, C2_dimension: 32, vals: 51430112 +D1_dimension: 32, D2_dimension: 64, vals: 2048 + + +kernel execution time: 894.088 ms +fused time: 895.234 + +kernel execution time: 2025.29 ms +reference time: 2025.92 + +kernel execution time: 264.446 ms +TTM1: 265.069 + +kernel execution time: 541.153 ms +TTM2: 541.71 + +kernel execution time: 70.7936 ms +dense: 71.2153 + +kernel execution time: 542.474 ms +TTM after dense: 543.104 + +ttm-ttm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/flickr-3d.tns +A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) +B1_dimension: 319686, B2_dimension: 28153045, B3_dimension: 319686, vals: 112890310 +C1_dimension: 1607191, C2_dimension: 16, vals: 25715056 +D1_dimension: 16, D2_dimension: 64, vals: 1024 + + +kernel execution time: 871.496 ms +fused time: 872.523 + +kernel execution time: 1340.14 ms +reference time: 1340.84 + +kernel execution time: 143.439 ms +TTM1: 143.995 + +kernel execution time: 459.09 ms +TTM2: 459.668 + +kernel execution time: 51.7433 ms +dense: 52.1957 + +kernel execution time: 545.092 ms +TTM after dense: 545.899 + +ttm-ttm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/flickr-3d.tns +A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) +B1_dimension: 319686, B2_dimension: 28153045, B3_dimension: 319686, vals: 112890310 +C1_dimension: 1607191, C2_dimension: 32, vals: 51430112 +D1_dimension: 32, D2_dimension: 64, vals: 2048 + + +kernel execution time: 893.815 ms +fused time: 894.866 + +kernel execution time: 2016.15 ms +reference time: 2016.8 + +kernel execution time: 266.599 ms +TTM1: 267.18 + +kernel execution time: 544.015 ms +TTM2: 544.597 + +kernel execution time: 70.7604 
ms +dense: 71.1854 + +kernel execution time: 543.212 ms +TTM after dense: 543.879 + +ttm-ttm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/nell-2.tns +A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) +B1_dimension: 12092, B2_dimension: 9184, B3_dimension: 12092, vals: 76879419 +C1_dimension: 28818, C2_dimension: 32, vals: 922176 +D1_dimension: 32, D2_dimension: 64, vals: 2048 + + +kernel execution time: 47.6087 ms +fused time: 48.0666 + +kernel execution time: 2381.79 ms +reference time: 2382.51 + +kernel execution time: 85.3431 ms +TTM1: 86.158 + +kernel execution time: 8.56212 ms +TTM2: 9.19594 + +kernel execution time: 1.27998 ms +dense: 1.66095 + +kernel execution time: 185.324 ms +TTM after dense: 185.729 + +ttm-ttm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/nell-1.tns +A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) +B1_dimension: 2902330, B2_dimension: 2143368, B3_dimension: 2902330, vals: 143599552 +C1_dimension: 25495389, C2_dimension: 32, vals: 815852448 +D1_dimension: 32, D2_dimension: 64, vals: 2048 + + +kernel execution time: 1312.78 ms +fused time: 1313.78 + +kernel execution time: 3548.92 ms +reference time: 3550.02 + +kernel execution time: 794.193 ms +TTM1: 794.835 + +kernel execution time: 371.233 ms +TTM2: 371.853 + +kernel execution time: 1136.25 ms +dense: 1136.73 + +kernel execution time: 1608.81 ms +TTM after dense: 1609.49 + +ttm-ttm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/vast-2015-mc1-3d.tns +A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) +B1_dimension: 165427, B2_dimension: 11374, B3_dimension: 165427, vals: 26021854 +C1_dimension: 2, C2_dimension: 32, vals: 64 +D1_dimension: 32, D2_dimension: 64, vals: 2048 + + +kernel execution time: 749.836 ms +fused time: 750.93 + +kernel execution time: 566.457 ms +reference time: 567.141 + +kernel execution time: 197.095 ms +TTM1: 197.696 + +kernel execution 
time: 503.839 ms +TTM2: 504.407 + +kernel execution time: 0.05955 ms +dense: 0.911152 + +kernel execution time: 382.185 ms +TTM after dense: 382.591 + +ttm-ttm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/darpa1998.tns +A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) +B1_dimension: 22476, B2_dimension: 22476, B3_dimension: 22476, vals: 28421307 +C1_dimension: 23776223, C2_dimension: 32, vals: 760839136 +D1_dimension: 32, D2_dimension: 64, vals: 2048 + + +kernel execution time: 226.079 ms +fused time: 227.028 + +kernel execution time: 8763.95 ms +reference time: 8765.15 + +kernel execution time: 605.807 ms +TTM1: 606.7 + +kernel execution time: 5.27951 ms +TTM2: 5.94312 + +kernel execution time: 1075.36 ms +dense: 1075.63 + +kernel execution time: 1244.1 ms +TTM after dense: 1244.76 + +ttm-ttm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/freebase_music.tns +A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) +B1_dimension: 23343790, B2_dimension: 23344784, B3_dimension: 23343790, vals: 99546550 +C1_dimension: 166, C2_dimension: 32, vals: 5312 +D1_dimension: 32, D2_dimension: 64, vals: 2048 + + +ttm-ttm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/delicious-3d.tns +A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) +B1_dimension: 958893, B2_dimension: 55584242, B3_dimension: 958893, vals: 140126164 +C1_dimension: 2480308, C2_dimension: 32, vals: 79369856 +D1_dimension: 32, D2_dimension: 64, vals: 2048 + + +ttm-ttm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/flickr-3d.tns +A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) +B1_dimension: 319686, B2_dimension: 28153045, B3_dimension: 319686, vals: 112890310 +C1_dimension: 1607191, C2_dimension: 32, vals: 51430112 +D1_dimension: 32, D2_dimension: 64, vals: 2048 + + +kernel execution time: 14896.3 ms +fused time: 14897.5 + +kernel execution time: 
94041.2 ms +reference time: 94042.2 + +kernel execution time: 3578.66 ms +TTM1: 3579.61 + +kernel execution time: 18883.5 ms +TTM2: 18884.5 + +kernel execution time: 2197.87 ms +dense: 2198.28 + +kernel execution time: 7686.45 ms +TTM after dense: 7687.46 + +ttm-ttm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/nell-2.tns +A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) +B1_dimension: 12092, B2_dimension: 9184, B3_dimension: 12092, vals: 76879419 +C1_dimension: 28818, C2_dimension: 32, vals: 922176 +D1_dimension: 32, D2_dimension: 64, vals: 2048 + + +kernel execution time: 1072.87 ms +fused time: 1073.82 + +kernel execution time: 71021.8 ms +reference time: 71022.9 + +kernel execution time: 1996.05 ms +TTM1: 1996.58 + +kernel execution time: 231.665 ms +TTM2: 232.177 + +kernel execution time: 40.2369 ms +dense: 40.6304 + +kernel execution time: 4971.71 ms +TTM after dense: 4972.6 + +ttm-ttm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/nell-1.tns +A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) +B1_dimension: 2902330, B2_dimension: 2143368, B3_dimension: 2902330, vals: 143599552 +C1_dimension: 25495389, C2_dimension: 32, vals: 815852448 +D1_dimension: 32, D2_dimension: 64, vals: 2048 + + +kernel execution time: 29074.9 ms +fused time: 29076 + +kernel execution time: 148072 ms +reference time: 148073 + +kernel execution time: 13571.2 ms +TTM1: 13572.2 + +kernel execution time: 11698.5 ms +TTM2: 11699.5 + +kernel execution time: 34736.9 ms +dense: 34737.7 + +kernel execution time: 22283.6 ms +TTM after dense: 22284.5 + +ttm-ttm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/vast-2015-mc1-3d.tns +A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) +B1_dimension: 165427, B2_dimension: 11374, B3_dimension: 165427, vals: 26021854 +C1_dimension: 2, C2_dimension: 32, vals: 64 +D1_dimension: 32, D2_dimension: 64, vals: 2048 + + +kernel execution 
time: 12513.9 ms +fused time: 12515 + +kernel execution time: 23535.3 ms +reference time: 23536.3 + +kernel execution time: 1334.33 ms +TTM1: 1334.87 + +kernel execution time: 17560.3 ms +TTM2: 17561.3 + +kernel execution time: 0.019291 ms +dense: 0.885501 + +kernel execution time: 3394.59 ms +TTM after dense: 3395.34 + +ttm-ttm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/darpa1998.tns +A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) +B1_dimension: 22476, B2_dimension: 22476, B3_dimension: 22476, vals: 28421307 +C1_dimension: 23776223, C2_dimension: 32, vals: 760839136 +D1_dimension: 32, D2_dimension: 64, vals: 2048 + + +kernel execution time: 1517.3 ms +fused time: 1518.25 + +kernel execution time: 45929.9 ms +reference time: 45930.9 + +kernel execution time: 2929.29 ms +TTM1: 2929.82 + +kernel execution time: 53.4282 ms +TTM2: 53.9625 + +kernel execution time: 32592.7 ms +dense: 32593.5 + +kernel execution time: 6277.64 ms +TTM after dense: 6278.68 + +ttm-ttm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/matmul_5-5-5.tns +A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) +B1_dimension: 25, B2_dimension: 25, B3_dimension: 25, vals: 125 +C1_dimension: 25, C2_dimension: 32, vals: 800 +D1_dimension: 32, D2_dimension: 64, vals: 2048 + + +kernel execution time: 0.852321 ms +fused time: 1.60101 + +kernel execution time: 0.662379 ms +reference time: 1.32203 + +kernel execution time: 0.511427 ms +TTM1: 1.03372 + +kernel execution time: 0.667709 ms +TTM2: 1.20996 + +kernel execution time: 0.118331 ms +dense: 0.542977 + +kernel execution time: 0.483187 ms +TTM after dense: 0.900252 + +ttm-ttm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/matmul_5-5-5.tns +A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) +B1_dimension: 25, B2_dimension: 25, B3_dimension: 25, vals: 125 +C1_dimension: 25, C2_dimension: 32, vals: 800 +D1_dimension: 32, D2_dimension: 
64, vals: 2048 + + +kernel execution time: 0.671739 ms +fused time: 4.90845 + +kernel execution time: 0.711039 ms +reference time: 5.04208 + +kernel execution time: 0.486907 ms +reference new time: 4.37081 + +kernel execution time: 0.482627 ms +TTM1: 3.67761 + +kernel execution time: 0.589078 ms +TTM2: 4.27397 + +kernel execution time: 0.095461 ms +dense: 0.492616 + +kernel execution time: 0.530937 ms +TTM after dense: 1.0284 + +ttm-ttm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/flickr-3d.tns +A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) +B1_dimension: 319686, B2_dimension: 28153045, B3_dimension: 319686, vals: 112890310 +C1_dimension: 1607191, C2_dimension: 32, vals: 51430112 +D1_dimension: 32, D2_dimension: 64, vals: 2048 + + +kernel execution time: 881.367 ms +fused time: 886.111 + +reference impl time + +kernel execution time: 2050.43 ms +reference time: 2051.08 + +kernel execution time: 2002.9 ms +reference new time: 2003.54 + +kernel execution time: 260.701 ms +TTM1: 261.277 + +kernel execution time: 539.892 ms +TTM2: 540.489 + +kernel execution time: 69.5675 ms +dense: 70.0315 + +kernel execution time: 531.744 ms +TTM after dense: 532.375 +/home/min/a/kadhitha/ispc-examples/data/tns/nell-2.tns +A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) +B1_dimension: 12092, B2_dimension: 9184, B3_dimension: 12092, vals: 76879419 +C1_dimension: 28818, C2_dimension: 32, vals: 922176 +D1_dimension: 32, D2_dimension: 64, vals: 2048 + + +kernel execution time: 46.1273 ms +fused time: 50.9231 + +reference impl time + +kernel execution time: 2363.18 ms +reference time: 2364.02 + +kernel execution time: 2340.56 ms +reference new time: 2341.2 + +kernel execution time: 82.5312 ms +TTM1: 83.1034 + +kernel execution time: 8.62143 ms +TTM2: 9.16734 + +kernel execution time: 1.20538 ms +dense: 1.48454 + +kernel execution time: 181.488 ms +TTM after dense: 181.827 + +ttm-ttm execution + +----------------------------------------- + +file: 
/home/min/a/kadhitha/ispc-examples/data/tns/flickr-3d.tns +---------------------------------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/flickr-3d.tns +A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) +B1_dimension: 319686, B2_dimension: 28153045, B3_dimension: 319686, vals: 112890310 +C1_dimension: 1607191, C2_dimension: 32, vals: 51430112 +D1_dimension: 32, D2_dimension: 64, vals: 2048 + + +kernel execution time: 874.724 ms +fused time: 878.246 + +reference impl time + +kernel execution time: 2042.51 ms +reference time: 2043.27 + +kernel execution time: 46819.7 ms +reference new time: 46820.8 + +schedule 1 + +kernel execution time: 260.841 ms +TTM1: 261.378 + +kernel execution time: 539.264 ms +TTM2: 539.834 + +schedule 2 + +kernel execution time: 69.2965 ms +dense: 69.7197 + +kernel execution time: 532.774 ms +TTM after dense: 535.64 + +file: /home/min/a/kadhitha/ispc-examples/data/tns/nell-2.tns +---------------------------------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/nell-2.tns +A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) +B1_dimension: 12092, B2_dimension: 9184, B3_dimension: 12092, vals: 76879419 +C1_dimension: 28818, C2_dimension: 32, vals: 922176 +D1_dimension: 32, D2_dimension: 64, vals: 2048 + + +kernel execution time: 51.3316 ms +fused time: 55.9685 + +reference impl time + +kernel execution time: 2363.6 ms +reference time: 2364.38 + +kernel execution time: 31523.9 ms +reference new time: 31525 + +schedule 1 + +kernel execution time: 84.4692 ms +TTM1: 84.9774 + +kernel execution time: 7.9451 ms +TTM2: 8.49167 + +schedule 2 + +kernel execution time: 1.17918 ms +dense: 1.49638 + +ttm-ttm execution + +----------------------------------------- + +file: /home/min/a/kadhitha/ispc-examples/data/tns/flickr-3d.tns +---------------------------------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/flickr-3d.tns +A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) +B1_dimension: 319686, B2_dimension: 
28153045, B3_dimension: 319686, vals: 112890310 +C1_dimension: 1607191, C2_dimension: 32, vals: 51430112 +D1_dimension: 32, D2_dimension: 64, vals: 2048 + + +kernel execution time: 877.727 ms +fused time: 881.892 + +reference impl time + +kernel execution time: 1998.47 ms +reference time: 1999.14 + +kernel execution time: 1818.14 ms +reference new time: 1818.77 + +schedule 1 + +kernel execution time: 261.202 ms +TTM1: 261.759 + +kernel execution time: 539.615 ms +TTM2: 540.183 + +schedule 2 + +kernel execution time: 69.7746 ms +dense: 70.1943 + +kernel execution time: 532.374 ms +TTM after dense: 533.008 + +file: /home/min/a/kadhitha/ispc-examples/data/tns/nell-2.tns +---------------------------------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/nell-2.tns +A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) +B1_dimension: 12092, B2_dimension: 9184, B3_dimension: 12092, vals: 76879419 +C1_dimension: 28818, C2_dimension: 32, vals: 922176 +D1_dimension: 32, D2_dimension: 64, vals: 2048 + + +kernel execution time: 42.811 ms +fused time: 47.6618 + +reference impl time + +kernel execution time: 2267.84 ms +reference time: 2268.63 + +kernel execution time: 1379.49 ms +reference new time: 1380.15 + +schedule 1 + +kernel execution time: 81.6849 ms +TTM1: 82.4365 + +kernel execution time: 9.74645 ms +TTM2: 10.2848 + +schedule 2 + +kernel execution time: 1.47367 ms +dense: 1.78443 + +kernel execution time: 208.263 ms +TTM after dense: 210.169 + +file: /home/min/a/kadhitha/ispc-examples/data/tns/nell-1.tns +---------------------------------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/nell-1.tns +A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) +B1_dimension: 2902330, B2_dimension: 2143368, B3_dimension: 2902330, vals: 143599552 +C1_dimension: 25495389, C2_dimension: 32, vals: 815852448 +D1_dimension: 32, D2_dimension: 64, vals: 2048 + + +kernel execution time: 1299.91 ms +fused time: 1303.65 + +reference impl time + +kernel execution 
time: 3494.78 ms +reference time: 3497.66 + +kernel execution time: 2383.79 ms +reference new time: 2384.52 + +schedule 1 + +kernel execution time: 774.869 ms +TTM1: 775.571 + +kernel execution time: 1488.64 ms +TTM2: 1489.78 + +schedule 2 + +kernel execution time: 1121.66 ms +dense: 1122.11 + +kernel execution time: 1581.94 ms +TTM after dense: 1582.61 + +file: /home/min/a/kadhitha/ispc-examples/data/tns/vast-2015-mc1-3d.tns +---------------------------------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/vast-2015-mc1-3d.tns +A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) +B1_dimension: 165427, B2_dimension: 11374, B3_dimension: 165427, vals: 26021854 +C1_dimension: 2, C2_dimension: 32, vals: 64 +D1_dimension: 32, D2_dimension: 64, vals: 2048 + + +ttm-ttm execution + +----------------------------------------- + +file: /home/min/a/kadhitha/ispc-examples/data/tns/vast-2015-mc1-3d.tns +---------------------------------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/vast-2015-mc1-3d.tns +A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) +B1_dimension: 165427, B2_dimension: 11374, B3_dimension: 165427, vals: 26021854 +C1_dimension: 2, C2_dimension: 32, vals: 64 +D1_dimension: 32, D2_dimension: 64, vals: 2048 + + +kernel execution time: 746.344 ms +fused time: 749.212 + +reference impl time + +kernel execution time: 548.763 ms +reference time: 549.493 + +kernel execution time: 737.768 ms +reference new time: 738.436 + +schedule 1 + +kernel execution time: 195.639 ms +TTM1: 196.286 + +kernel execution time: 493.569 ms +TTM2: 494.15 + +schedule 2 + +kernel execution time: 0.052551 ms +dense: 0.648739 + +kernel execution time: 374.407 ms +TTM after dense: 376.248 + +file: /home/min/a/kadhitha/ispc-examples/data/tns/darpa1998.tns +---------------------------------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/darpa1998.tns +A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) +B1_dimension: 22476, B2_dimension: 22476, 
B3_dimension: 22476, vals: 28421307 +C1_dimension: 23776223, C2_dimension: 32, vals: 760839136 +D1_dimension: 32, D2_dimension: 64, vals: 2048 + + +kernel execution time: 221.905 ms +fused time: 222.964 + +reference impl time + +kernel execution time: 8826.57 ms +reference time: 8827.82 + +kernel execution time: 1435.28 ms +reference new time: 1437.65 + +schedule 1 + +kernel execution time: 574.934 ms +TTM1: 576.159 + +kernel execution time: 4.42254 ms +TTM2: 5.12181 + +schedule 2 + +kernel execution time: 1041.05 ms +dense: 1041.36 + +kernel execution time: 1247.06 ms +TTM after dense: 1247.76 + +ttm-ttm execution + +----------------------------------------- + +file: /home/min/a/kadhitha/ispc-examples/data/tns/nell-1.tns +---------------------------------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/nell-1.tns +A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) +B1_dimension: 2902330, B2_dimension: 2143368, B3_dimension: 2902330, vals: 143599552 +C1_dimension: 25495389, C2_dimension: 32, vals: 815852448 +D1_dimension: 32, D2_dimension: 64, vals: 2048 + + +kernel execution time: 1312.2 ms +fused time: 1315.79 + +reference impl time + +kernel execution time: 3512.84 ms +reference time: 3514.54 + +kernel execution time: 2381.97 ms +reference new time: 2382.6 + +schedule 1 + +kernel execution time: 779.205 ms +TTM1: 779.794 + +kernel execution time: 366.382 ms +TTM2: 367.081 + +schedule 2 + +kernel execution time: 1127.72 ms +dense: 1128.25 + +kernel execution time: 1579.85 ms +TTM after dense: 1580.5 + +ttm-ttm execution + +----------------------------------------- + +file: /home/min/a/kadhitha/ispc-examples/data/tns/nell-1.tns +---------------------------------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/nell-1.tns +A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) +B1_dimension: 2902330, B2_dimension: 2143368, B3_dimension: 2902330, vals: 143599552 +C1_dimension: 25495389, C2_dimension: 32, vals: 815852448 +D1_dimension: 32, 
D2_dimension: 64, vals: 2048 + + +kernel execution time: 1326.91 ms +fused time: 1331.56 + +reference impl time + +kernel execution time: 3535.03 ms +reference time: 3536.38 + +kernel execution time: 2387.24 ms +reference new time: 2387.99 + +schedule 1 + +kernel execution time: 780.495 ms +TTM1: 781.09 + +kernel execution time: 369.704 ms +TTM2: 370.292 + +schedule 2 + +kernel execution time: 1119.23 ms +dense: 1119.7 + +kernel execution time: 1579.78 ms +TTM after dense: 1580.54 + +ttm-ttm execution + +----------------------------------------- + +file: /home/min/a/kadhitha/ispc-examples/data/tns/vast-2015-mc1-3d.tns +---------------------------------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/vast-2015-mc1-3d.tns +A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) +B1_dimension: 165427, B2_dimension: 11374, B3_dimension: 165427, vals: 26021854 +C1_dimension: 2, C2_dimension: 64, vals: 128 +D1_dimension: 64, D2_dimension: 128, vals: 8192 + + +ttm-ttm execution + +----------------------------------------- + +file: /home/min/a/kadhitha/ispc-examples/data/tns/vast-2015-mc1-3d.tns +---------------------------------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/vast-2015-mc1-3d.tns +A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) +B1_dimension: 165427, B2_dimension: 11374, B3_dimension: 165427, vals: 26021854 +C1_dimension: 2, C2_dimension: 32, vals: 64 +D1_dimension: 32, D2_dimension: 64, vals: 2048 + + +kernel execution time: 746.399 ms +fused time: 747.454 + +reference impl time + +kernel execution time: 549.908 ms +reference time: 550.683 + +kernel execution time: 731.657 ms +reference new time: 732.322 + +schedule 1 + +kernel execution time: 194.605 ms +TTM1: 195.252 + +kernel execution time: 491.591 ms +TTM2: 492.148 + +schedule 2 + +kernel execution time: 0.049841 ms +dense: 0.820181 + +kernel execution time: 372.064 ms +TTM after dense: 372.449 + +ttm-ttm execution + +----------------------------------------- + +file: 
/home/min/a/kadhitha/ispc-examples/data/tns/vast-2015-mc1-3d.tns +---------------------------------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/vast-2015-mc1-3d.tns +A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) +B1_dimension: 165427, B2_dimension: 11374, B3_dimension: 165427, vals: 26021854 +C1_dimension: 2, C2_dimension: 32, vals: 64 +D1_dimension: 32, D2_dimension: 64, vals: 2048 + + +kernel execution time: 746.043 ms +fused time: 747.23 + +reference impl time + +kernel execution time: 561.015 ms +reference time: 561.669 + +kernel execution time: 737.535 ms +reference new time: 738.158 + +schedule 1 + +kernel execution time: 194.638 ms +TTM1: 195.169 + +kernel execution time: 495.355 ms +TTM2: 495.903 + +schedule 2 + +kernel execution time: 0.148292 ms +dense: 0.534998 + +kernel execution time: 374.231 ms +TTM after dense: 374.667 + +ttm-ttm execution + +----------------------------------------- + +file: /home/min/a/kadhitha/ispc-examples/data/tns/vast-2015-mc1-3d.tns +---------------------------------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/tns/vast-2015-mc1-3d.tns +A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) +B1_dimension: 165427, B2_dimension: 11374, B3_dimension: 165427, vals: 26021854 +C1_dimension: 2, C2_dimension: 32, vals: 64 +D1_dimension: 32, D2_dimension: 64, vals: 2048 + + +kernel execution time: 745.881 ms +fused time: 746.992 + +reference impl time + +kernel execution time: 551.705 ms +reference time: 552.359 + +kernel execution time: 736.019 ms +reference new time: 736.611 + +schedule 1 + +kernel execution time: 194.777 ms +TTM1: 195.33 + +kernel execution time: 491.151 ms +TTM2: 491.732 + +schedule 2 + +kernel execution time: 0.144522 ms +dense: 0.528597 + +kernel execution time: 374.363 ms +TTM after dense: 374.752 + +ttm-ttm execution + +----------------------------------------- + +file: /home/min/a/kadhitha/ispc-examples/data/tns/vast-2015-mc1-3d.tns 
+---------------------------------------------------------------- + +ttm-ttm execution + +----------------------------------------- Europa + +file: /home/min/a/kadhitha/workspace/my_taco/tns/darpa1998.tns +---------------------------------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/tns/darpa1998.tns +A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) +B1_dimension: 22476, B2_dimension: 22476, B3_dimension: 22476, vals: 28421307 +C1_dimension: 23776223, C2_dimension: 32, vals: 760839136 +D1_dimension: 32, D2_dimension: 64, vals: 2048 + + +kernel execution time: 2299.49 ms +fused time: 2301.59 + +reference impl time + +kernel execution time: 78844.2 ms +reference time: 78846.6 + +kernel execution time: 34427 ms +reference new time: 34429.3 + +schedule 1 + +kernel execution time: 6968.36 ms +TTM1: 6970.4 + +kernel execution time: 121.497 ms +TTM2: 123.127 + +schedule 2 + +kernel execution time: 64026.1 ms +dense: 64028 + +kernel execution time: 15531.3 ms +TTM after dense: 15533.4 + +ttm-ttm execution + +----------------------------------------- + +file: /home/min/a/kadhitha/ispc-examples/data/tns/vast-2015-mc1-3d.tns +---------------------------------------------------------------- + +ttm-ttm execution + +----------------------------------------- + +file: /home/min/a/kadhitha/workspace/my_taco/tns/vast-2015-mc1-3d.tns +---------------------------------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/tns/vast-2015-mc1-3d.tns +A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) +B1_dimension: 165427, B2_dimension: 11374, B3_dimension: 165427, vals: 26021854 +C1_dimension: 2, C2_dimension: 32, vals: 64 +D1_dimension: 32, D2_dimension: 64, vals: 2048 + + +kernel execution time: 40017.6 ms +fused time: 40019.4 + +reference impl time + +kernel execution time: 50710.4 ms +reference time: 50712.8 + +kernel execution time: 37978.8 ms +reference new time: 37980.6 + +schedule 1 + +kernel execution time: 3848.85 ms +TTM1: 3850.48 + +ttm-ttm execution + 
+----------------------------------------- + +file: /home/min/a/kadhitha/workspace/my_taco/tns/vast-2015-mc1-3d.tns +---------------------------------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/tns/vast-2015-mc1-3d.tns +A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) +B1_dimension: 165427, B2_dimension: 11374, B3_dimension: 165427, vals: 26021854 +C1_dimension: 2, C2_dimension: 32, vals: 64 +D1_dimension: 32, D2_dimension: 64, vals: 2048 + + +kernel execution time: 40277.5 ms +fused time: 40279.9 + +reference impl time + +kernel execution time: 50449.4 ms +reference time: 50452 + +kernel execution time: 37881.2 ms +reference new time: 37883.4 + +schedule 1 + +kernel execution time: 3987.96 ms +TTM1: 3990.09 + +kernel execution time: 40935.3 ms +TTM2: 40937.4 + +schedule 2 + +kernel execution time: 0.098195 ms +dense: 1.2874 + +kernel execution time: 12037.9 ms +TTM after dense: 12039.5 + +ttm-ttm execution + +----------------------------------------- + +file: /home/min/a/kadhitha/workspace/my_taco/tns/vast-2015-mc1-3d.tns +---------------------------------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/tns/vast-2015-mc1-3d.tns +A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) +B1_dimension: 165427, B2_dimension: 11374, B3_dimension: 2, vals: 26021854 +C1_dimension: 2, C2_dimension: 32, vals: 64 +D1_dimension: 32, D2_dimension: 64, vals: 2048 + + +kernel execution time: 36918.5 ms +fused time: 36920.9 + +reference impl time + +kernel execution time: 47892.3 ms +reference time: 47894.8 + +kernel execution time: 37901.4 ms +reference new time: 37903.5 + +schedule 1 + +kernel execution time: 3801.16 ms +TTM1: 3803.21 + +kernel execution time: 43488.6 ms +TTM2: 43490.6 + +schedule 2 + +kernel execution time: 0.060642 ms +dense: 1.08588 + +kernel execution time: 15190.9 ms +TTM after dense: 15192.3 + +ttm-ttm execution + +----------------------------------------- + +file: /home/min/a/kadhitha/workspace/my_taco/tns/vast-2015-mc1-3d.tns + 
+ttm-ttm execution + +----------------------------------------- + +file: /home/min/a/kadhitha/workspace/my_taco/tns/vast-2015-mc1-3d.tns +---------------------------------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/tns/vast-2015-mc1-3d.tns +A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) +B1_dimension: 165427, B2_dimension: 11374, B3_dimension: 2, vals: 26021854 +C1_dimension: 2, C2_dimension: 32, vals: 64 +D1_dimension: 32, D2_dimension: 64, vals: 2048 + + +kernel execution time: 35130 ms +fused time: 35133.9 + +reference impl time + +kernel execution time: 47634.1 ms +reference time: 47636.7 + +kernel execution time: 37616.7 ms +reference new time: 37618.9 + +schedule 1 + +kernel execution time: 2930.06 ms +TTM1: 2931.74 + +kernel execution time: 40710.7 ms +TTM2: 40713 + +schedule 2 + +kernel execution time: 0.07506 ms +dense: 1.28501 + +kernel execution time: 12393.3 ms +TTM after dense: 12394.9 + +ttm-ttm execution + +----------------------------------------- + +file: /home/min/a/kadhitha/workspace/my_taco/tns/vast-2015-mc1-3d.tns +---------------------------------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/tns/vast-2015-mc1-3d.tns +A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) +B1_dimension: 165427, B2_dimension: 11374, B3_dimension: 2, vals: 26021854 +C1_dimension: 2, C2_dimension: 32, vals: 64 +D1_dimension: 32, D2_dimension: 64, vals: 2048 + + +kernel execution time: 12528.5 ms +fused time: 12529.7 + +reference impl time + +kernel execution time: 23576.9 ms +reference time: 23578.1 + +kernel execution time: 16282.8 ms +reference new time: 16283.8 + +schedule 1 + +kernel execution time: 1332.64 ms +TTM1: 1333.18 + +kernel execution time: 17503.1 ms +TTM2: 17504.2 + +schedule 2 + +kernel execution time: 0.025131 ms +dense: 0.438566 + +kernel execution time: 3369.58 ms +TTM after dense: 3370.48 + +ttm-ttm execution + +----------------------------------------- + +file: 
/home/min/a/kadhitha/workspace/my_taco/tns/vast-2015-mc1-3d.tns +---------------------------------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/tns/vast-2015-mc1-3d.tns +A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) +B1_dimension: 165427, B2_dimension: 11374, B3_dimension: 2, vals: 26021854 +C1_dimension: 2, C2_dimension: 32, vals: 64 +D1_dimension: 32, D2_dimension: 64, vals: 2048 + + +kernel execution time: 12698.5 ms +fused time: 12699.7 + +reference impl time + +kernel execution time: 23669.6 ms +reference time: 23670.8 + +kernel execution time: 16390.1 ms +reference new time: 16391.1 + +schedule 1 + +kernel execution time: 1343.9 ms +TTM1: 1344.42 + +kernel execution time: 17641.6 ms +TTM2: 17642.6 + +schedule 2 + +kernel execution time: 0.02212 ms +dense: 0.397656 + +kernel execution time: 3411.14 ms +TTM after dense: 3412.04 + +ttm-ttm execution + +----------------------------------------- + +file: /home/min/a/kadhitha/ispc-examples/data/tns/delicious-3d.tns + +ttm-ttm execution + +----------------------------------------- + +file: /home/min/a/kadhitha/workspace/my_taco/tns/vast-2015-mc1-3d.tns + +ttm-ttm execution + +----------------------------------------- + +file: /home/min/a/kadhitha/workspace/my_taco/tns/vast-2015-mc1-3d.tns +---------------------------------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/tns/vast-2015-mc1-3d.tns +A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) +B1_dimension: 165427, B2_dimension: 11374, B3_dimension: 2, vals: 26021854 +C1_dimension: 2, C2_dimension: 32, vals: 64 +D1_dimension: 32, D2_dimension: 64, vals: 2048 + + +kernel execution time: 844.466 ms +fused time: 845.618 + +reference impl time + +kernel execution time: 814.964 ms +reference time: 815.676 + +kernel execution time: 918.472 ms +reference new time: 919.142 + +schedule 1 + +kernel execution time: 200.521 ms +TTM1: 201.112 + +kernel execution time: 678.038 ms +TTM2: 678.647 + +schedule 2 + +kernel execution time: 0.07066 ms 
+dense: 0.524547 + +kernel execution time: 394.81 ms +TTM after dense: 395.266 + +ttm-ttm execution + +----------------------------------------- + +file: /home/min/a/kadhitha/workspace/my_taco/tns/vast-2015-mc1-3d.tns +---------------------------------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/tns/vast-2015-mc1-3d.tns +A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) +B1_dimension: 165427, B2_dimension: 11374, B3_dimension: 2, vals: 26021854 +C1_dimension: 2, C2_dimension: 32, vals: 64 +D1_dimension: 32, D2_dimension: 64, vals: 2048 + + +kernel execution time: 2900.7 ms +fused time: 2903.25 + +reference impl time + +kernel execution time: 2746.32 ms +reference time: 2748.86 + +kernel execution time: 2812.87 ms +reference new time: 2815.19 + +schedule 1 + +kernel execution time: 2429.09 ms +TTM1: 2431.17 + +kernel execution time: 2451.88 ms +TTM2: 2454.06 + +schedule 2 + +kernel execution time: 1.43373 ms +dense: 2.85191 + +kernel execution time: 1651.7 ms +TTM after dense: 1652.91 + +ttm-ttm execution + +----------------------------------------- + +file: /home/min/a/kadhitha/workspace/my_taco/tns/vast-2015-mc1-3d.tns +---------------------------------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/tns/vast-2015-mc1-3d.tns +A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) +B1_dimension: 165427, B2_dimension: 11374, B3_dimension: 2, vals: 26021854 +C1_dimension: 2, C2_dimension: 32, vals: 64 +D1_dimension: 32, D2_dimension: 64, vals: 2048 + + +kernel execution time: 3539.09 ms +fused time: 3541.54 + +reference impl time + +kernel execution time: 2968.95 ms +reference time: 2972.61 + +kernel execution time: 3354.98 ms +reference new time: 3357.43 + +schedule 1 + +kernel execution time: 2697.68 ms +TTM1: 2699.71 + +kernel execution time: 2804.11 ms +TTM2: 2806.99 + +schedule 2 + +kernel execution time: 6.38211 ms +dense: 8.06652 + +kernel execution time: 1822.02 ms +TTM after dense: 1823.06 + +ttm-ttm execution + 
+----------------------------------------- + +file: /home/min/a/kadhitha/workspace/my_taco/tns/vast-2015-mc1-3d.tns + +ttm-ttm execution + +----------------------------------------- + +file: /home/min/a/kadhitha/workspace/my_taco/tns/vast-2015-mc1-3d.tns +---------------------------------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/tns/vast-2015-mc1-3d.tns +A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) +B1_dimension: 165427, B2_dimension: 11374, B3_dimension: 2, vals: 26021854 +C1_dimension: 2, C2_dimension: 32, vals: 64 +D1_dimension: 32, D2_dimension: 64, vals: 2048 + + +kernel execution time: 3608.92 ms +fused time: 3611.17 + +reference impl time + +kernel execution time: 3026.81 ms +reference time: 3029.09 + +kernel execution time: 3189.34 ms +reference new time: 3192.69 + +schedule 1 + +kernel execution time: 2659.86 ms +TTM1: 2661.48 + +kernel execution time: 2749.47 ms +TTM2: 2750.96 + +schedule 2 + +kernel execution time: 5.54375 ms +dense: 6.71077 + +kernel execution time: 1799.52 ms +TTM after dense: 1800.4 + +ttm-ttm execution + +----------------------------------------- + +file: /home/min/a/kadhitha/workspace/my_taco/tns/vast-2015-mc1-3d.tns +---------------------------------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/tns/vast-2015-mc1-3d.tns +A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) +B1_dimension: 165427, B2_dimension: 11374, B3_dimension: 2, vals: 26021854 +C1_dimension: 2, C2_dimension: 32, vals: 64 +D1_dimension: 32, D2_dimension: 64, vals: 2048 + + +kernel execution time: 3553.08 ms +fused time: 3555.93 + +reference impl time + +kernel execution time: 2962.14 ms +reference time: 2964.25 + +kernel execution time: 3306.95 ms +reference new time: 3309.38 + +schedule 1 + +kernel execution time: 2723.22 ms +TTM1: 2724.83 + +kernel execution time: 2581.33 ms +TTM2: 2583.4 + +schedule 2 + +kernel execution time: 0.772961 ms +dense: 2.02166 + +kernel execution time: 1731.42 ms +TTM after dense: 1732.48 + +ttm-ttm 
execution + +----------------------------------------- + +file: /home/min/a/kadhitha/workspace/my_taco/tns/vast-2015-mc1-3d.tns +---------------------------------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/tns/vast-2015-mc1-3d.tns +A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) +B1_dimension: 165427, B2_dimension: 11374, B3_dimension: 2, vals: 26021854 +C1_dimension: 2, C2_dimension: 32, vals: 64 +D1_dimension: 32, D2_dimension: 64, vals: 2048 + + +kernel execution time: 3577.13 ms +fused time: 3580.97 + +reference impl time + +kernel execution time: 3010.77 ms +reference time: 3013.04 + +kernel execution time: 3364.45 ms +reference new time: 3366.58 + +schedule 1 + +kernel execution time: 2740.85 ms +TTM1: 2742.84 + +kernel execution time: 2788.11 ms +TTM2: 2790.79 + +schedule 2 + +kernel execution time: 2.57712 ms +dense: 4.23057 + +kernel execution time: 1934.52 ms +TTM after dense: 1935.9 + +ttm-ttm execution + +----------------------------------------- + +file: /home/min/a/kadhitha/workspace/my_taco/tns/vast-2015-mc1-3d.tns +---------------------------------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/tns/vast-2015-mc1-3d.tns +A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) +B1_dimension: 165427, B2_dimension: 11374, B3_dimension: 2, vals: 26021854 +C1_dimension: 2, C2_dimension: 32, vals: 64 +D1_dimension: 32, D2_dimension: 64, vals: 2048 + + +kernel execution time: 3424.23 ms +fused time: 3426.81 + +reference impl time + +kernel execution time: 3023.35 ms +reference time: 3025.97 + +kernel execution time: 3086.35 ms +reference new time: 3089.41 + +schedule 1 + +kernel execution time: 2913.43 ms +TTM1: 2915.13 + +kernel execution time: 2623.7 ms +TTM2: 2625.65 + +schedule 2 + +kernel execution time: 5.28416 ms +dense: 6.61329 + +kernel execution time: 1971.48 ms +TTM after dense: 1972.7 + +ttm-ttm execution + +----------------------------------------- + +file: /home/min/a/kadhitha/workspace/my_taco/tns/vast-2015-mc1-3d.tns 
+---------------------------------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/tns/vast-2015-mc1-3d.tns +A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) +B1_dimension: 165427, B2_dimension: 11374, B3_dimension: 2, vals: 26021854 +C1_dimension: 2, C2_dimension: 32, vals: 64 +D1_dimension: 32, D2_dimension: 64, vals: 2048 + + +kernel execution time: 3693.12 ms +fused time: 3695.79 + +reference impl time + +kernel execution time: 2900.73 ms +reference time: 2902.96 + +kernel execution time: 3138.83 ms +reference new time: 3141.16 + +schedule 1 + +kernel execution time: 2673.94 ms +TTM1: 2675.57 + +kernel execution time: 2703.37 ms +TTM2: 2705.31 + +schedule 2 + +kernel execution time: 5.31585 ms +dense: 7.12051 + +kernel execution time: 1724.31 ms +TTM after dense: 1726.36 + +ttm-ttm execution + +----------------------------------------- + +file: /home/min/a/kadhitha/workspace/my_taco/tns/vast-2015-mc1-3d.tns +---------------------------------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/tns/vast-2015-mc1-3d.tns +A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) +B1_dimension: 165427, B2_dimension: 11374, B3_dimension: 2, vals: 26021854 +C1_dimension: 2, C2_dimension: 32, vals: 64 +D1_dimension: 32, D2_dimension: 64, vals: 2048 + + +kernel execution time: 3572.56 ms +fused time: 3575.03 + +reference impl time + +kernel execution time: 2939.46 ms +reference time: 2941.84 + +kernel execution time: 3182.38 ms +reference new time: 3184.81 + +schedule 1 + +kernel execution time: 2731.33 ms +TTM1: 2733.2 + +kernel execution time: 2782.07 ms +TTM2: 2784.32 + +schedule 2 + +kernel execution time: 5.52055 ms +dense: 7.06503 + +kernel execution time: 1729.87 ms +TTM after dense: 1730.87 + +ttm-ttm execution + +----------------------------------------- + +file: /home/min/a/kadhitha/workspace/my_taco/tns/darpa1998.tns + +ttm-ttm execution + +----------------------------------------- + +file: /home/min/a/kadhitha/workspace/my_taco/tns/darpa1998.tns 
+---------------------------------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/tns/darpa1998.tns +A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) +B1_dimension: 22476, B2_dimension: 22476, B3_dimension: 23776223, vals: 28421307 +C1_dimension: 23776223, C2_dimension: 32, vals: 760839136 +D1_dimension: 32, D2_dimension: 64, vals: 2048 + + +kernel execution time: 1404.79 ms +fused time: 1406.83 + +reference impl time + +kernel execution time: 28471.3 ms +reference time: 28474.9 + +kernel execution time: 5689.54 ms +reference new time: 5692.1 + +schedule 1 + +kernel execution time: 3526.34 ms +TTM1: 3528.66 + +kernel execution time: 21.5542 ms +TTM2: 23.6182 + +schedule 2 + +kernel execution time: 6069.99 ms +dense: 6071.91 + +kernel execution time: 6163.35 ms +TTM after dense: 6165.73 + +ttm-ttm execution + +----------------------------------------- + +file: /home/min/a/kadhitha/workspace/my_taco/tns/darpa1998.tns +---------------------------------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/tns/darpa1998.tns +A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) +B1_dimension: 22476, B2_dimension: 22476, B3_dimension: 23776223, vals: 28421307 +C1_dimension: 23776223, C2_dimension: 32, vals: 760839136 +D1_dimension: 32, D2_dimension: 64, vals: 2048 + + +kernel execution time: 1390.55 ms +fused time: 1392.48 + +reference impl time + +kernel execution time: 30840.6 ms +reference time: 30843.4 + +kernel execution time: 5638.37 ms +reference new time: 5641.01 + +schedule 1 + +kernel execution time: 3642.19 ms +TTM1: 3644.13 + +kernel execution time: 24.3447 ms +TTM2: 25.6449 + +schedule 2 + +kernel execution time: 6027.41 ms +dense: 6029.82 + +kernel execution time: 6494.21 ms +TTM after dense: 6497.33 + +ttm-ttm execution + +----------------------------------------- + +file: /home/min/a/kadhitha/workspace/my_taco/tns/vast-2015-mc1-3d.tns +---------------------------------------------------------------- 
+/home/min/a/kadhitha/workspace/my_taco/tns/vast-2015-mc1-3d.tns +A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) +B1_dimension: 165427, B2_dimension: 11374, B3_dimension: 2, vals: 26021854 +C1_dimension: 2, C2_dimension: 32, vals: 64 +D1_dimension: 32, D2_dimension: 64, vals: 2048 + + +kernel execution time: 3727.32 ms +fused time: 3729.78 + +reference impl time + +kernel execution time: 2996.48 ms +reference time: 2999.42 + +kernel execution time: 3216.53 ms +reference new time: 3218.79 + +schedule 1 + +kernel execution time: 2902.94 ms +TTM1: 2904.86 + +kernel execution time: 2722.22 ms +TTM2: 2724.59 + +schedule 2 + +kernel execution time: 5.8157 ms +dense: 7.48208 + +kernel execution time: 1725.24 ms +TTM after dense: 1726.69 diff --git a/test/test.cpp b/test/test.cpp index a49f10ff7..851493b7f 100644 --- a/test/test.cpp +++ b/test/test.cpp @@ -38,6 +38,20 @@ void ASSERT_TENSOR_EQ(TensorBase expected, TensorBase actual) { ASSERT_TRUE(equals(expected, actual)); } +// void ASSERT_TENSOR_VAL(TensorBase expected, TensorBase actual) { +// std::cout << "order: " << expected.getOrder(); +// std::vector modes{}; +// for (int mode = 0; mode < expected.getOrder(); mode++) { +// if (expected.getDimension(mode) != actual.getDimension(mode)) { +// ASSERT_TRUE(false); +// } + +// for (int i=0; i expected, void ASSERT_STORAGE_EQ(TensorStorage expected, TensorStorage actual); void ASSERT_TENSOR_EQ(TensorBase expected, TensorBase actual); +// void ASSERT_TENSOR_VAL(TensorBase expected, TensorBase actual); template void ASSERT_COMPONENTS_EQUALS(vector>> expectedIndices, diff --git a/test/tests-indexstmt.cpp b/test/tests-indexstmt.cpp index e2a972430..123bea3e6 100644 --- a/test/tests-indexstmt.cpp +++ b/test/tests-indexstmt.cpp @@ -1,10 +1,13 @@ +#include "taco/index_notation/kernel.h" +#include "taco/type.h" #include "test.h" #include "test_tensors.h" #include "taco/tensor.h" #include "taco/index_notation/index_notation.h" +#include "taco/index_notation/transformations.h" using 
namespace taco; -const IndexVar i("i"), j("j"), k("k"); +const IndexVar i("i"), j("j"), k("k"), l("l"), m("m"); TEST(indexstmt, assignment) { Type t(type(), {3}); @@ -84,4 +87,193 @@ TEST(indexstmt, spmm) { } +TEST(indexstmt, sddmm) { + Type t(type(), {3,3}); + TensorVar A("A", t, {Sparse, Dense}); + TensorVar B("B", t, {Sparse, Dense}); + TensorVar C("C", t, {Dense, Dense}); + TensorVar w("w", Type(type(),{3}), Dense); + + // the below expression is the concrete index notation + // where (consumer, producer) + IndexStmt spmm = forall(i, + forall(k, + where(forall(j, A(i,j) = w(j)), + forall(j, w(j) += B(i,k)*C(k,j)) + ) + ) + ); + + // after adding scheduling transformations to this concrete-topologically sorted index stmt + // + + std::cout << spmm << std::endl; + spmm = reorderLoopsTopologically(spmm); + std::cout << "topologically reordered loops statement: " << spmm << std::endl; + + Kernel kernel = compile(spmm); + kernel.compute(); +} + +TEST(indexstmt, sddmmPlusSpmm) { + + // Y(i,l) = B(i,j)*C(i,k)*D(k,j) * F(j,l); + // indexstmt order i, j, k, l + //topologically reordered loops statement: forall(i, forall(k, forall(j, forall(l, Y(i,l) += B(i,j) * C(i,k) * D(k,j) * F(j,l), NotParallel, IgnoreRaces), NotParallel, IgnoreRaces), NotParallel, IgnoreRaces), NotParallel, IgnoreRaces) + + Type t(type(), {3,3}); + TensorVar Y("Y", t, {Dense, Dense}); + TensorVar B("B", t, {Dense, Sparse}); + TensorVar C("C", t, {Dense, Dense}); + TensorVar D("D", t, {Dense, Dense}); + TensorVar E("E", t, {Dense, Dense}); + + // TensorVar A("A", Type(type(),{3}), ); + TensorVar A("A", Type()); + + IndexStmt fused1 = + forall(i, + forall(j, + forall(k, + forall(l, Y(i,l) += B(i,j) * C(i,k) * D(j,k) * E(j,l)) + ) + ) + ); + + std::cout << "before topological sort" << fused1 << std::endl; + fused1 = reorderLoopsTopologically(fused1); + std::cout << "after topological sort" << fused1 << std::endl; + + Kernel kernel = compile(fused1); + + + IndexStmt fused2 = + forall(i, + forall(j, + 
where( + forall(l, Y(i,l) += A * E(j,l)), // consumer + forall(k, A += B(i,j)*C(i,k)*D(j,k)) // producer + ) + ) + ); + + Kernel kernel2 = compile(fused2); + +} + + + +TEST(indexstmt, mttkrpPlusSpmm) { + + // ./bin/taco "A(i,m)=B(i,k,l)*C(k,j)*D(l,j)*E(j,m)" -f=A:dd:0,1 -f=B:sss:0,1,2 -f=C:dd:0,1 -f=D:dd:0,1 -f=E:dd:0,1 + + // i = 11, k = 5, l = 7, j = 8; + long unsigned int idim = 11, kdim = 5, ldim = 7, jdim = 8, mdim = 6; + + Type atype(type(), {idim, mdim}); + Type btype(type(), {idim, kdim, ldim}); + Type ctype(type(), {kdim, jdim}); + Type dtype(type(), {ldim, jdim}); + Type etype(type(), {jdim, mdim}); + + TensorVar A("A", atype, {Dense, Dense}); + TensorVar B("B", btype, {Sparse, Sparse, Sparse}); + TensorVar C("C", ctype, {Dense, Dense}); + TensorVar D("D", dtype, {Dense, Dense}); + TensorVar E("E", etype, {Dense, Dense}); + + TensorVar ws("ws", Type(type(), {jdim}) ); + + IndexStmt fused1 = + forall(i, + forall(k, + forall(l, + forall(j, + forall(m, A(i,m) += B(i,k,l) * C(k,j) * D(l,j) * E(j,m)) + ) + ) + ) + ); + + std::cout << "before topological sort" << fused1 << std::endl; + fused1 = reorderLoopsTopologically(fused1); + std::cout << "after topological sort" << fused1 << std::endl; + + Kernel kernel = compile(fused1); + + IndexStmt fused2 = + forall(i, + where( + forall(j, + forall(m, + A(i,m) += ws(j) * E(j,m) + ) + ) + , + forall(k, + forall(l, + forall(j, + ws(j) += B(i,k,l) * C(k,j) * D(l,j) + ) + ) + ) + ) + ); + + Kernel kernel2 = compile(fused2); + +} + +// ./bin/taco "y(i)=A(i,j)*B(j,k)*v(k)" -f=y:d:0 -f=A:dd:0,1 -f=B:dd:0,1 -f=v:d:0 +TEST(indexstmt, mmPlusSpmv) { + + // + + long unsigned int idim = 11, jdim = 8, kdim = 5; + + Type ytype(type(), {idim}); + Type atype(type(), {idim, jdim}); + Type btype(type(), {jdim, kdim}); + Type vtype(type(), {kdim}); + + TensorVar y("y", ytype, {Dense}); + TensorVar A("A", atype, {Dense, Dense}); + TensorVar B("B", btype, {Dense, Dense}); + TensorVar v("v", vtype, {Dense}); + + TensorVar ws("ws", 
Type(type(), {jdim}) ); + + IndexStmt fused1 = + forall(i, + forall(j, + forall(k, + forall(m, y(i) += A(i,j) * B(j,k) * v(k)) + ) + ) + ); + + std::cout << "before topological sort" << fused1 << std::endl; + fused1 = reorderLoopsTopologically(fused1); + std::cout << "after topological sort" << fused1 << std::endl; + + Kernel kernel = compile(fused1); + + IndexStmt fused2 = + where( + forall(i, + forall(j, + y(i) += A(i,j) * ws(j) + ) + ) + , + forall(j, + forall(k, + ws(j) += B(j,k) * v(k) + ) + ) + ); + + Kernel kernel2 = compile(fused2); +} + diff --git a/test/tests-scheduling-eval.cpp b/test/tests-scheduling-eval.cpp index 52bd74ab4..29a7e512e 100644 --- a/test/tests-scheduling-eval.cpp +++ b/test/tests-scheduling-eval.cpp @@ -1,42 +1,8 @@ -#include -#include -#include -#include -#include "test.h" -#include "test_tensors.h" -#include "taco/tensor.h" -#include "taco/index_notation/index_notation.h" -#include "taco/index_notation/transformations.h" -#include "codegen/codegen.h" -#include "taco/lower/lower.h" - -using namespace taco; +#include "util.h" + const IndexVar i("i"), j("j"), k("k"), l("l"), m("m"), n("n"); int WARP_SIZE = 32; -void printToCout(IndexStmt stmt) { - std::shared_ptr codegen = ir::CodeGen::init_default(cout, ir::CodeGen::ImplementationGen); - ir::Stmt compute = lower(stmt, "compute", false, true); - codegen->compile(compute, true); -} - -void printToFile(string filename, IndexStmt stmt) { - stringstream source; - - string file_path = "eval_generated/"; - mkdir(file_path.c_str(), 0777); - - std::shared_ptr codegen = ir::CodeGen::init_default(source, ir::CodeGen::ImplementationGen); - ir::Stmt compute = lower(stmt, "compute", false, true); - codegen->compile(compute, true); - - ofstream source_file; - string file_ending = should_use_CUDA_codegen() ? 
".cu" : ".c"; - source_file.open(file_path + filename + file_ending); - source_file << source.str(); - source_file.close(); -} - IndexStmt scheduleSpMVCPU(IndexStmt stmt, int CHUNK_SIZE=16) { IndexVar i0("i0"), i1("i1"), kpos("kpos"), kpos0("kpos0"), kpos1("kpos1"); return stmt.split(i, i0, i1, CHUNK_SIZE) @@ -44,6 +10,14 @@ IndexStmt scheduleSpMVCPU(IndexStmt stmt, int CHUNK_SIZE=16) { .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces); } +IndexStmt scheduleSpMVISPC(IndexStmt stmt, int CHUNK_SIZE=16) { + IndexVar i0("i0"), i1("i1"), kpos("kpos"), kpos0("kpos0"), kpos1("kpos1"); + // return stmt; + return stmt.split(i, i0, i1, CHUNK_SIZE) + .reorder({i0, i1, j}) + .parallelize(i0, ParallelUnit::CPUSimd, OutputRaceStrategy::NoRaces); +} + IndexStmt scheduleSpMMCPU(IndexStmt stmt, Tensor A, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { IndexVar i0("i0"), i1("i1"), kbounded("kbounded"), k0("k0"), k1("k1"), jpos("jpos"), jpos0("jpos0"), jpos1("jpos1"); return stmt.split(i, i0, i1, CHUNK_SIZE) @@ -54,6 +28,80 @@ IndexStmt scheduleSpMMCPU(IndexStmt stmt, Tensor A, int CHUNK_SIZE=16, i .parallelize(k, ParallelUnit::CPUVector, OutputRaceStrategy::IgnoreRaces); } +IndexStmt scheduleSpMMISPC1(IndexStmt stmt, Tensor A, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { + IndexVar i0("i0"), i1("i1"), kbounded("kbounded"), k0("k0"), k1("k1"), jpos("jpos"), jpos0("jpos0"), jpos1("jpos1"); + return stmt.split(i, i0, i1, CHUNK_SIZE) + .pos(j, jpos, A(i,j)) + .split(jpos, jpos0, jpos1, UNROLL_FACTOR) + .reorder({i0, i1, jpos0, k, jpos1}) + // .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces) + .parallelize(k, ParallelUnit::CPUSimd, OutputRaceStrategy::IgnoreRaces); +} + +IndexStmt scheduleSpMMISPCOMP1(IndexStmt stmt, Tensor A, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { + IndexVar i0("i0"), i1("i1"), kbounded("kbounded"), k0("k0"), k1("k1"), jpos("jpos"), jpos0("jpos0"), jpos1("jpos1"); + return stmt.split(i, i0, i1, CHUNK_SIZE) + .pos(j, jpos, A(i,j)) + 
.split(jpos, jpos0, jpos1, UNROLL_FACTOR) + .reorder({i0, i1, jpos0, k, jpos1}) + .parallelize(i0, ParallelUnit::CPUSpmd, OutputRaceStrategy::NoRaces) + .parallelize(k, ParallelUnit::CPUSimd, OutputRaceStrategy::IgnoreRaces); +} + +IndexStmt scheduleSpMMISPC1_2(IndexStmt stmt, Tensor A, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { + IndexVar i0("i0"), i1("i1"), kbounded("kbounded"), k0("k0"), k1("k1"), jpos("jpos"), jpos0("jpos0"), jpos1("jpos1"); + return stmt.split(i, i0, i1, CHUNK_SIZE) + .pos(j, jpos, A(i,j)) + .split(jpos, jpos0, jpos1, UNROLL_FACTOR) + .reorder({i0, i1, jpos0, k, jpos1}) + // .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces) + .parallelize(i0, ParallelUnit::CPUSimd, OutputRaceStrategy::IgnoreRaces); +} + +IndexStmt scheduleSpMMISPC1_3(IndexStmt stmt, Tensor A, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { + IndexVar i0("i0"), i1("i1"), kbounded("kbounded"), k0("k0"), k1("k1"), jpos("jpos"), jpos0("jpos0"), jpos1("jpos1"); + return stmt.split(i, i0, i1, CHUNK_SIZE) + .pos(j, jpos, A(i,j)) + .split(jpos, jpos0, jpos1, UNROLL_FACTOR) + .reorder({i0, i1, jpos0, k, jpos1}) + // .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces) + .parallelize(i1, ParallelUnit::CPUSimd, OutputRaceStrategy::IgnoreRaces); +} + +IndexStmt scheduleSpMMISPC2(IndexStmt stmt, Tensor A, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { + IndexVar i0("i0"), i1("i1"), kbounded("kbounded"), k0("k0"), k1("k1"), jpos("jpos"), jpos0("jpos0"), jpos1("jpos1"); + return stmt + .parallelize(k, ParallelUnit::CPUSimd, OutputRaceStrategy::IgnoreRaces); +} + +IndexStmt scheduleSpMMISPC2_2(IndexStmt stmt, Tensor A, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { + IndexVar i0("i0"), i1("i1"), kbounded("kbounded"), k0("k0"), k1("k1"), jpos("jpos"), jpos0("jpos0"), jpos1("jpos1"); + return stmt + .parallelize(i, ParallelUnit::CPUSimd, OutputRaceStrategy::IgnoreRaces); +} + +IndexStmt scheduleSpMMISPC3(IndexStmt stmt, Tensor A, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { + 
IndexVar i0("i0"), i1("i1"), kbounded("kbounded"), k0("k0"), k1("k1"), jpos("jpos"), jpos0("jpos0"), jpos1("jpos1"); + return stmt + // .split(i, i0, i1, CHUNK_SIZE) + // .pos(j, jpos, A(i,j)) + // .split(jpos, jpos0, jpos1, UNROLL_FACTOR) + .reorder({j, k}) + // .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces) + .parallelize(k, ParallelUnit::CPUSimd, OutputRaceStrategy::IgnoreRaces); +} + +IndexStmt scheduleSpMMISPC3_2(IndexStmt stmt, Tensor A, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { + IndexVar i0("i0"), i1("i1"), kbounded("kbounded"), k0("k0"), k1("k1"), jpos("jpos"), jpos0("jpos0"), jpos1("jpos1"); + return stmt + // .split(i, i0, i1, CHUNK_SIZE) + // .pos(j, jpos, A(i,j)) + // .split(jpos, jpos0, jpos1, UNROLL_FACTOR) + .reorder({j, k}) + // .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces) + .parallelize(i, ParallelUnit::CPUSimd, OutputRaceStrategy::IgnoreRaces); +} + IndexStmt scheduleSpGEMMCPU(IndexStmt stmt, bool doPrecompute) { Assignment assign = stmt.as().getStmt().as().getStmt() .as().getStmt().as(); @@ -107,6 +155,68 @@ IndexStmt scheduleSDDMMCPU(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, .parallelize(kpos1, ParallelUnit::CPUVector, OutputRaceStrategy::ParallelReduction); } +IndexStmt scheduleSDDMMCSRCPU(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { + IndexVar i0("i0"), i1("i1"), kpos("kpos"), kpos0("kpos0"), kpos1("kpos1"); + return stmt; + // return stmt.split(i, i0, i1, CHUNK_SIZE) + // .pos(k, kpos, B(i,k)) + // .split(kpos, kpos0, kpos1, UNROLL_FACTOR) + // .reorder({i0, i1, kpos0, j, kpos1}); + // .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces); + // .parallelize(k, ParallelUnit::CPUVector, OutputRaceStrategy::IgnoreRaces); +} + +IndexStmt scheduleSDDMM2CPU(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { + IndexVar i0("i0"), i1("i1"), jpos("jpos"), jpos0("jpos0"), jpos1("jpos1"); + return stmt.split(i, i0, i1, CHUNK_SIZE) + .pos(j, 
jpos, B(i,j)) + .split(jpos, jpos0, jpos1, UNROLL_FACTOR) + .reorder({i0, i1, jpos0, k, jpos1}) + .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces) + .parallelize(jpos1, ParallelUnit::CPUVector, OutputRaceStrategy::ParallelReduction); +} + +IndexStmt scheduleSDDMMISPC(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { + IndexVar i0("i0"), i1("i1"), kpos("kpos"), kpos0("kpos0"), kpos1("kpos1"); + return stmt.split(i, i0, i1, CHUNK_SIZE) + .pos(k, kpos, B(i,k)) + .split(kpos, kpos0, kpos1, UNROLL_FACTOR) + .reorder({i0, i1, kpos0, j, kpos1}) + // .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces); + .parallelize(kpos1, ParallelUnit::CPUSimd, OutputRaceStrategy::ParallelReduction); +} + +IndexStmt scheduleSDDMM2ISPC(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { + IndexVar i0("i0"), i1("i1"), jpos("jpos"), jpos0("jpos0"), jpos1("jpos1"); + return stmt.split(i, i0, i1, CHUNK_SIZE) + .pos(j, jpos, B(i,j)) + .split(jpos, jpos0, jpos1, UNROLL_FACTOR) + .reorder({i0, i1, jpos0, k, jpos1}) + // .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces); + .parallelize(jpos1, ParallelUnit::CPUSimd, OutputRaceStrategy::ParallelReduction); +} + +IndexStmt scheduleSDDMMISPC1(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { + IndexVar i0("i0"), i1("i1"), kpos("kpos"), kpos0("kpos0"), kpos1("kpos1"); + return stmt.split(i, i0, i1, CHUNK_SIZE) + .pos(k, kpos, B(i,k)) + .split(kpos, kpos0, kpos1, UNROLL_FACTOR) + .reorder({i0, i1, kpos0, j, kpos1}) + .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces) + .parallelize(kpos1, ParallelUnit::CPUSimd, OutputRaceStrategy::ParallelReduction); +} + +IndexStmt scheduleSDDMMISPC2(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { + IndexVar i0("i0"), i1("i1"), kpos("kpos"), kpos0("kpos0"), kpos1("kpos1"); + return stmt; + // .split(i, i0, i1, CHUNK_SIZE) + // .pos(k, kpos, B(i,k)) + // .split(kpos, 
kpos0, kpos1, UNROLL_FACTOR) + // .reorder({i0, i1, kpos0, j, kpos1}) + // .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces) + // .parallelize(kpos1, ParallelUnit::CPUSimd, OutputRaceStrategy::ParallelReduction); +} + IndexStmt scheduleTTVCPU(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16) { IndexVar f("f"), fpos("fpos"), chunk("chunk"), fpos2("fpos2"); return stmt.fuse(i, j, f) @@ -116,6 +226,16 @@ IndexStmt scheduleTTVCPU(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16) { .parallelize(chunk, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces); } +IndexStmt scheduleTTVISPC(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16) { + IndexVar f("f"), fpos("fpos"), chunk("chunk"), fpos2("fpos2"); + // return stmt; + return stmt.fuse(i, j, f) + .pos(f, fpos, B(i,j,k)) + .split(fpos, chunk, fpos2, CHUNK_SIZE) + .reorder({chunk, fpos2, k}) + .parallelize(chunk, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces); +} + IndexStmt scheduleTTVCPUCSR(IndexStmt stmt) { TensorVar result = stmt.as().getStmt().as().getStmt() .as().getStmt().as().getLhs() @@ -125,6 +245,25 @@ IndexStmt scheduleTTVCPUCSR(IndexStmt stmt) { OutputRaceStrategy::NoRaces); } +IndexStmt scheduleTTVCPUCSR_ST(IndexStmt stmt) { + TensorVar result = stmt.as().getStmt().as().getStmt() + .as().getStmt().as().getLhs() + .getTensorVar(); + return stmt.assemble(result, AssembleStrategy::Insert); +} + +IndexStmt scheduleTTVISPCCSR(IndexStmt stmt) { + TensorVar result = stmt.as().getStmt().as().getStmt() + .as().getStmt().as().getLhs() + .getTensorVar(); + return stmt.assemble(result, AssembleStrategy::Insert) + .parallelize(i, ParallelUnit::CPUSimd, OutputRaceStrategy::NoRaces); +} + +IndexStmt scheduleTTVISPCCSR2(IndexStmt stmt) { + return stmt; +} + IndexStmt scheduleTTMCPU(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { IndexVar f("f"), fpos("fpos"), chunk("chunk"), fpos2("fpos2"), kpos("kpos"), kpos1("kpos1"), kpos2("kpos2"); return stmt.fuse(i, j, f) @@ -149,12 +288,47 @@ IndexStmt 
scheduleMTTKRPCPU(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, .parallelize(i1, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces); } +IndexStmt scheduleMTTKRPCPU_ST(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { + IndexVar i1("i1"), i2("i2"); + IndexExpr precomputeExpr = stmt.as().getStmt().as().getStmt() + .as().getStmt().as().getStmt() + .as().getRhs().as().getA(); + TensorVar w("w", Type(Float64, {Dimension(j)}), taco::dense); + return stmt.split(i, i1, i2, CHUNK_SIZE) + .reorder({i1, i2, k, l, j}) + .precompute(precomputeExpr, j, j, w); + // .parallelize(j, ParallelUnit::CPUVector, OutputRaceStrategy::Atomics); // gives error when lowering for IgnoreRaces, NoRaces and Atomics + // .parallelize(i1, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces); +} + +IndexStmt scheduleMTTKRPISPC(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { + IndexVar i1("i1"), i2("i2"); + IndexExpr precomputeExpr = stmt.as().getStmt().as().getStmt() + .as().getStmt().as().getStmt() + .as().getRhs().as().getA(); + TensorVar w("w", Type(Float64, {Dimension(j)}), taco::dense); + return stmt.split(i, i1, i2, CHUNK_SIZE) + .reorder({i1, i2, k, l, j}) + .precompute(precomputeExpr, j, j, w) + .parallelize(j, ParallelUnit::CPUSimd, OutputRaceStrategy::NoRaces); +} + IndexStmt scheduleMTTKRPPrecomputedCPU(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { IndexVar i1("i1"), i2("i2"), j_pre("j_pre"); return stmt.split(i, i1, i2, CHUNK_SIZE) .parallelize(i1, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces); } +IndexStmt scheduleMTTKRPPrecomputedCPU_ST(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { + IndexVar i1("i1"), i2("i2"), j_pre("j_pre"); + return stmt.split(i, i1, i2, CHUNK_SIZE); +} + +IndexStmt scheduleMTTKRPPrecomputedISPC_ST(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { + IndexVar i1("i1"), i2("i2"), j_pre("j_pre"); + return stmt.parallelize(j, ParallelUnit::CPUSimd, 
OutputRaceStrategy::NoRaces); +} + IndexStmt scheduleMTTKRP4CPU(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { IndexVar i1("i1"), i2("i2"); return stmt.split(i, i1, i2, CHUNK_SIZE) @@ -162,6 +336,19 @@ IndexStmt scheduleMTTKRP4CPU(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16 .parallelize(i1, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces); } +IndexStmt scheduleMTTKRP4CPU_ST(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { + IndexVar i1("i1"), i2("i2"); + return stmt.split(i, i1, i2, CHUNK_SIZE) + .reorder({i1, i2, k, l, m, j}); +} + +IndexStmt scheduleMTTKRP4ISPC_ST(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { + IndexVar i1("i1"), i2("i2"); + return stmt.split(i, i1, i2, CHUNK_SIZE) + .reorder({i1, i2, k, l, m, j}) + .parallelize(j, ParallelUnit::CPUSimd, OutputRaceStrategy::NoRaces); +} + IndexStmt scheduleMTTKRP5CPU(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { IndexVar i1("i1"), i2("i2"); return stmt.split(i, i1, i2, CHUNK_SIZE) @@ -576,6 +763,92 @@ TEST(scheduling_eval, spmmCPU) { ASSERT_TENSOR_EQ(expected, C); } +TEST(scheduling_eval, spmmISPC) { + taco::util::TimeResults timevalue; + bool time = true; + + set_ISPC_codegen_enabled(false); + set_CUDA_codegen_enabled(false); + + int NUM_I = 1021/10; + int NUM_J = 1039/10; + int NUM_K = 128; + float SPARSITY = .1; + Tensor A("A", {NUM_I, NUM_J}, CSR); + Tensor B("B", {NUM_J, NUM_K}, {Dense, Dense}); + Tensor C("C", {NUM_I, NUM_K}, {Dense, Dense}); + + srand(75883); + for (int i = 0; i < NUM_I; i++) { + for (int j = 0; j < NUM_J; j++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + if (rand_float < SPARSITY) { + A.insert({i, j}, (double) ((int) (rand_float*3/SPARSITY))); + } + } + } + + for (int j = 0; j < NUM_J; j++) { + for (int k = 0; k < NUM_K; k++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + B.insert({j, k}, (double) ((int) (rand_float*3/SPARSITY))); + } + } + + A.pack(); + B.pack(); + + 
set_ISPC_codegen_enabled(true); + C(i, k) = A(i, j) * B(j, k); + + IndexStmt stmt = C.getAssignment().concretize(); + // stmt = scheduleSpMMISPC1(stmt, A); + // stmt = scheduleSpMMISPC1_2(stmt, A); + stmt = scheduleSpMMISPC1_3(stmt, A); + + // stmt = scheduleSpMMISPC2(stmt, A); + // stmt = scheduleSpMMISPC2_2(stmt, A); + + // stmt = scheduleSpMMISPC3(stmt, A); + // stmt = scheduleSpMMISPC3_2(stmt, A); + + //printToFile("spmm_cpu", stmt); + + C.compile(stmt); + C.assemble(); + C.compute(); + + set_ISPC_codegen_enabled(false); + Tensor expected("expected", {NUM_I, NUM_K}, {Dense, Dense}); + expected(i, k) = A(i, j) * B(j, k); + IndexStmt stmt_taco = expected.getAssignment().concretize(); + stmt_taco = scheduleSpMMCPU(stmt_taco, A); + + expected.compile(stmt_taco); + expected.assemble(); + expected.compute(); + ASSERT_TENSOR_EQ(expected, C); + + // float ERROR_MARGIN = 0.01; + // ASSERT_TENSOR_VAL(expected, y); + for (int i = 0; i < NUM_I; i++) { + for (int k = 0; k < NUM_K; k++) { + if (expected(i,k) <= C(i,k) + ERROR_MARGIN && expected(i,k) >= C(i,k) - ERROR_MARGIN) { + // std::cout << "matched values: expected -> " << expected(j) << " == " << y(j) << " <- actual\n"; + } + else { + std::cout << "unmatched values: expected -> " << expected(i,k) << " != " << C(i,k) << " <- actual\n"; + ASSERT_TRUE(false); + }; + } + } + + for (int i=0; i<10; i++) { + TOOL_BENCHMARK_TIMER(C.compute(), "Compute ISPC: ", timevalue); + TOOL_BENCHMARK_TIMER(expected.compute(), "Compute TACO: ", timevalue); + } +} + struct spgemm : public TestWithParam> {}; TEST_P(spgemm, scheduling_eval) { @@ -805,7 +1078,7 @@ TEST(scheduling_eval, sddmmCPU) { IndexStmt stmt = A.getAssignment().concretize(); stmt = scheduleSDDMMCPU(stmt, B); - //printToFile("sddmm_cpu", stmt); + printToFile("sddmm_cpu_ryan2", stmt); A.compile(stmt); A.assemble(); @@ -819,55 +1092,69 @@ TEST(scheduling_eval, sddmmCPU) { ASSERT_TENSOR_EQ(expected, A); } -TEST(scheduling_eval, spmvCPU) { - if (should_use_CUDA_codegen()) { 
+TEST(scheduling_eval, sddmmSPMMFusedCPU) { + if (should_use_CUDA_codegen() || should_use_ISPC_codegen()) { return; } + int NUM_I = 1021/10; int NUM_J = 1039/10; + int NUM_K = 1057/10; float SPARSITY = .3; - Tensor A("A", {NUM_I, NUM_J}, CSR); - Tensor x("x", {NUM_J}, Format({Dense})); - Tensor y("y", {NUM_I}, Format({Dense})); + Tensor A("A", {NUM_I, NUM_K}, {Dense, Dense}); + Tensor B("B", {NUM_I, NUM_K}, CSR); + Tensor C("C", {NUM_I, NUM_J}, {Dense, Dense}); + Tensor D("D", {NUM_J, NUM_K}, {Dense, Dense}); - srand(120); + srand(268238); for (int i = 0; i < NUM_I; i++) { for (int j = 0; j < NUM_J; j++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + C.insert({i, j}, (double) ((int) (rand_float*3/SPARSITY))); + } + } + + for (int i = 0; i < NUM_I; i++) { + for (int k = 0; k < NUM_K; k++) { float rand_float = (float)rand()/(float)(RAND_MAX); if (rand_float < SPARSITY) { - A.insert({i, j}, (double) ((int) (rand_float * 3 / SPARSITY))); + B.insert({i, k}, (double) ((int) (rand_float*3/SPARSITY))); } } } for (int j = 0; j < NUM_J; j++) { - float rand_float = (float)rand()/(float)(RAND_MAX); - x.insert({j}, (double) ((int) (rand_float*3/SPARSITY))); + for (int k = 0; k < NUM_K; k++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + D.insert({j, k}, (double) ((int) (rand_float*3/SPARSITY))); + } } - x.pack(); - A.pack(); + B.pack(); + C.pack(); + D.pack(); - y(i) = A(i, j) * x(j); + A(i,k) = B(i,k) * C(i,j) * D(j,k); - IndexStmt stmt = y.getAssignment().concretize(); - stmt = scheduleSpMVCPU(stmt); + IndexStmt stmt = A.getAssignment().concretize(); + stmt = scheduleSDDMMCPU(stmt, B); - //printToFile("spmv_cpu", stmt); + printToFile("sddmm_cpu_ryan2", stmt); - y.compile(stmt); - y.assemble(); - y.compute(); + A.compile(stmt); + A.assemble(); + A.compute(); - Tensor expected("expected", {NUM_I}, Format({Dense})); - expected(i) = A(i, j) * x(j); + Tensor expected("expected", {NUM_I, NUM_K}, {Dense, Dense}); + expected(i,k) = B(i,k) * C(i,j) * D(j,k); 
expected.compile(); expected.assemble(); expected.compute(); - ASSERT_TENSOR_EQ(expected, y); + ASSERT_TENSOR_EQ(expected, A); } -TEST(scheduling_eval, ttvCPU) { + +TEST(scheduling_eval, sddmmcsrCPU) { if (should_use_CUDA_codegen()) { return; } @@ -875,7 +1162,495 @@ TEST(scheduling_eval, ttvCPU) { int NUM_J = 1039/10; int NUM_K = 1057/10; float SPARSITY = .3; - Tensor A("A", {NUM_I, NUM_J}, {Dense, Dense}); // TODO: change to sparse outputs + Tensor A("A", {NUM_I, NUM_K}, CSR); + Tensor B("B", {NUM_I, NUM_K}, CSR); + Tensor C("C", {NUM_I, NUM_J}, {Dense, Dense}); + Tensor D("D", {NUM_J, NUM_K}, {Dense, Dense}); + + srand(268238); + for (int i = 0; i < NUM_I; i++) { + for (int j = 0; j < NUM_J; j++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + C.insert({i, j}, (double) ((int) (rand_float*3/SPARSITY))); + } + } + + for (int i = 0; i < NUM_I; i++) { + for (int k = 0; k < NUM_K; k++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + if (rand_float < SPARSITY) { + B.insert({i, k}, (double) ((int) (rand_float*3/SPARSITY))); + } + } + } + + for (int j = 0; j < NUM_J; j++) { + for (int k = 0; k < NUM_K; k++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + D.insert({j, k}, (double) ((int) (rand_float*3/SPARSITY))); + } + } + + B.pack(); + C.pack(); + D.pack(); + + A(i,k) = B(i,k) * C(i,j) * D(j,k); + + IndexStmt stmt = A.getAssignment().concretize(); + stmt = scheduleSDDMMCSRCPU(stmt, B); + + printToFile("sddmm_cpu", stmt); + + A.compile(stmt); + A.assemble(); + A.compute(); + + Tensor expected("expected", {NUM_I, NUM_K}, CSR); + expected(i,k) = B(i,k) * C(i,j) * D(j,k); + + IndexStmt stmt_ref = expected.getAssignment().concretize(); + printToFile("sddmm_cpu_ref", stmt_ref); + + expected.compile(stmt_ref); + expected.assemble(); + expected.compute(); + ASSERT_TENSOR_EQ(expected, A); +} + + +TEST(scheduling_eval, sddmm2CPU) { + if (should_use_CUDA_codegen()) { + return; + } + int NUM_I = 1021/10; + int NUM_J = 1021/10; + int NUM_K = 18; + 
float SPARSITY = .3; + Tensor Y("Y", {NUM_I, NUM_J}, {Dense, Compressed(ModeFormat::UNIQUE)}); + Tensor A("A", {NUM_I, NUM_J}, {Dense, Compressed(ModeFormat::UNIQUE)}); + Tensor X("X", {NUM_I, NUM_K}, {Dense, Dense}); + + srand(268238); + + for (int i = 0; i < NUM_I; i++) { + for (int j = 0; j < NUM_J; j++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + if (rand_float < SPARSITY) { + A.insert({i, j}, (double) ((int) (rand_float*3/SPARSITY))); + } + } + } + + for (int i = 0; i < NUM_J; i++) { + for (int k = 0; k < NUM_K; k++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + X.insert({i, k}, (double) ((int) (rand_float*3/SPARSITY))); + } + } + + A.pack(); + X.pack(); + + Y(i,j) = A(i,j) * X(i,k) * X(k,j); + + // IndexStmt stmt = A.getAssignment().concretize(); + // // stmt = scheduleSDDMMCPU(stmt, A); + + // printToFile("sddmm2_cpu", stmt); + + // A.compile(stmt); + // A.assemble(); + // A.compute(); + + // Tensor expected("expected", {NUM_I, NUM_J}, {Dense, Dense}); + // expected(i,j) = A(i,j) * X(i,k) * X(j,k); + // expected.compile(); + // expected.assemble(); + // expected.compute(); + // ASSERT_TENSOR_EQ(expected, A); +} + + + +// bin/taco-test --gtest_filter=scheduling_eval.sddmmISPC +TEST(scheduling_eval, sddmmISPC) { + + taco::util::TimeResults timevalue; + bool time = true; + + set_CUDA_codegen_enabled(false); + set_ISPC_codegen_enabled(false); + + int NUM_I = 1021/10; + int NUM_J = 1039/10; + int NUM_K = 1057/10; + float SPARSITY = .3; + Tensor A("A", {NUM_I, NUM_K}, {Dense, Dense}); + Tensor B("B", {NUM_I, NUM_K}, CSR); + Tensor C("C", {NUM_I, NUM_J}, {Dense, Dense}); + Tensor D("D", {NUM_J, NUM_K}, {Dense, Dense}); + + srand(268238); + for (int i = 0; i < NUM_I; i++) { + for (int j = 0; j < NUM_J; j++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + C.insert({i, j}, (double) ((int) (rand_float*3/SPARSITY))); + } + } + + for (int i = 0; i < NUM_I; i++) { + for (int k = 0; k < NUM_K; k++) { + float rand_float = 
(float)rand()/(float)(RAND_MAX); + if (rand_float < SPARSITY) { + B.insert({i, k}, (double) ((int) (rand_float*3/SPARSITY))); + } + } + } + + for (int j = 0; j < NUM_J; j++) { + for (int k = 0; k < NUM_K; k++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + D.insert({j, k}, (double) ((int) (rand_float*3/SPARSITY))); + } + } + + B.pack(); + C.pack(); + D.pack(); + + set_ISPC_codegen_enabled(true); + A(i,k) = B(i,k) * C(i,j) * D(j,k); + + IndexStmt stmt = A.getAssignment().concretize(); + stmt = scheduleSDDMMISPC(stmt, B); + + //printToFile("sddmm_cpu", stmt); + + A.compile(stmt); + A.assemble(); + // A.compute(); + + set_ISPC_codegen_enabled(false); + Tensor expected("expected", {NUM_I, NUM_K}, {Dense, Dense}); + expected(i,k) = B(i,k) * C(i,j) * D(j,k); + IndexStmt stmt_taco = A.getAssignment().concretize(); + stmt_taco = scheduleSDDMMCPU(stmt_taco, B); + expected.compile(stmt_taco); + expected.assemble(); + // expected.compute(); + + TOOL_BENCHMARK_TIMER(A.compute(), "Compute ISPC: ", timevalue); + TOOL_BENCHMARK_TIMER(expected.compute(), "Compute TACO: ", timevalue); + + ASSERT_TENSOR_EQ(expected, A); + + + // float ERROR_MARGIN = 0.01; + // ASSERT_TENSOR_VAL(expected, y); + for (int i = 0; i < NUM_I; i++) { + for (int k = 0; k < NUM_K; k++) { + if (expected(i,k) <= A(i,k) + ERROR_MARGIN && expected(i,k) >= A(i,k) - ERROR_MARGIN) { + // std::cout << "matched values: expected -> " << expected(j) << " == " << y(j) << " <- actual\n"; + } + else { + std::cout << "unmatched values: expected -> " << expected(i,k) << " != " << A(i,k) << " <- actual\n"; + ASSERT_TRUE(false); + }; + } + } + std::cout << "test scheduling_eval.sddmmISPC passed\n"; + +} + + +// bin/taco-test --gtest_filter=scheduling_eval.sddmmISPC +TEST(scheduling_eval, sddmm2ISPC) { + + taco::util::TimeResults timevalue; + bool time = true; + + set_CUDA_codegen_enabled(false); + set_ISPC_codegen_enabled(false); + + int NUM_I = 1021/10; + int NUM_K = 1039/10; + int NUM_J = 1021/10; + float 
SPARSITY = .3; + Tensor A("A", {NUM_I, NUM_J}, {Dense, Dense}); + Tensor B("B", {NUM_I, NUM_J}, CSR); + Tensor C("C", {NUM_I, NUM_K}, {Dense, Dense}); + + srand(268238); + for (int i = 0; i < NUM_I; i++) { + for (int k = 0; k < NUM_K; k++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + C.insert({i, k}, (double) ((int) (rand_float*3/SPARSITY))); + } + } + + for (int i = 0; i < NUM_I; i++) { + for (int j = 0; j < NUM_J; j++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + if (rand_float < SPARSITY) { + B.insert({i, j}, (double) ((int) (rand_float*3/SPARSITY))); + } + } + } + + B.pack(); + C.pack(); + + set_ISPC_codegen_enabled(true); + A(i,j) = B(i,j) * C(i,k) * C(j,k); + + IndexStmt stmt = A.getAssignment().concretize(); + stmt = scheduleSDDMM2ISPC(stmt, B); + + //printToFile("sddmm_cpu", stmt); + + A.compile(stmt); + A.assemble(); + // A.compute(); + + set_ISPC_codegen_enabled(false); + Tensor expected("expected", {NUM_I, NUM_J}, {Dense, Dense}); + expected(i,j) = B(i,j) * C(i,k) * C(j,k); + IndexStmt stmt_taco = A.getAssignment().concretize(); + stmt_taco = scheduleSDDMM2CPU(stmt_taco, B); + expected.compile(stmt_taco); + expected.assemble(); + // expected.compute(); + + TOOL_BENCHMARK_TIMER(A.compute(), "Compute ISPC: ", timevalue); + TOOL_BENCHMARK_TIMER(expected.compute(), "Compute TACO: ", timevalue); + + ASSERT_TENSOR_EQ(expected, A); + + + // float ERROR_MARGIN = 0.01; + // ASSERT_TENSOR_VAL(expected, y); + for (int i = 0; i < NUM_I; i++) { + for (int j = 0; j < NUM_J; j++) { + if (expected(i,j) <= A(i,j) + ERROR_MARGIN && expected(i,j) >= A(i,j) - ERROR_MARGIN) { + // std::cout << "matched values: expected -> " << expected(j) << " == " << y(j) << " <- actual\n"; + } + else { + std::cout << "unmatched values: expected -> " << expected(i,j) << " != " << A(i,j) << " <- actual\n"; + ASSERT_TRUE(false); + }; + } + } + std::cout << "test scheduling_eval.sddmmISPC passed\n"; + +} + + +TEST(scheduling_eval, spmvCPU) { + if 
(should_use_CUDA_codegen()) { + return; + } + int NUM_I = 1021/10; + int NUM_J = 1039/10; + float SPARSITY = .3; + Tensor A("A", {NUM_I, NUM_J}, CSR); + Tensor x("x", {NUM_J}, Format({Dense})); + Tensor y("y", {NUM_I}, Format({Dense})); + + srand(120); + for (int i = 0; i < NUM_I; i++) { + for (int j = 0; j < NUM_J; j++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + if (rand_float < SPARSITY) { + A.insert({i, j}, (double) ((int) (rand_float * 3 / SPARSITY))); + } + } + } + + for (int j = 0; j < NUM_J; j++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + x.insert({j}, (double) ((int) (rand_float*3/SPARSITY))); + } + + x.pack(); + A.pack(); + + y(i) = A(i, j) * x(j); + + IndexStmt stmt = y.getAssignment().concretize(); + stmt = scheduleSpMVCPU(stmt); + + //printToFile("spmv_cpu", stmt); + + y.compile(stmt); + y.assemble(); + y.compute(); + + Tensor expected("expected", {NUM_I}, Format({Dense})); + expected(i) = A(i, j) * x(j); + expected.compile(); + expected.assemble(); + expected.compute(); + ASSERT_TENSOR_EQ(expected, y); +} + + +TEST(scheduling_eval, spmvISPC) { + + taco::util::TimeResults timevalue; + bool time = true; + + set_ISPC_codegen_enabled(false); + set_CUDA_codegen_enabled(false); + + int NUM_I = 200021/10; + int NUM_J = 200039/10; + float SPARSITY = .2; + Tensor A("A", {NUM_I, NUM_J}, CSR); + Tensor x("x", {NUM_J}, Format({Dense})); + Tensor y("y", {NUM_I}, Format({Dense})); + + srand(120); + for (int i = 0; i < NUM_I; i++) { + for (int j = 0; j < NUM_J; j++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + if (rand_float < SPARSITY) { + A.insert({i, j}, (double) ((int) (rand_float * 3 / SPARSITY))); + } + } + } + + for (int j = 0; j < NUM_J; j++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + x.insert({j}, (double) ((int) (rand_float*3/SPARSITY))); + } + + x.pack(); + A.pack(); + + set_ISPC_codegen_enabled(true); + + y(i) = A(i, j) * x(j); + + IndexStmt stmt = y.getAssignment().concretize(); + // stmt = 
scheduleSpMVISPC(stmt); + + printToFile("spmv_cpu", stmt); + + y.compile(stmt); + y.assemble(); + // y.compile(); + + set_ISPC_codegen_enabled(false); + + // Tensor expected("expected", {NUM_I}, Format({Dense})); + // expected(i) = A(i, j) * x(j); + // expected.compile(); + // expected.assemble(); + // expected.compute(); + + + Tensor expected("expected", {NUM_I}, Format({Dense})); + expected(i) = A(i, j) * x(j); + IndexStmt stmt_taco = expected.getAssignment().concretize(); + stmt_taco = scheduleSpMVCPU(stmt_taco); + + expected.compile(stmt_taco); + expected.assemble(); + // expected.compile(); + + + TOOL_BENCHMARK_TIMER(y.compute(), "Compute ISPC: ", timevalue); + TOOL_BENCHMARK_TIMER(expected.compute(), "Compute TACO: ", timevalue); + + + ASSERT_TENSOR_EQ(expected, y); + + // float ERROR_MARGIN = 0.01; + // ASSERT_TENSOR_VAL(expected, y); + for (int j = 0; j < NUM_J; j++) { + if (expected(j) <= y(j) + ERROR_MARGIN && expected(j) >= y(j) - ERROR_MARGIN) { + // std::cout << "matched values: expected -> " << expected(j) << " == " << y(j) << " <- actual\n"; + } + else { + std::cout << "unmatched values: expected -> " << expected(j) << " != " << y(j) << " <- actual\n"; + ASSERT_TRUE(false); + }; + } + + std::cout << "test scheduling_eval.spmvISPC passed\n"; + + for (int i=0; i<10; i++) { + TOOL_BENCHMARK_TIMER(y.compute(), "Compute ISPC: ", timevalue); + TOOL_BENCHMARK_TIMER(expected.compute(), "Compute TACO: ", timevalue); + } + + +} + +TEST(scheduling_eval, ttvCPU) { + if (should_use_CUDA_codegen()) { + return; + } + int NUM_I = 1021/10; + int NUM_J = 1039/10; + int NUM_K = 1057/10; + float SPARSITY = .3; + Tensor A("A", {NUM_I, NUM_J}, {Dense, Dense}); // TODO: change to sparse outputs + Tensor B("B", {NUM_I, NUM_J, NUM_K}, {Sparse, Sparse, Sparse}); + Tensor c("c", {NUM_K}, Format({Dense})); + + srand(9536); + for (int i = 0; i < NUM_I; i++) { + for (int j = 0; j < NUM_J; j++) { + for (int k = 0; k < NUM_K; k++) { + float rand_float = (float) rand() / (float) 
(RAND_MAX); + if (rand_float < SPARSITY) { + B.insert({i, j, k}, (double) ((int) (rand_float * 3 / SPARSITY))); + } + } + } + } + + for (int k = 0; k < NUM_K; k++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + c.insert({k}, (double) ((int) (rand_float*3))); + } + + B.pack(); + c.pack(); + + A(i,j) = B(i,j,k) * c(k); + + IndexStmt stmt = A.getAssignment().concretize(); + stmt = scheduleTTVCPU(stmt, B); + + printToFile("ttv_cpu", stmt); + + A.compile(stmt); + A.assemble(); + A.compute(); + + Tensor expected("expected", {NUM_I, NUM_J}, {Dense, Dense}); + expected(i,j) = B(i,j,k) * c(k); + expected.compile(); + expected.assemble(); + expected.compute(); + ASSERT_TENSOR_EQ(expected, A); +} + + +TEST(scheduling_eval, ttvISPC) { + if (should_use_CUDA_codegen()) { + return; + } + set_CUDA_codegen_enabled(false); + set_ISPC_codegen_enabled(false); + int NUM_I = 1021/10; + int NUM_J = 1039/10; + int NUM_K = 1057/10; + float SPARSITY = .3; + Tensor A("A", {NUM_I, NUM_J}, {Dense, Dense}); // TODO: change to sparse outputs Tensor B("B", {NUM_I, NUM_J, NUM_K}, {Sparse, Sparse, Sparse}); Tensor c("c", {NUM_K}, Format({Dense})); @@ -899,25 +1674,30 @@ TEST(scheduling_eval, ttvCPU) { B.pack(); c.pack(); + set_ISPC_codegen_enabled(true); A(i,j) = B(i,j,k) * c(k); IndexStmt stmt = A.getAssignment().concretize(); - stmt = scheduleTTVCPU(stmt, B); + stmt = scheduleTTVISPC(stmt, B); - //printToFile("ttv_cpu", stmt); + printToFile("ttv_ispc", "__ttv_ispc", stmt); A.compile(stmt); A.assemble(); A.compute(); + set_ISPC_codegen_enabled(false); Tensor expected("expected", {NUM_I, NUM_J}, {Dense, Dense}); expected(i,j) = B(i,j,k) * c(k); + IndexStmt stmt_taco = expected.getAssignment().concretize(); + stmt_taco = scheduleTTVCPU(stmt_taco, B); expected.compile(); expected.assemble(); expected.compute(); ASSERT_TENSOR_EQ(expected, A); } + TEST(scheduling_eval, ttvCPU_CSR) { if (should_use_CUDA_codegen()) { return; @@ -928,7 +1708,7 @@ TEST(scheduling_eval, ttvCPU_CSR) { int NUM_K = 
1057/10; float SPARSITY = .3; Tensor A("A", {NUM_I, NUM_J}, {Dense, Sparse}); - Tensor B("B", {NUM_I, NUM_J, NUM_K}, {Sparse, Sparse, Sparse}); + Tensor B("B", {NUM_I, NUM_J, NUM_K}, {Dense, Sparse, Sparse}); Tensor c("c", {NUM_K}, Format({Dense})); srand(9536); @@ -956,11 +1736,13 @@ TEST(scheduling_eval, ttvCPU_CSR) { IndexStmt stmt = A.getAssignment().concretize(); stmt = scheduleTTVCPUCSR(stmt); + printToFile("ttv_cpu_csr", stmt); + A.compile(stmt); A.assemble(); A.compute(); - Tensor expected("expected", {NUM_I, NUM_J}, {Dense, Dense}); + Tensor expected("expected", {NUM_I, NUM_J}, {Dense, Sparse}); expected(i,j) = B(i,j,k) * c(k); expected.compile(); expected.assemble(); @@ -968,6 +1750,82 @@ TEST(scheduling_eval, ttvCPU_CSR) { ASSERT_TENSOR_EQ(expected, A); } +TEST(scheduling_eval, ttvISPC_CSR) { + if (should_use_CUDA_codegen()) { + return; + } + + int NUM_I = 10000; + int NUM_J = 1039/10; + int NUM_K = 128; + float SPARSITY = .3; + Tensor A("A", {NUM_I, NUM_J}, {Dense, Sparse}); + Tensor B("B", {NUM_I, NUM_J, NUM_K}, {Dense, Sparse, Sparse}); + Tensor c("c", {NUM_K}, Format({Dense})); + + srand(9536); + for (int i = 0; i < NUM_I; i++) { + for (int j = 0; j < NUM_J; j++) { + for (int k = 0; k < NUM_K; k++) { + float rand_float = (float) rand() / (float) (RAND_MAX); + if (rand_float < SPARSITY) { + B.insert({i, j, k}, (double) ((int) (rand_float * 3 / SPARSITY))); + } + } + } + } + + for (int k = 0; k < NUM_K; k++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + c.insert({k}, (double) ((int) (rand_float*3))); + } + + B.pack(); + c.pack(); + + set_ISPC_codegen_enabled(true); + A(i,j) = B(i,j,k) * c(k); + + IndexStmt stmt = A.getAssignment().concretize(); + stmt = scheduleTTVISPCCSR(stmt); + printToFile("ttv_ispc_csr", "__ttv_ispc_csr", stmt); + + A.compile(stmt); + A.assemble(); + A.compute(); + + set_ISPC_codegen_enabled(false); + Tensor expected("expected", {NUM_I, NUM_J}, {Dense, Sparse}); + expected(i,j) = B(i,j,k) * c(k); + IndexStmt taco_stmt = 
expected.getAssignment().concretize(); + taco_stmt = scheduleTTVCPUCSR_ST(taco_stmt); + expected.compile(taco_stmt); + expected.assemble(); + expected.compute(); + ASSERT_TENSOR_EQ(expected, A); + + Tensor A2("A2", {NUM_I, NUM_J}, {Dense, Sparse}); + set_ISPC_codegen_enabled(true); + A2(i,j) = B(i,j,k) * c(k); + + IndexStmt stmt2 = A2.getAssignment().concretize(); + + A2.compile(stmt2); + A2.assemble(); + A2.compute(); + + taco::util::TimeResults timevalue; + bool time = true; + + for (int i=0; i<3; i++) { + TOOL_BENCHMARK_TIMER(expected.compute(), "Compute TACO1: ", timevalue); + TOOL_BENCHMARK_TIMER(A.compute(), "Compute ISPC1: ", timevalue); + TOOL_BENCHMARK_TIMER(A2.compute(), "Compute ISPC2: ", timevalue); + } + + +} + TEST(scheduling_eval, ttmCPU) { if (should_use_CUDA_codegen()) { return; @@ -1010,39 +1868,318 @@ TEST(scheduling_eval, ttmCPU) { //printToFile("ttm_cpu", stmt); - A.compile(stmt); - A.assemble(); - A.compute(); + A.compile(stmt); + A.assemble(); + A.compute(); + + Tensor expected("expected", {NUM_I, NUM_J, NUM_L}, {Dense, Dense, Dense}); + expected(i,j,l) = B(i,j,k) * C(k,l); + expected.compile(); + expected.assemble(); + expected.compute(); + ASSERT_TENSOR_EQ(expected, A); +} + +TEST(scheduling_eval, ttmISPC) { + if (should_use_CUDA_codegen()) { + return; + } + int NUM_I = 1021/40; + int NUM_J = 1039/40; + int NUM_K = 1057/40; + int NUM_L = 1232/40; + float SPARSITY = .1; + Tensor A("A", {NUM_I, NUM_J, NUM_L}, {Dense, Dense, Dense}); // TODO: change to sparse outputs + Tensor B("B", {NUM_I, NUM_J, NUM_K}, {Sparse, Sparse, Sparse}); + Tensor C("C", {NUM_K, NUM_L}, {Dense, Dense}); + + srand(935); + for (int i = 0; i < NUM_I; i++) { + for (int j = 0; j < NUM_J; j++) { + for (int k = 0; k < NUM_K; k++) { + float rand_float = (float) rand() / (float) (RAND_MAX); + if (rand_float < SPARSITY) { + B.insert({i, j, k}, (double) ((int) (rand_float * 3 / SPARSITY))); + } + } + } + } + + for (int k = 0; k < NUM_K; k++) { + for (int l = 0; l < NUM_L; l++) 
{ + float rand_float = (float)rand()/(float)(RAND_MAX); + C.insert({k, l}, (double) ((int) (rand_float*3))); + } + } + + B.pack(); + C.pack(); + + A(i,j,l) = B(i,j,k) * C(k,l); + + IndexStmt stmt = A.getAssignment().concretize(); + stmt = scheduleTTMCPU(stmt, B); + + //printToFile("ttm_cpu", stmt); + + A.compile(stmt); + A.assemble(); + A.compute(); + + Tensor expected("expected", {NUM_I, NUM_J, NUM_L}, {Dense, Dense, Dense}); + expected(i,j,l) = B(i,j,k) * C(k,l); + expected.compile(); + expected.assemble(); + expected.compute(); + ASSERT_TENSOR_EQ(expected, A); +} + +TEST(scheduling_eval, mttkrpCPU) { + if (should_use_CUDA_codegen()) { + return; + } + int NUM_I = 1021/20; + int NUM_J = 1039/20; + int NUM_K = 1057/20; + int NUM_L = 1232/20; + float SPARSITY = .1; + Tensor A("A", {NUM_I, NUM_J}, {Dense, Dense}); + Tensor B("B", {NUM_I, NUM_K, NUM_L}, {Dense, Sparse, Sparse}); + Tensor C("C", {NUM_K, NUM_J}, {Dense, Dense}); + Tensor D("D", {NUM_L, NUM_J}, {Dense, Dense}); + + srand(549694); + for (int i = 0; i < NUM_I; i++) { + for (int k = 0; k < NUM_K; k++) { + for (int l = 0; l < NUM_L; l++) { + float rand_float = (float) rand() / (float) (RAND_MAX); + if (rand_float < SPARSITY) { + B.insert({i, k, l}, (double) ((int) (rand_float * 3 / SPARSITY))); + } + } + } + } + + for (int k = 0; k < NUM_K; k++) { + for (int j = 0; j < NUM_J; j++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + C.insert({k, j}, (double) ((int) (rand_float*3))); + } + } + + for (int l = 0; l < NUM_L; l++) { + for (int j = 0; j < NUM_J; j++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + D.insert({l, j}, (double) ((int) (rand_float*3))); + } + } + + B.pack(); + C.pack(); + D.pack(); + + A(i,j) = B(i,k,l) * C(k,j) * D(l,j); + + IndexStmt stmt = A.getAssignment().concretize(); + stmt = scheduleMTTKRPCPU(stmt, B); + //printToFile("mttkrp_cpu", stmt); + + A.compile(stmt); + A.assemble(); + A.compute(); + + Tensor expected("expected", {NUM_I, NUM_J}, {Dense, Dense}); + 
expected(i,j) = B(i,k,l) * C(k,j) * D(l,j); + expected.compile(); + expected.assemble(); + expected.compute(); + ASSERT_TENSOR_EQ(expected, A); +} + +TEST(scheduling_eval, temp) { + if (should_use_CUDA_codegen() || should_use_ISPC_codegen()) { + return; + } + std::default_random_engine gen(0); + std::uniform_real_distribution unif(0.0, 1.0); + // Predeclare the storage formats that the inputs and output will be stored as. + // To define a format, you must specify whether each dimension is dense or sparse + // and (optionally) the order in which dimensions should be stored. The formats + // declared below correspond to doubly compressed sparse row (dcsr), row-major + // dense (rm), and column-major dense (dm). + Format dcsr({Sparse,Sparse}); + Format rm({Dense,Dense}); + Format cm({Dense,Dense}, {1,0}); + + // Load a sparse matrix from file (stored in the Matrix Market format) and + // store it as a doubly compressed sparse row matrix. Matrices correspond to + // order-2 tensors in taco. The matrix in this example can be download from: + // https://www.cise.ufl.edu/research/sparse/MM/Williams/webbase-1M.tar.gz + Tensor B = read("/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx", dcsr); + // Generate a random dense matrix and store it in row-major (dense) format. + Tensor C({B.getDimension(0), 1000}, rm); + for (int i = 0; i < C.getDimension(0); ++i) { + for (int j = 0; j < C.getDimension(1); ++j) { + C.insert({i,j}, unif(gen)); + } + } + C.pack(); + + // Generate another random dense matrix and store it in column-major format. + Tensor D({1000, B.getDimension(1)}, cm); + for (int i = 0; i < D.getDimension(0); ++i) { + for (int j = 0; j < D.getDimension(1); ++j) { + D.insert({i,j}, unif(gen)); + } + } + D.pack(); + + // Declare the output matrix to be a sparse matrix with the same dimensions as + // input matrix B, to be also stored as a doubly compressed sparse row matrix. 
+ Tensor A(B.getDimensions(), dcsr); + + // Define the SDDMM computation using index notation. + IndexVar i, j, k; + A(i,j) = B(i,j) * C(i,k) * D(k,j); + + // At this point, we have defined how entries in the output matrix should be + // computed from entries in the input matrices but have not actually performed + // the computation yet. To do so, we must first tell taco to generate code that + // can be executed to compute the SDDMM operation. + A.compile(); + // We can now call the functions taco generated to assemble the indices of the + // output matrix and then actually compute the SDDMM. + A.assemble(); + A.compute(); + // Write the output of the computation to file (stored in the Matrix Market format). + write("A.mtx", A); +} + +TEST(scheduling_eval, mttkrpISPC) { + if (should_use_CUDA_codegen()) { + return; + } + set_ISPC_codegen_enabled(false); + set_CUDA_codegen_enabled(false); + int NUM_I = 10000; // 1021/20; + int NUM_J = 256; + int NUM_K = 1057/20; + int NUM_L = 1232/20; + float SPARSITY = .1; + Tensor B("B", {NUM_I, NUM_K, NUM_L}, {Dense, Sparse, Sparse}); + Tensor C("C", {NUM_K, NUM_J}, {Dense, Dense}); + Tensor D("D", {NUM_L, NUM_J}, {Dense, Dense}); + + srand(549694); + for (int i = 0; i < NUM_I; i++) { + for (int k = 0; k < NUM_K; k++) { + for (int l = 0; l < NUM_L; l++) { + float rand_float = (float) rand() / (float) (RAND_MAX); + if (rand_float < SPARSITY) { + B.insert({i, k, l}, (double) ((int) (rand_float * 3 / SPARSITY))); + } + } + } + } + + for (int k = 0; k < NUM_K; k++) { + for (int j = 0; j < NUM_J; j++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + C.insert({k, j}, (double) ((int) (rand_float*3))); + } + } + + for (int l = 0; l < NUM_L; l++) { + for (int j = 0; j < NUM_J; j++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + D.insert({l, j}, (double) ((int) (rand_float*3))); + } + } + + B.pack(); + C.pack(); + D.pack(); - Tensor expected("expected", {NUM_I, NUM_J, NUM_L}, {Dense, Dense, Dense}); - expected(i,j,l) = 
B(i,j,k) * C(k,l); - expected.compile(); - expected.assemble(); - expected.compute(); - ASSERT_TENSOR_EQ(expected, A); + set_ISPC_codegen_enabled(true); + + Tensor A1("A1", {NUM_I, NUM_J}, {Dense, Dense}); + A1(i,j) = B(i,k,l) * C(k,j) * D(l,j); + IndexStmt stmt1 = A1.getAssignment().concretize(); + stmt1 = scheduleMTTKRPISPC(stmt1, B); + // printToFile("mttkrp1_cpu_ispc", stmt1); + A1.compile(stmt1); + A1.assemble(); + A1.compute(); + + set_ISPC_codegen_enabled(false); + Tensor expected1("expected1", {NUM_I, NUM_J}, {Dense, Dense}); + expected1(i,j) = B(i,k,l) * C(k,j) * D(l,j); + IndexStmt taco_stmt1 = expected1.getAssignment().concretize(); + taco_stmt1 = scheduleMTTKRPCPU(taco_stmt1, B); + expected1.compile(taco_stmt1); + expected1.assemble(); + expected1.compute(); + ASSERT_TENSOR_EQ(expected1, A1); + + set_ISPC_codegen_enabled(true); + Tensor A2("A2", {NUM_I, NUM_J}, {Dense, Dense}); + A2(i,j) = B(i,k,l) * C(k,j) * D(l,j); + IndexStmt stmt2 = A1.getAssignment().concretize(); + stmt2 = scheduleMTTKRPPrecomputedISPC_ST(stmt2, B); + // printToFile("mttkrp_cpu_ispc", stmt); + A2.compile(stmt2); + A2.assemble(); + A2.compute(); + ASSERT_TENSOR_EQ(expected1, A2); + + set_ISPC_codegen_enabled(false); + Tensor expected2("expected2", {NUM_I, NUM_J}, {Dense, Dense}); + expected2(i,j) = B(i,k,l) * C(k,j) * D(l,j); + IndexStmt taco_stmt2 = expected2.getAssignment().concretize(); + taco_stmt2 = scheduleMTTKRPPrecomputedCPU_ST(taco_stmt2, B); + expected2.compile(taco_stmt2); + expected2.assemble(); + expected2.compute(); + ASSERT_TENSOR_EQ(expected1, expected2); + + taco::util::TimeResults timevalue; + bool time = true; + + for (int i=0; i<3; i++) { + TOOL_BENCHMARK_TIMER(expected1.compute(), "Compute TACO1: ", timevalue); + TOOL_BENCHMARK_TIMER(A1.compute(), "Compute ISPC1: ", timevalue); + TOOL_BENCHMARK_TIMER(expected2.compute(), "Compute TACO2: ", timevalue); + TOOL_BENCHMARK_TIMER(A2.compute(), "Compute ISPC2: ", timevalue); + } } -TEST(scheduling_eval, mttkrpCPU) { + 
+TEST(scheduling_eval, mttkrp4ISPC) { if (should_use_CUDA_codegen()) { return; } - int NUM_I = 1021/20; - int NUM_J = 1039/20; + set_ISPC_codegen_enabled(false); + set_CUDA_codegen_enabled(false); + int NUM_I = 1000; // 1021/20; + int NUM_J = 16; int NUM_K = 1057/20; int NUM_L = 1232/20; + int NUM_M = 1124/20; float SPARSITY = .1; - Tensor A("A", {NUM_I, NUM_J}, {Dense, Dense}); - Tensor B("B", {NUM_I, NUM_K, NUM_L}, {Dense, Sparse, Sparse}); + Tensor B("B", {NUM_I, NUM_K, NUM_L, NUM_M}, {Dense, Sparse, Sparse, Sparse}); Tensor C("C", {NUM_K, NUM_J}, {Dense, Dense}); Tensor D("D", {NUM_L, NUM_J}, {Dense, Dense}); + Tensor E("E", {NUM_M, NUM_J}, {Dense, Dense}); srand(549694); for (int i = 0; i < NUM_I; i++) { for (int k = 0; k < NUM_K; k++) { for (int l = 0; l < NUM_L; l++) { - float rand_float = (float) rand() / (float) (RAND_MAX); - if (rand_float < SPARSITY) { - B.insert({i, k, l}, (double) ((int) (rand_float * 3 / SPARSITY))); + for (int m = 0; m < NUM_M; m++) { + float rand_float = (float) rand() / (float) (RAND_MAX); + if (rand_float < SPARSITY) { + B.insert({i, k, l, m}, (double) ((int) (rand_float * 3 / SPARSITY))); + } } } } @@ -1062,27 +2199,83 @@ TEST(scheduling_eval, mttkrpCPU) { } } + for (int m = 0; m < NUM_M; m++) { + for (int j = 0; j < NUM_J; j++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + E.insert({m, j}, (double) ((int) (rand_float*3))); + } + } + B.pack(); C.pack(); D.pack(); + E.pack(); + + set_ISPC_codegen_enabled(true); + Tensor A1("A1", {NUM_I, NUM_J}, {Dense, Dense}); + A1(i,j) = B(i,k,l,m) * C(k,j) * D(l,j) * E(m,j); + IndexStmt stmt1 = A1.getAssignment().concretize(); + stmt1 = scheduleMTTKRP4ISPC_ST(stmt1, B); + // printToFile("mttkrp1_cpu_ispc", stmt1); + A1.compile(stmt1); + A1.assemble(); + A1.compute(); + + set_ISPC_codegen_enabled(false); + Tensor expected1("expected1", {NUM_I, NUM_J}, {Dense, Dense}); + expected1(i,j) = B(i,k,l,m) * C(k,j) * D(l,j) * E(m,j); + IndexStmt taco_stmt1 = 
expected1.getAssignment().concretize(); + taco_stmt1 = scheduleMTTKRP4CPU_ST(taco_stmt1, B); + expected1.compile(taco_stmt1); + expected1.assemble(); + expected1.compute(); + ASSERT_TENSOR_EQ(expected1, A1); + + // set_ISPC_codegen_enabled(true); + // Tensor A2("A2", {NUM_I, NUM_J}, {Dense, Dense}); + // A2(i,j) = B(i,k,l) * C(k,j) * D(l,j); + // IndexStmt stmt2 = A1.getAssignment().concretize(); + // stmt2 = scheduleMTTKRPPrecomputedISPC_ST(stmt2, B); + // // printToFile("mttkrp_cpu_ispc", stmt); + // A2.compile(stmt2); + // A2.assemble(); + // A2.compute(); + // ASSERT_TENSOR_EQ(expected1, A2); + + set_ISPC_codegen_enabled(false); + Tensor expected2("expected2", {NUM_I, NUM_J}, {Dense, Dense}); + expected2(i,j) = B(i,k,l,m) * C(k,j) * D(l,j) * E(m,j); + + IndexExpr BE = B(i,k,l,m) * E(m,j); + IndexExpr BDE = BE * D(l, j); + expected2(i,j) = BDE * C(k,j); + IndexStmt taco_stmt2 = expected2.getAssignment().concretize(); + TensorVar BE_workspace("BE_workspace", Type(Float64, {Dimension(j)}), taco::dense); + TensorVar BDE_workspace("BDE_workspace", Type(Float64, {Dimension(j)}), taco::dense); + + IndexStmt precomputed_stmt = forall(i, forall(k, + where(forall(j, expected2(i,j) += BDE_workspace(j) * C(k,j)), + forall(l, where(forall(j, BDE_workspace(j) += BE_workspace(j) * D(l,j)), + forall(m, forall(j, BE_workspace(j) += B(i,k,l,m) * E(m,j)))))))); + + // IndexStmt scheduled2 = scheduleMTTKRPPrecomputedCPU(precomputed_stmt, B, 64); + // expected2.compile(scheduled2); + // expected2.assemble(); + // expected2.compute(); + // ASSERT_TENSOR_EQ(expected1, expected2); + + taco::util::TimeResults timevalue; + bool time = true; + + for (int i=0; i<3; i++) { + TOOL_BENCHMARK_TIMER(expected1.compute(), "Compute TACO1: ", timevalue); + TOOL_BENCHMARK_TIMER(A1.compute(), "Compute ISPC1: ", timevalue); + // TOOL_BENCHMARK_TIMER(expected2.compute(), "Compute TACO2: ", timevalue); + // TOOL_BENCHMARK_TIMER(A2.compute(), "Compute ISPC2: ", timevalue); + } +} - A(i,j) = B(i,k,l) * 
C(k,j) * D(l,j); - - IndexStmt stmt = A.getAssignment().concretize(); - stmt = scheduleMTTKRPCPU(stmt, B); - //printToFile("mttkrp_cpu", stmt); - - A.compile(stmt); - A.assemble(); - A.compute(); - Tensor expected("expected", {NUM_I, NUM_J}, {Dense, Dense}); - expected(i,j) = B(i,k,l) * C(k,j) * D(l,j); - expected.compile(); - expected.assemble(); - expected.compute(); - ASSERT_TENSOR_EQ(expected, A); -} TEST(scheduling_eval, spmvGPU) { if (!should_use_CUDA_codegen()) { @@ -1463,7 +2656,336 @@ TEST(scheduling_eval, mttkrpGPU) { ASSERT_TENSOR_EQ(expected, A); } -TEST(generate_evaluation_files, DISABLED_cpu) { +TEST(generate_evaluation_files, ispc) { + std::cout << "Hi Adhitha!\n" << std::endl ; + set_CUDA_codegen_enabled(false); + set_ISPC_codegen_enabled(true); + + vector> spmv_parameters = {{32}}; + vector> spmspv_parameters = {{8}}; + + // 4 to 512 and 4, 8, 16 + vector> spmm_dcsr_parameters = {{16, 8}}; + vector> spmm_parameters = {{16,4}}; + + vector> mttkrp_parameters = {}; + mttkrp_parameters.push_back({64,0}); + + vector> sddmm_parameters = {{8, 8}}; + vector> ttv_parameters = {{32}}; + + int NUM_I = 100; + int NUM_J = 100; + int NUM_K = 100; + int NUM_L = 100; + + string c_file_ending = ".h"; + string file_ending = ".ispc"; + string file_path = "eval_prepared_ispc/"; + mkdir(file_path.c_str(), 0777); + + // spmv + { + stringstream source1; + stringstream source2; + std::shared_ptr codegen = ir::CodeGen::init_default(source1, source2, ir::CodeGen::ImplementationGen); + Tensor A("A", {NUM_I, NUM_J}, CSR); + Tensor x("x", {NUM_J}, {Dense}); + Tensor y("y", {NUM_I}, {Dense}); + y(i) = A(i, j) * x(j); + std::cout << "concretizing the assignment statement\n"; + IndexStmt stmt = y.getAssignment().concretize(); + std::cout << "Printing the original IndexStmt: " << stmt << std::endl; + + for (auto paramSet : spmv_parameters) { + std::cout << "param set: " << paramSet[0] << std::endl; + IndexStmt scheduled = scheduleSpMVISPC(stmt, paramSet[0]); + std::cout << 
"scheduled IndexStmt: " << scheduled << std::endl; + ir::Stmt compute = lower(scheduled, string("compute_") + util::join(paramSet, "_"), false, true); + std::cout << "computed statement: \n" << compute << std::endl; + codegen->compile(compute, false); + } + ofstream source_file; + source_file.open(file_path + "spmv_csr_ispc_taco" + c_file_ending); + source_file << source1.str(); + source_file.close(); + + ofstream ispc_source_file; + ispc_source_file.open(file_path + "__spmv_csr_ispc_taco" + file_ending); + ispc_source_file << source2.str(); + ispc_source_file.close(); + + } + + // spmm + { + stringstream source1; + stringstream source2; + std::shared_ptr codegen = ir::CodeGen::init_default(source1, source2, ir::CodeGen::ImplementationGen); + Tensor A("A", {NUM_I, NUM_J}, CSR); + Tensor X("X", {NUM_J, NUM_K}, {Dense, Dense}); + Tensor Y("Y", {NUM_I, NUM_K}, {Dense, Dense}); + Y(i, k) = A(i, j) * X(j, k); + IndexStmt stmt = Y.getAssignment().concretize(); + bool isFirst = true; + for (auto paramSet : spmm_parameters) { + IndexStmt scheduled = scheduleSpMMISPC1(stmt, A, paramSet[0], paramSet[1]); + ir::Stmt compute = lower(scheduled, string("compute1_") + util::join(paramSet, "_"), false, true); + codegen->compile(compute, isFirst); + isFirst = false; + } + ofstream source_file; + source_file.open(file_path + "spmm_csr_ispc_taco1" + c_file_ending); + source_file << source1.str(); + source_file.close(); + + ofstream ispc_source_file; + ispc_source_file.open(file_path + "__spmm_csr_ispc_taco1" + file_ending); + ispc_source_file << source2.str(); + ispc_source_file.close(); + } + + // spmm omp + { + stringstream source1; + stringstream source2; + std::shared_ptr codegen = ir::CodeGen::init_default(source1, source2, ir::CodeGen::ImplementationGen); + Tensor A("A", {NUM_I, NUM_J}, CSR); + Tensor X("X", {NUM_J, NUM_K}, {Dense, Dense}); + Tensor Y("Y", {NUM_I, NUM_K}, {Dense, Dense}); + Y(i, k) = A(i, j) * X(j, k); + IndexStmt stmt = Y.getAssignment().concretize(); + bool 
isFirst = true; + for (auto paramSet : spmm_parameters) { + IndexStmt scheduled = scheduleSpMMISPCOMP1(stmt, A, paramSet[0], paramSet[1]); + ir::Stmt compute = lower(scheduled, string("compute1_") + util::join(paramSet, "_"), false, true); + codegen->compile(compute, isFirst); + isFirst = false; + } + ofstream source_file; + source_file.open(file_path + "spmm_omp_ispc_taco1" + c_file_ending); + source_file << source1.str(); + source_file.close(); + + ofstream ispc_source_file; + ispc_source_file.open(file_path + "__spmm_omp_ispc_taco1" + file_ending); + ispc_source_file << source2.str(); + ispc_source_file.close(); + } + + // spmm2 + { + stringstream source1; + stringstream source2; + std::shared_ptr codegen = ir::CodeGen::init_default(source1, source2, ir::CodeGen::ImplementationGen); + Tensor A("A", {NUM_I, NUM_J}, CSR); + Tensor X("X", {NUM_J, NUM_K}, {Dense, Dense}); + Tensor Y("Y", {NUM_I, NUM_K}, {Dense, Dense}); + Y(i, k) = A(i, j) * X(j, k); + IndexStmt stmt = Y.getAssignment().concretize(); + bool isFirst = true; + for (auto paramSet : spmm_parameters) { + IndexStmt scheduled = scheduleSpMMISPC2(stmt, A, paramSet[0], paramSet[1]); + ir::Stmt compute = lower(scheduled, string("compute2_") + util::join(paramSet, "_"), false, true); + codegen->compile(compute, isFirst); + isFirst = false; + } + ofstream source_file; + source_file.open(file_path + "spmm_csr_ispc_taco2" + c_file_ending); + source_file << source1.str(); + source_file.close(); + + ofstream ispc_source_file; + ispc_source_file.open(file_path + "__spmm_csr_ispc_taco2" + file_ending); + ispc_source_file << source2.str(); + ispc_source_file.close(); + } + + // spmm + { + stringstream source1; + stringstream source2; + std::shared_ptr codegen = ir::CodeGen::init_default(source1, source2, ir::CodeGen::ImplementationGen); + Tensor A("A", {NUM_I, NUM_J}, CSR); + Tensor X("X", {NUM_J, NUM_K}, {Dense, Dense}); + Tensor Y("Y", {NUM_I, NUM_K}, {Dense, Dense}); + Y(i, k) = A(i, j) * X(j, k); + IndexStmt stmt 
= Y.getAssignment().concretize(); + bool isFirst = true; + for (auto paramSet : spmm_parameters) { + IndexStmt scheduled = scheduleSpMMISPC3(stmt, A, paramSet[0], paramSet[1]); + ir::Stmt compute = lower(scheduled, string("compute3_") + util::join(paramSet, "_"), false, true); + codegen->compile(compute, isFirst); + isFirst = false; + } + ofstream source_file; + source_file.open(file_path + "spmm_csr_ispc_taco3" + c_file_ending); + source_file << source1.str(); + source_file.close(); + + ofstream ispc_source_file; + ispc_source_file.open(file_path + "__spmm_csr_ispc_taco3" + file_ending); + ispc_source_file << source2.str(); + ispc_source_file.close(); + } + + // ttv + { + stringstream source; + stringstream source2; + std::shared_ptr codegen = ir::CodeGen::init_default(source, source2, ir::CodeGen::ImplementationGen); + Tensor A("A", {NUM_I, NUM_J}, {Dense, Dense}); // TODO: change to sparse outputs + Tensor B("B", {NUM_I, NUM_J, NUM_K}, {Sparse, Sparse, Sparse}); + Tensor c("c", {NUM_K}, Format({Dense})); + A(i,j) = B(i,j,k) * c(k); + IndexStmt stmt = A.getAssignment().concretize(); + bool isFirst = true; + for (auto paramSet : ttv_parameters) { + IndexStmt scheduled = scheduleTTVCPU(stmt, B, paramSet[0]); + ir::Stmt compute = lower(scheduled, string("compute_") + util::join(paramSet, "_"), false, true); + codegen->compile(compute, isFirst); + isFirst = false; + } + ofstream source_file; + source_file.open(file_path + "ttv_cpu" + c_file_ending); + source_file << source.str(); + source_file.close(); + + ofstream ispc_source_file; + ispc_source_file.open(file_path + "__ttv_cpu" + file_ending); + ispc_source_file << source2.str(); + ispc_source_file.close(); + } + + + // mttkrp3 + { + stringstream source; + stringstream source2; + std::shared_ptr codegen = ir::CodeGen::init_default(source, source2, ir::CodeGen::ImplementationGen); + Tensor A("A", {NUM_I, NUM_J}, {Dense, Dense}); + Tensor B("B", {NUM_I, NUM_K, NUM_L}, {Dense, Sparse, Sparse}); + Tensor C("C", {NUM_K, 
NUM_J}, {Dense, Dense}); + Tensor D("D", {NUM_L, NUM_J}, {Dense, Dense}); + A(i,j) = B(i,k,l) * C(k,j) * D(l,j); + IndexStmt stmt = A.getAssignment().concretize(); + bool isFirst = true; + for (auto paramSet : mttkrp_parameters) { + IndexStmt scheduled = scheduleMTTKRPCPU(stmt, B, paramSet[0], paramSet[1]); + ir::Stmt compute = lower(scheduled, string("compute_") + util::join(paramSet, "_"), false, true); + codegen->compile(compute, isFirst); + isFirst = false; + } + ofstream source_file; + source_file.open(file_path + "mttkrp3_cpu" + c_file_ending); + source_file << source.str(); + source_file.close(); + + ofstream ispc_source_file; + ispc_source_file.open(file_path + "__mttkrp3_cpu" + file_ending); + ispc_source_file << source2.str(); + ispc_source_file.close(); + } + + + return; +} + + + +TEST(generate_ispc_sddmm_evaluation_files, ispc) { + std::cout << "Hi Adhitha!\n" << std::endl ; + set_CUDA_codegen_enabled(false); + set_ISPC_codegen_enabled(true); + + vector> spmv_parameters = {{32}}; + vector> spmspv_parameters = {{8}}; + + // 4 to 512 and 4, 8, 16 + vector> spmm_dcsr_parameters = {{16, 8}}; + vector> spmm_parameters = {{16,4}}; + + vector> mttkrp_parameters = {}; + mttkrp_parameters.push_back({64,0}); + + vector> sddmm_parameters = {{8, 8}}; + vector> ttv_parameters = {{32}}; + + int NUM_I = 100; + int NUM_J = 100; + int NUM_K = 100; + + string c_file_ending = ".h"; + string file_ending = ".ispc"; + string file_path = "eval_prepared_ispc/sddmm/"; + mkdir(file_path.c_str(), 0777); + + // sddmm + { + stringstream source1; + stringstream source2; + std::shared_ptr codegen = ir::CodeGen::init_default(source1, source2, ir::CodeGen::ImplementationGen); + Tensor A("A", {NUM_I, NUM_K}, {Dense, Dense}); + Tensor B("B", {NUM_I, NUM_K}, CSR); + Tensor C("C", {NUM_I, NUM_J}, {Dense, Dense}); + Tensor D("D", {NUM_J, NUM_K}, {Dense, Dense}); + A(i,k) = B(i,k) * C(i,j) * D(j,k); + IndexStmt stmt = A.getAssignment().concretize(); + bool isFirst = true; + for (auto 
paramSet : sddmm_parameters) { + IndexStmt scheduled = scheduleSDDMMISPC1(stmt, B, paramSet[0], paramSet[1]); + ir::Stmt compute = lower(scheduled, string("compute1_") + util::join(paramSet, "_"), false, true); + codegen->compile(compute, isFirst); + isFirst = false; + } + ofstream source_file; + source_file.open(file_path + "sddmm_cpu_ispc_taco1" + file_ending); + source_file << source1.str(); + source_file.close(); + + ofstream ispc_source_file; + ispc_source_file.open(file_path + "__sddmm_cpu_ispc_taco1" + file_ending); + ispc_source_file << source2.str(); + ispc_source_file.close(); + } + + + // sddmm + { + stringstream source1; + stringstream source2; + std::shared_ptr codegen = ir::CodeGen::init_default(source1, source2, ir::CodeGen::ImplementationGen); + Tensor Y("Y", {NUM_I, NUM_K}, {Dense, Dense}); + Tensor A("A", {NUM_I, NUM_K}, CSR); + Tensor X("X", {NUM_I, NUM_J}, {Dense, Dense}); + Y(i,j) = A(i,j) * X(i,k) * X(j,k); + IndexStmt stmt = Y.getAssignment().concretize(); + bool isFirst = true; + for (auto paramSet : sddmm_parameters) { + IndexStmt scheduled = scheduleSDDMMISPC2(stmt, A, paramSet[0], paramSet[1]); + ir::Stmt compute = lower(scheduled, string("compute2_") + util::join(paramSet, "_"), false, true); + codegen->compile(compute, isFirst); + isFirst = false; + } + ofstream source_file; + source_file.open(file_path + "sddmm_cpu_ispc_taco2" + file_ending); + source_file << source1.str(); + source_file.close(); + + ofstream ispc_source_file; + ispc_source_file.open(file_path + "__sddmm_cpu_ispc_taco2" + file_ending); + ispc_source_file << source2.str(); + ispc_source_file.close(); + } + + + return; +} + + + + +TEST(generate_evaluation_files, cpu) { if (should_use_CUDA_codegen()) { return; } @@ -1779,10 +3301,63 @@ TEST(generate_evaluation_files, DISABLED_cpu) { } } -TEST(generate_evaluation_files, DISABLED_gpu) { - if (!should_use_CUDA_codegen()) { - return; +TEST(generate_evaluation_files, spmv_ispc) { + set_CUDA_codegen_enabled(false); + 
set_ISPC_codegen_enabled(true); + + std::cout << "executing generate_evaluation_file.ispc\n"; + + int NUM_I = 100; + int NUM_J = 100; + + vector> spmv_parameters = {}; // {NNZ_PER_THREAD, BLOCK_SIZE} + for (int i = 3; i <= 20; i++) { + spmv_parameters.push_back({i, 512}); + } + + string file_ending_c = ".c"; + string file_ending_ispc = ".ispc"; + string file_path = "eval_prepared_ispc/spmv/"; + mkdir(file_path.c_str(), 0777); + + // spmv + { + stringstream source1; + stringstream source2; + std::shared_ptr codegen = ir::CodeGen::init_default(source1, source2, ir::CodeGen::ImplementationGen); + Tensor A("A", {NUM_I, NUM_J}, CSR); + Tensor x("x", {NUM_J}, Format({Dense})); + Tensor y("y", {NUM_I}, Format({Dense})); + IndexExpr precomputed = A(i, j) * x(j); + y(i) = precomputed; + IndexStmt stmt = y.getAssignment().concretize(); + bool isFirst = true; + for (auto paramSet : spmv_parameters) { + IndexStmt scheduled = scheduleSpMVCPU(stmt); + ir::Stmt compute = lower(scheduled, string("compute_") + util::join(paramSet, "_"), false, true); + codegen->compile(compute, isFirst); + isFirst = false; + } + ofstream source_file1; + source_file1.open(file_path + "spmv_ispc" + file_ending_c); + source_file1 << source1.str(); + source_file1.close(); + + ofstream source_file2; + source_file2.open(file_path + "__spmv_ispc" + file_ending_ispc); + source_file2 << source2.str(); + source_file2.close(); } +} + +TEST(generate_evaluation_files, gpu) { + // if (!should_use_CUDA_codegen()) { + // return; + // } + set_CUDA_codegen_enabled(true); + set_ISPC_codegen_enabled(false); + + std::cout << "executing generate_evaluation_file.gpu\n"; vector> spmv_parameters = {}; // {NNZ_PER_THREAD, BLOCK_SIZE} for (int i = 3; i <= 20; i++) { diff --git a/test/tests-scheduling-fuse.cpp b/test/tests-scheduling-fuse.cpp new file mode 100644 index 000000000..41fb86f6f --- /dev/null +++ b/test/tests-scheduling-fuse.cpp @@ -0,0 +1,2891 @@ +#include "taco/cuda.h" +#include "taco/tensor.h" +#include "test.h" 
+#include "util.h" +#include +#include "gtest/gtest.h" +#include +#include + +#define NUM_THREADS_TO_USE 1 +// #define NUM_THREADS_TO_USE 32 + +void handle_error (int retval) +{ + printf("PAPI error %d: %s\n", retval, PAPI_strerror(retval)); + exit(1); +} + +TEST(scheduling_eval, spmvFusedWithSyntheticData) { + if (should_use_CUDA_codegen() || should_use_ISPC_codegen()) { + return; + } + taco_set_num_threads(NUM_THREADS_TO_USE); + + std::default_random_engine gen(0); + std::uniform_real_distribution unif(0.0, 1.0); + + Format csr({dense, sparse}); + Format rm({dense}); + + // uncomment this for reading the csr matrix saved in mtx file + std::cout << "reading B mat mtx\n"; + + int NUM_I = 5; // 1021/10; + int NUM_J = 5; // 1039/10; + int NUM_K = 8; + float SPARSITY = .3; + Tensor B("B", {NUM_I, NUM_J}, csr); + srand(75883); + for (int i = 0; i < NUM_I; i++) { + for (int j = 0; j < NUM_J; j++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + if (rand_float < SPARSITY) { + B.insert({i, j}, (double) ((int) (rand_float*3/SPARSITY))); + } + } + } + B.pack(); + + + std::cout << "B dim0: " << B.getDimension(0) << ", dim1: " << B.getDimension(1) << std::endl; + std::cout << "adding c mat\n"; + Tensor C("C", {NUM_J, NUM_K}, csr); + for (int i = 0; i < C.getDimension(0); ++i) { + for (int j = 0; j < C.getDimension(1); ++j) { + C.insert({i,j}, unif(gen)); + } + } + std::cout << "packing C mat\n"; + C.pack(); + + Tensor v("v", {NUM_K}, rm); + for (int i = 0; i < v.getDimension(0); ++i) { + v.insert({i}, unif(gen)); + } + std::cout << "packing D mat\n"; + v.pack(); + + Tensor A("A", {NUM_I}, rm); + Tensor ref("ref", {NUM_I}, rm); + IndexVar i, j, k, l, m; + A(i) = B(i,j) * C(j,k) * v(k); + + // IndexStmt stmt = A.getAssignment().concretize(); + IndexStmt stmt = makeReductionNotation(A.getAssignment()); + stmt = makeConcreteNotation(stmt); + printToFile("SpMVfused", stmt); + stmt = reorderLoopsTopologically(stmt); + stmt = loopFusionOverFission(stmt, A.getAssignment(), 
"f", 1); + stmt = insertTemporaries(stmt); + stmt = parallelizeOuterLoop(stmt); + + A.compile(stmt); + // We can now call the functions taco generated to assemble the indices of the + // output matrix and then actually compute the MTTKRP. + A.assemble(); + + + // ref(i) = B(i,j) * C(j,k) * v(k); + // IndexStmt refStmt = makeReductionNotation(ref.getAssignment()); + // refStmt = makeConcreteNotation(refStmt); + // refStmt = insertTemporaries(refStmt); + // refStmt = parallelizeOuterLoop(refStmt); + // ref.compile(refStmt); + // ref.assemble(); + + // Tensor ref1({NUM_J}, rm); + // Tensor ref2({NUM_I}, rm); + // ref1(j) = C(j,k) * v(k); + // ref2(i) = B(i,j) * ref1(j); + + // IndexStmt ref1Stmt = makeReductionNotation(ref1.getAssignment()); + // ref1Stmt = makeConcreteNotation(ref1Stmt); + // ref1Stmt = insertTemporaries(ref1Stmt); + // ref1Stmt = parallelizeOuterLoop(ref1Stmt); + // ref1.compile(ref1Stmt); + // ref1.assemble(); + + // IndexStmt ref2Stmt = makeReductionNotation(ref2.getAssignment()); + // ref2Stmt = makeConcreteNotation(ref2Stmt); + // ref2Stmt = insertTemporaries(ref2Stmt); + // ref2Stmt = parallelizeOuterLoop(ref2Stmt); + // ref2.compile(ref2Stmt); + // ref2.assemble(); + + std::cout << "compute start\n"; + taco::util::TimeResults timevalue; + bool time = true; + // TOOL_BENCHMARK_TIMER(ref.compute(), "\n\nReference Kernel: ", timevalue); + TOOL_BENCHMARK_TIMER(A.compute(), "\n\nFused Kernel: ", timevalue); + // ASSERT_TENSOR_EQ(ref, A); + + // // check results + // for (int q = 0; q < A.getDimension(0); ++q) { + // if ( abs(A(q) - ref(q))/abs(ref(q)) > ERROR_MARGIN) { + // std::cout << "error: results don't match A("<< q << "): " + // << A(q) << ", ref: " << ref(q) << std::endl; + // ASSERT_TRUE(false); + // } + // } + // // ASSERT_TENSOR_EQ(A, ref); + // TOOL_BENCHMARK_TIMER(ref1.compute(), "\n\nSDDMM Kernel: ", timevalue); + // TOOL_BENCHMARK_TIMER(ref2.compute(), "\n\nSpMM Kernel: ", timevalue); + // ASSERT_TENSOR_EQ(ref, ref2); + + // for (int 
q = 0; q < ref2.getDimension(0); ++q) { + // for (int w = 0; w < ref2.getDimension(1); ++w) { + // if ( abs(ref2(q,w) - ref(q,w))/abs(ref(q,w)) > ERROR_MARGIN) { + // std::cout << "error: results don't match A("<< q << "," << w << "): " + // << ref2(q,w) << ", ref: " << ref(q,w) << std::endl; + // ASSERT_TRUE(false); + // } + // } + // } + +} + +TEST(scheduling_eval, spmvFused) { + if (should_use_CUDA_codegen() || should_use_ISPC_codegen()) { + return; + } + + ofstream statfile; + statfile.open( + "/home/min/a/kadhitha/workspace/my_taco/taco/test/stats/spmv-spmv.txt", std::ios::app); + if (statfile.is_open()) { + statfile << "\nspmv-spmv execution\n"; + statfile << "\n-----------------------------------------\n"; + } + taco_set_num_threads(NUM_THREADS_TO_USE); + + std::default_random_engine gen(0); + std::uniform_real_distribution unif(0.0, 1.0); + + Format csr({dense, sparse}); + Format rm({dense}); + + + + int filenum = 1; + + std::vector matfiles = { + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/synthetic/synthetic.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rma10/rma10.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/cant/cant.mtx", // 5 + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/consph/consph.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/shipsec1/shipsec1.mtx", // 8 + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/scircuit/scircuit.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx", // 10 + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/wtk/pwtk.mtx", + "/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx", // 12 + 
"/home/min/a/kadhitha/ispc-examples/data/suitesparse/wiki-Talk/wiki-Talk.mtx", // 13 + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/com-Orkut/com-Orkut.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx", // 15 + "/home/min/a/kadhitha/workspace/my_taco/FusedMM/dataset/harvard.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/twitter7/twitter7.mtx" + }; + std::vector matfilesrw = { + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/synthetic.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/cage3.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/bcsstk17.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/pdb1HYS.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/rma10.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/cant.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/consph.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/cop20k_A.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/shipsec1.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/scircuit.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/wtk/pwtk.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/webbase-1M.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/wiki-Talk.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/com-Orkut.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/circuit5M.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/harvard.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/twitter7.mtx" + }; + + // uncomment this for reading the csr matrix saved in mtx file + std::cout << "reading B mat mtx\n"; + + + int kDim = 8; + float SPARSITY = .3; + std::string matfile = matfiles[filenum]; + std::cout << "reading B mat mtx\n"; + 
Tensor B = read(matfile, csr, true); + B.setName("B"); + B.pack(); + + std::cout << "B dim0: " << B.getDimension(0) << ", dim1: " << B.getDimension(1) << std::endl; + std::cout << "adding c mat\n"; + + std::cout << "reading B mat mtx\n"; + Tensor C = read(matfile, csr, true); + C.setName("C"); + C.pack(); + + + Tensor v("v", {C.getDimension(1)}, rm); + for (int i = 0; i < v.getDimension(0); ++i) { + v.insert({i}, unif(gen)); + } + std::cout << "packing D mat\n"; + v.pack(); + + if (statfile.is_open()) { + statfile + << "A(i) = B(i,j) * C(j,k) * v(k);" << std::endl + << "B1_dimension: " << B.getDimension(0) << ", B2_dimension: " << B.getDimension(1) << ", vals: " << B.getStorage().getValues().getSize() << std::endl + << "C1_dimension: " << C.getDimension(0) << ", C2_dimension: " << C.getDimension(1) << ", vals: " << C.getStorage().getValues().getSize() << std::endl + << "D1_dimension: " << v.getDimension(0) << ", vals: " << v.getStorage().getValues().getSize() << std::endl + << std::endl; + } + + Tensor A("A", {B.getDimension(0)}, rm); + Tensor ref("ref", {B.getDimension(0)}, rm); + IndexVar i, j, k, l, m; + A(i) = B(i,j) * C(j,k) * v(k); + + ref(i) = B(i,j) * C(j,k) * v(k); + IndexStmt refStmt = makeReductionNotation(ref.getAssignment()); + refStmt = makeConcreteNotation(refStmt); + refStmt = insertTemporaries(refStmt); + refStmt = parallelizeOuterLoop(refStmt); + ref.compile(refStmt); + ref.assemble(); + + // IndexStmt stmt = A.getAssignment().concretize(); + IndexStmt stmt = makeReductionNotation(A.getAssignment()); + stmt = makeConcreteNotation(stmt); + printToFile("SpMVfused", stmt); + stmt = reorderLoopsTopologically(stmt); + stmt = loopFusionOverFission(stmt, A.getAssignment(), "f", 1); + stmt = insertTemporaries(stmt); + stmt = parallelizeOuterLoop(stmt); + A.compile(stmt); + A.assemble(); + + + // Tensor ref1({NUM_J}, rm); + // Tensor ref2({NUM_I}, rm); + // ref1(j) = C(j,k) * v(k); + // ref2(i) = B(i,j) * ref1(j); + + // IndexStmt ref1Stmt = 
makeReductionNotation(ref1.getAssignment()); + // ref1Stmt = makeConcreteNotation(ref1Stmt); + // ref1Stmt = insertTemporaries(ref1Stmt); + // ref1Stmt = parallelizeOuterLoop(ref1Stmt); + // ref1.compile(ref1Stmt); + // ref1.assemble(); + + // IndexStmt ref2Stmt = makeReductionNotation(ref2.getAssignment()); + // ref2Stmt = makeConcreteNotation(ref2Stmt); + // ref2Stmt = insertTemporaries(ref2Stmt); + // ref2Stmt = parallelizeOuterLoop(ref2Stmt); + // ref2.compile(ref2Stmt); + // ref2.assemble(); + + std::cout << "compute start\n"; + taco::util::TimeResults timevalue; + bool time = true; + std::string sofused = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/spmv_spmv/spmv_fused.so"; + + TOOL_BENCHMARK_TIMER(ref.compute(statfile, sofused), "\n\nReference Kernel: ", timevalue); + + + std::cout << "b1 dim: " << B.getTacoTensorT()->dimensions[1] << std::endl; + // TOOL_BENCHMARK_TIMER(ref.compute(statfile, sofused), "\n\nFused Kernel: ", timevalue); + // ASSERT_TENSOR_EQ(ref, A); + + // // check results + // for (int q = 0; q < A.getDimension(0); ++q) { + // if ( abs(A(q) - ref(q))/abs(ref(q)) > ERROR_MARGIN) { + // std::cout << "error: results don't match A("<< q << "): " + // << A(q) << ", ref: " << ref(q) << std::endl; + // ASSERT_TRUE(false); + // } + // } + // // ASSERT_TENSOR_EQ(A, ref); + // TOOL_BENCHMARK_TIMER(ref1.compute(), "\n\nSDDMM Kernel: ", timevalue); + // TOOL_BENCHMARK_TIMER(ref2.compute(), "\n\nSpMM Kernel: ", timevalue); + // ASSERT_TENSOR_EQ(ref, ref2); + + // for (int q = 0; q < ref2.getDimension(0); ++q) { + // for (int w = 0; w < ref2.getDimension(1); ++w) { + // if ( abs(ref2(q,w) - ref(q,w))/abs(ref(q,w)) > ERROR_MARGIN) { + // std::cout << "error: results don't match A("<< q << "," << w << "): " + // << ref2(q,w) << ", ref: " << ref(q,w) << std::endl; + // ASSERT_TRUE(false); + // } + // } + // } + + if (statfile.is_open()) { + statfile.close(); + } + +} + +TEST(scheduling_eval, sddmmFusedWithSyntheticData) { + if 
(should_use_CUDA_codegen() || should_use_ISPC_codegen()) { + return; + } + + taco_set_num_threads(NUM_THREADS_TO_USE); + + std::default_random_engine gen(0); + std::uniform_real_distribution unif(0.0, 1.0); + + Format csr({dense, sparse}); + Format rm({dense, dense}); + int ldim = 4; + int kdim = 8; + + // uncomment this for reading the csr matrix saved in mtx file + std::cout << "reading B mat mtx\n"; + + int NUM_I = 1021/10; + int NUM_J = 1039/10; + float SPARSITY = .3; + Tensor B("B", {NUM_I, NUM_J}, csr); + srand(75883); + for (int i = 0; i < NUM_I; i++) { + for (int j = 0; j < NUM_J; j++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + if (rand_float < SPARSITY) { + B.insert({i, j}, (double) ((int) (rand_float*3/SPARSITY))); + } + } + } + B.pack(); + write("/home/min/a/kadhitha/ispc-examples/data/suitesparse/synthetic/synthetic.mtx", B); + + std::cout << "B dim0: " << B.getDimension(0) << ", dim1: " << B.getDimension(1) << std::endl; + std::cout << "adding c mat\n"; + Tensor C({B.getDimension(0), kdim}, rm); + for (int i = 0; i < C.getDimension(0); ++i) { + for (int j = 0; j < C.getDimension(1); ++j) { + C.insert({i,j}, unif(gen)); + } + } + std::cout << "packing C mat\n"; + C.pack(); + + Tensor D({B.getDimension(1), kdim}, rm); + for (int i = 0; i < D.getDimension(0); ++i) { + for (int j = 0; j < D.getDimension(1); ++j) { + D.insert({i,j}, unif(gen)); + } + } + std::cout << "packing D mat\n"; + D.pack(); + + Tensor F({B.getDimension(1), ldim}, rm); + for (int i = 0; i < F.getDimension(0); ++i) { + for (int j = 0; j < F.getDimension(1); ++j) { + F.insert({i,j}, unif(gen)); + } + } + std::cout << "packing F mat\n"; + F.pack(); + + Tensor A({B.getDimension(0), ldim}, rm); + Tensor ref({B.getDimension(0), ldim}, rm); + IndexVar i, j, k, l; + A(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); + + // IndexStmt stmt = A.getAssignment().concretize(); + IndexStmt stmt = makeReductionNotation(A.getAssignment()); + stmt = makeConcreteNotation(stmt); + 
printToFile("fusedMMConcrete", stmt); + + stmt = reorderLoopsTopologically(stmt); + printToFile("fusedMMOrdered", stmt); + + stmt = loopFusionOverFission(stmt, A.getAssignment(), "b", 1); + printToFile("fusedMMFused", stmt); + + stmt = insertTemporaries(stmt); + printToFile("fusedMMWithTemps", stmt); + stmt = parallelizeOuterLoop(stmt); + printToFile("fusedMMFusedPar", stmt); + + A.compile(stmt); + // We can now call the functions taco generated to assemble the indices of the + // output matrix and then actually compute the MTTKRP. + A.assemble(); + + + ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); + IndexStmt refStmt = makeReductionNotation(ref.getAssignment()); + refStmt = makeConcreteNotation(refStmt); + refStmt = insertTemporaries(refStmt); + refStmt = parallelizeOuterLoop(refStmt); + ref.compile(refStmt); + ref.assemble(); + + Tensor ref1({B.getDimension(0), B.getDimension(1)}, csr); + Tensor ref2({B.getDimension(0), ldim}, rm); + ref1(i,j)=B(i,j)*C(i,k)*D(j,k); + ref2(i,l)=ref1(i,j)*F(j,l); + + IndexStmt ref1Stmt = makeReductionNotation(ref1.getAssignment()); + ref1Stmt = makeConcreteNotation(ref1Stmt); + ref1Stmt = insertTemporaries(ref1Stmt); + ref1Stmt = parallelizeOuterLoop(ref1Stmt); + ref1.compile(ref1Stmt); + ref1.assemble(); + + IndexStmt ref2Stmt = makeReductionNotation(ref2.getAssignment()); + ref2Stmt = makeConcreteNotation(ref2Stmt); + ref2Stmt = insertTemporaries(ref2Stmt); + ref2Stmt = parallelizeOuterLoop(ref2Stmt); + ref2.compile(ref2Stmt); + ref2.assemble(); + + std::cout << "compute start\n"; + taco::util::TimeResults timevalue; + bool time = true; + TOOL_BENCHMARK_TIMER(ref.compute(), "\n\nReference Kernel: ", timevalue); + TOOL_BENCHMARK_TIMER(A.compute(), "\n\nFused Kernel: ", timevalue); + + // check results + for (int q = 0; q < A.getDimension(0); ++q) { + for (int w = 0; w < A.getDimension(1); ++w) { + if ( abs(A(q,w) - ref(q,w))/abs(ref(q,w)) > ERROR_MARGIN) { + std::cout << "error: results don't match A("<< q << "," << w << "): " + << A(q,w) 
<< ", ref: " << ref(q,w) << std::endl; + ASSERT_TRUE(false); + } + } + } + // ASSERT_TENSOR_EQ(A, ref); + TOOL_BENCHMARK_TIMER(ref1.compute(), "\n\nSDDMM Kernel: ", timevalue); + TOOL_BENCHMARK_TIMER(ref2.compute(), "\n\nSpMM Kernel: ", timevalue); + + for (int q = 0; q < ref2.getDimension(0); ++q) { + for (int w = 0; w < ref2.getDimension(1); ++w) { + if ( abs(ref2(q,w) - ref(q,w))/abs(ref(q,w)) > ERROR_MARGIN) { + std::cout << "error: results don't match A("<< q << "," << w << "): " + << ref2(q,w) << ", ref: " << ref(q,w) << std::endl; + ASSERT_TRUE(false); + } + } + } + +} + + +IndexStmt scheduleSDDMMCPU_forfuse(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { + IndexVar i, j, k, l, m; + IndexVar i0("i0"), i1("i1"), kpos("kpos"), kpos0("kpos0"), kpos1("kpos1"); + return stmt.split(i, i0, i1, CHUNK_SIZE) + .pos(k, kpos, B(i,k)) + .split(kpos, kpos0, kpos1, UNROLL_FACTOR) + .reorder({i0, i1, kpos0, j, kpos1}) + .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces) + .parallelize(kpos1, ParallelUnit::CPUVector, OutputRaceStrategy::ParallelReduction); +} + +TEST(scheduling_eval, sddmmFused) { + if (should_use_CUDA_codegen() || should_use_ISPC_codegen()) { + return; + } + + taco_set_num_threads(NUM_THREADS_TO_USE); + + ofstream statfile; + statfile.open( + "/home/min/a/kadhitha/workspace/my_taco/taco/test/stats/sddmm-spmm.txt", std::ios::app); + if (statfile.is_open()) { + statfile << "\nsddmm-spmm execution\n"; + statfile << "\n-----------------------------------------\n"; + } + + std::default_random_engine gen(0); + std::uniform_real_distribution unif(0.0, 1.0); + + Format csr({dense, sparse}); + Format rm({dense, dense}); + int ldim = 128; + int kdim = 128; + + // vector filenums = {2,3,4,5,6,7,8,9,10,12,15}; + + vector filenums = {0}; + + for (auto filenum : filenums) { + + // int filenum = 5; + + std::vector matfiles = { + "/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx", + 
"/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/amazon.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/synthetic/synthetic.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rma10/rma10.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/cant/cant.mtx", // 5 + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/consph/consph.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/shipsec1/shipsec1.mtx", // 8 + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/scircuit/scircuit.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx", // 10 + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/wtk/pwtk.mtx", + "/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx", // 12 + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/wiki-Talk/wiki-Talk.mtx", // 13 + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/com-Orkut/com-Orkut.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx", // 15 + "/home/min/a/kadhitha/workspace/my_taco/FusedMM/dataset/harvard.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/twitter7/twitter7.mtx" + }; + std::vector matfilesrw = { + "/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/rw/cora.mtx", + "/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/rw/amazon.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/synthetic.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/cage3.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/bcsstk17.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/pdb1HYS.mtx", + 
"/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/rma10.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/cant.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/consph.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/cop20k_A.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/shipsec1.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/scircuit.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/wtk/pwtk.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/webbase-1M.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/wiki-Talk.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/com-Orkut.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/circuit5M.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/harvard.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/twitter7.mtx" + }; + + std::string matfile = matfiles[filenum]; + std::cout << "reading B mat mtx\n"; + Tensor B = read(matfile, csr, true); + B.setName("B"); + B.pack(); + // write(matfilesrw[filenum], B); + + if (statfile.is_open()) { + statfile << matfile << std::endl; + } + + std::cout << "B dim0: " << B.getDimension(0) << ", dim1: " << B.getDimension(1) << std::endl; + std::cout << "adding c mat\n"; + Tensor C({B.getDimension(0), kdim}, rm); + for (int i = 0; i < C.getDimension(0); ++i) { + for (int j = 0; j < C.getDimension(1); ++j) { + C.insert({i,j}, unif(gen)); + } + } + std::cout << "packing C mat\n"; + C.pack(); + + Tensor D({B.getDimension(1), kdim}, rm); + for (int i = 0; i < D.getDimension(0); ++i) { + for (int j = 0; j < D.getDimension(1); ++j) { + D.insert({i,j}, unif(gen)); + } + } + std::cout << "packing D mat\n"; + D.pack(); + + Tensor F({B.getDimension(1), ldim}, rm); + for (int i = 0; i < F.getDimension(0); ++i) { + for (int j = 0; j < 
F.getDimension(1); ++j) { + F.insert({i,j}, unif(gen)); + } + } + std::cout << "packing F mat\n"; + F.pack(); + + Tensor A({B.getDimension(0), ldim}, rm); + Tensor ref({B.getDimension(0), ldim}, rm); + IndexVar i, j, k, l, m; + IndexVar i0("i0"), i1("i1"), jpos("jpos"), jpos0("jpos0"), jpos1("jpos1"), k0("k0"), k1("k1"); + A(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); + if (statfile.is_open()) { + statfile + << "ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);" << std::endl + << "B1_dimension: " << B.getDimension(0) << ", B2_dimension: " << B.getDimension(1) << ", vals: " << B.getStorage().getValues().getSize() << std::endl + << "C1_dimension: " << C.getDimension(0) << ", C2_dimension: " << C.getDimension(1) << ", vals: " << C.getStorage().getValues().getSize() << std::endl + << "D1_dimension: " << D.getDimension(0) << ", D2_dimension: " << D.getDimension(1) << ", vals: " << D.getStorage().getValues().getSize() << std::endl + << "E1_dimension: " << F.getDimension(0) << ", E2_dimension: " << F.getDimension(1) << ", vals: " << F.getStorage().getValues().getSize() << std::endl + << std::endl; + } + + // IndexStmt stmt = A.getAssignment().concretize(); + IndexStmt stmt = makeReductionNotation(A.getAssignment()); + stmt = makeConcreteNotation(stmt); + stmt = reorderLoopsTopologically(stmt); + stmt = loopFusionOverFission(stmt, A.getAssignment(), "b", 1); + stmt = stmt + .split(i, i0, i1, 16); + stmt = insertTemporaries(stmt); + stmt = parallelizeOuterLoop(stmt); + + A.compile(stmt); + A.assemble(); + + + ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); + IndexStmt refStmt = makeReductionNotation(ref.getAssignment()); + refStmt = makeConcreteNotation(refStmt); + refStmt = insertTemporaries(refStmt); + refStmt = refStmt + .split(i, i0, i1, 16) + .reorder({i0, i1, j, k, l}); + stmt = insertTemporaries(stmt); + refStmt = parallelizeOuterLoop(refStmt); + ref.compile(refStmt); + ref.assemble(); + + Tensor ref1({B.getDimension(0), B.getDimension(1)}, csr); + Tensor ref2({B.getDimension(0), ldim}, rm); + 
ref1(i,j)=B(i,j)*C(i,k)*D(j,k); + ref2(i,l)=ref1(i,j)*F(j,l); + + IndexStmt ref1Stmt = ref1.getAssignment().concretize(); // anyway Ryan's kernel is used here + + ref1Stmt = ref1Stmt.split(i, i0, i1, 16); + // .pos(j, jpos, B(i,j)); + // .split(k, k0, k1, 8); + // .reorder({i0, i1, jpos0, k, jpos1}); + // .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces) + // .parallelize(jpos1, ParallelUnit::CPUVector, OutputRaceStrategy::ParallelReduction); + // ref1Stmt.split(i, ); + // stmt = scheduleSDDMMCPU_forfuse(ref1Stmt, B); + // IndexStmt ref1Stmt = makeReductionNotation(ref1.getAssignment()); + // ref1Stmt = makeConcreteNotation(ref1Stmt); + ref1Stmt = insertTemporaries(ref1Stmt); + ref1Stmt = parallelizeOuterLoop(ref1Stmt); + ref1.compile(ref1Stmt); + ref1.assemble(); + + IndexStmt ref2Stmt = makeReductionNotation(ref2.getAssignment()); // Ryan's SpMM kernel is used here + ref2Stmt = makeConcreteNotation(ref2Stmt); + ref2Stmt = insertTemporaries(ref2Stmt); + ref2Stmt = parallelizeOuterLoop(ref2Stmt); + ref2.compile(ref2Stmt); + ref2.assemble(); + + std::cout << "compute start\n"; + taco::util::TimeResults timevalue; + bool time = true; + + std::string sofile_fused = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/fused_kernel.so"; + TOOL_BENCHMARK_TIMER(A.compute(statfile), "\n\nFused Kernel: ", timevalue); + if (statfile.is_open()) { + statfile << "fused time: "; + statfile << timevalue.mean << std::endl; + } else { std::cout << " stat file is not open\n"; } + + statfile << "\nseparate execution\n"; + + // std::string sofile_sddmm = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/csr_dense_spmm.so"; + std::string sofile_sddmm = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/csr_dense_dense_sddmm.so"; + TOOL_BENCHMARK_TIMER(ref1.compute(statfile, sofile_sddmm), "\n\nSDDMM Kernel: ", timevalue); + if (statfile.is_open()) { + statfile << "sddmm time: "; + statfile << timevalue.mean << 
std::endl; + } else { std::cout << " stat file is not open\n"; } + + std::string sofile_sddmm_ryan = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/sddmm_ryan.so"; + TOOL_BENCHMARK_TIMER(ref1.compute(statfile, sofile_sddmm_ryan), "\n\nSDDMM Kernel: ", timevalue); + if (statfile.is_open()) { + statfile << "sddmm time: "; + statfile << timevalue.mean << std::endl; + } else { std::cout << " stat file is not open\n"; } + + std::string sofile_spmm = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/csr_dense_spmm.so"; + TOOL_BENCHMARK_TIMER(ref2.compute(statfile, sofile_spmm), "\n\nSpMM Kernel: ", timevalue); + if (statfile.is_open()) { + statfile << "spmm time: "; + statfile << timevalue.mean << std::endl; + } else { std::cout << " stat file is not open\n"; } + + statfile << "\nreference execution \n"; + + std::string sofile_original = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/taco_original.so"; + TOOL_BENCHMARK_TIMER(ref.compute(statfile, sofile_original), "\n\nReference Kernel: ", timevalue); + if (statfile.is_open()) { + statfile << "taco reference time: "; + statfile << timevalue << std::endl; + } else { std::cout << " stat file is not open\n"; } + + double* A_vals = (double*) (A.getTacoTensorT()->vals); + double* ref_vals = (double*) (ref.getTacoTensorT()->vals); + double* ref2_vals = (double*) (ref2.getTacoTensorT()->vals); + + // int* A2_pos = (double*) (ref.getTacoTensorT()->vals); + + // for (size_t q=0; q < B.getStorage().getValues().getSize(); q++) { + // if ( abs(A_vals[q] - ref_vals[q])/abs(ref_vals[q]) > ERROR_MARGIN) { + // std::cout << "error: results don't match i: " << q << ", avals: " << A_vals[q] << " " + // << "refvals: " << ref_vals[q] << std::endl; + // ASSERT_TRUE(false); + // } + // } + + for (size_t q=0; q < A.getDimension(0)* A.getDimension(1); q++) { + if ( abs(A_vals[q] - ref_vals[q])/abs(ref_vals[q]) > ERROR_MARGIN) { + std::cout << "error: results don't match i: " << q << 
", avals: " << A_vals[q] << " " + << "refvals: " << ref_vals[q] << std::endl; + ASSERT_TRUE(false); + } + } + for (size_t q=0; q < A.getDimension(0)* A.getDimension(1); q++) { + if ( abs(A_vals[q] - ref2_vals[q])/abs(ref2_vals[q]) > ERROR_MARGIN) { + std::cout << "error: results don't match i: " << q << ", avals: " << A_vals[q] << " " + << "refvals: " << ref2_vals[q] << std::endl; + ASSERT_TRUE(false); + } + } + // // for (int q= 0; q< A_vals + // for (int q = 0; q < A.getDimension(0); ++q) { + // for (int w = 0; w < A.getDimension(1); ++w) { + // if ( abs(A(q,w) - ref(q,w))/abs(ref(q,w)) > ERROR_MARGIN) { + // std::cout << "error: results don't match A("<< q << "," << w << "): " + // << A(q,w) << ", ref: " << ref(q,w) << std::endl; + // ASSERT_TRUE(false); + // } + // } + // } + // ASSERT_TENSOR_EQ(A, ref); + + } // end of for loop + + + if (statfile.is_open()) { + statfile.close(); + } +} + + + + +TEST(scheduling_eval, hadamardFused) { + if (should_use_CUDA_codegen() || should_use_ISPC_codegen()) { + return; + } + + taco_set_num_threads(NUM_THREADS_TO_USE); + + ofstream statfile; + statfile.open( + "/home/min/a/kadhitha/workspace/my_taco/taco/test/stats/hadamard-gemm.txt", std::ios::app); + if (statfile.is_open()) { + statfile << "\nsddmm-spmm execution\n"; + statfile << "\n-----------------------------------------\n"; + } + + std::default_random_engine gen(0); + std::uniform_real_distribution unif(0.0, 1.0); + + Format csr({dense, sparse}); + Format rm({dense, dense}); + int kdim = 128; + int ldim = 128; + + // vector filenums = {2,3,4,5,6,7,8,9,10,12,15}; + vector filenums = {0}; + + for (auto filenum : filenums) { + + // int filenum = 15; + + std::vector matfiles = { + "/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx", + "/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/amazon.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/synthetic/synthetic.mtx", + 
"/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx", // 2 + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rma10/rma10.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/cant/cant.mtx", // 5 + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/consph/consph.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/shipsec1/shipsec1.mtx", // 8 + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/scircuit/scircuit.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx", // 10 + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/wtk/pwtk.mtx", + "/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx", // 12 + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/wiki-Talk/wiki-Talk.mtx", // 13 + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/com-Orkut/com-Orkut.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx", // 15 + "/home/min/a/kadhitha/workspace/my_taco/FusedMM/dataset/harvard.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/twitter7/twitter7.mtx" + }; + std::vector matfilesrw = { + "/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/rw/cora.mtx", + "/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/rw/amazon.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/synthetic.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/cage3.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/bcsstk17.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/pdb1HYS.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/rma10.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/cant.mtx", + 
"/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/consph.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/cop20k_A.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/shipsec1.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/scircuit.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/wtk/pwtk.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/webbase-1M.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/wiki-Talk.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/com-Orkut.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/circuit5M.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/harvard.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/twitter7.mtx" + }; + + std::string matfile = matfiles[filenum]; + std::cout << "reading B mat mtx\n"; + Tensor B = read(matfile, csr, true); + B.setName("B"); + B.pack(); + // write(matfilesrw[filenum], B); + + if (statfile.is_open()) { + statfile << matfile << std::endl; + } + + std::cout << "B dim0: " << B.getDimension(0) << ", dim1: " << B.getDimension(1) << std::endl; + std::cout << "adding c mat\n"; + Tensor C({B.getDimension(1), kdim}, rm); + for (int i = 0; i < C.getDimension(0); ++i) { + for (int j = 0; j < C.getDimension(1); ++j) { + C.insert({i,j}, unif(gen)); + } + } + std::cout << "packing C mat\n"; + C.pack(); + + Tensor D({B.getDimension(1), kdim}, rm); + for (int i = 0; i < D.getDimension(0); ++i) { + for (int j = 0; j < D.getDimension(1); ++j) { + D.insert({i,j}, unif(gen)); + } + } + std::cout << "packing D mat\n"; + D.pack(); + + Tensor F({kdim, ldim}, rm); + for (int i = 0; i < F.getDimension(0); ++i) { + for (int j = 0; j < F.getDimension(1); ++j) { + F.insert({i,j}, unif(gen)); + } + } + std::cout << "packing F mat\n"; + F.pack(); + + Tensor A({B.getDimension(0), ldim}, rm); + Tensor 
ref({B.getDimension(0), ldim}, rm); + IndexVar i, j, k, l, m; + IndexVar i0("i0"), i1("i1"), l0("l0"), l1("l1"), jpos("jpos"), jpos0("jpos0"), jpos1("jpos1"), k0("k0"), k1("k1"); + A(i,l)=B(i,j)*C(j,k)*D(j,k)*F(k,l); + if (statfile.is_open()) { + statfile + << "ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);" << std::endl + << "B1_dimension: " << B.getDimension(0) << ", B2_dimension: " << B.getDimension(1) << ", vals: " << B.getStorage().getValues().getSize() << std::endl + << "C1_dimension: " << C.getDimension(0) << ", C2_dimension: " << C.getDimension(1) << ", vals: " << C.getStorage().getValues().getSize() << std::endl + << "D1_dimension: " << D.getDimension(0) << ", D2_dimension: " << D.getDimension(1) << ", vals: " << D.getStorage().getValues().getSize() << std::endl + << "E1_dimension: " << F.getDimension(0) << ", E2_dimension: " << F.getDimension(1) << ", vals: " << F.getStorage().getValues().getSize() << std::endl + << std::endl; + } + + // IndexStmt stmt = A.getAssignment().concretize(); + IndexStmt stmt = makeReductionNotation(A.getAssignment()); + stmt = makeConcreteNotation(stmt); + stmt = reorderLoopsTopologically(stmt); + stmt = stmt.reorder({i, j, k, l}); + stmt = loopFusionOverFission(stmt, A.getAssignment(), "b", 1); + stmt = stmt + .split(i, i0, i1, 16); + stmt = insertTemporaries(stmt); + stmt = parallelizeOuterLoop(stmt); + printToFile("fusedMMFusedPar", stmt); + + A.compile(stmt); + A.assemble(); + + + ref(i,l)=B(i,j)*C(j,k)*D(j,k)*F(k,l); + IndexStmt refStmt = makeReductionNotation(ref.getAssignment()); + refStmt = makeConcreteNotation(refStmt); + refStmt = refStmt + .split(i, i0, i1, 16) + .reorder({i0, i1, j, k, l}); + refStmt = insertTemporaries(refStmt); + refStmt = parallelizeOuterLoop(refStmt); + ref.compile(refStmt); + ref.assemble(); + + Tensor ref1({B.getDimension(0), kdim}, rm); + Tensor ref2({B.getDimension(0), ldim}, rm); + ref1(i,k)=B(i,j)*C(j,k)*D(j,k); + ref2(i,l)=ref1(i,k)*F(k,l); + + // IndexStmt ref1Stmt = 
ref1.getAssignment().concretize(); + + // ref1Stmt = ref1Stmt.split(i, i0, i1, 16); + // // .pos(j, jpos, B(i,j)); + // // .split(k, k0, k1, 8); + // // .reorder({i0, i1, jpos0, k, jpos1}); + // // .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces) + // // .parallelize(jpos1, ParallelUnit::CPUVector, OutputRaceStrategy::ParallelReduction); + // // ref1Stmt.split(i, ); + // // stmt = scheduleSDDMMCPU_forfuse(ref1Stmt, B); + IndexStmt ref1Stmt = makeReductionNotation(ref1.getAssignment()); + ref1Stmt = makeConcreteNotation(ref1Stmt); + ref1Stmt = ref1Stmt + .split(i, i0, i1, 16) + .reorder({i0, i1, j, k}); + // .pos(j, jpos, B(i,j)) + // .split(jpos, jpos0, jpos1, 32) + // .split(k, k0, k1, 32) + // .reorder({i0, i1, jpos0, k0, jpos1, k1}); + ref1Stmt = insertTemporaries(ref1Stmt); + ref1Stmt = parallelizeOuterLoop(ref1Stmt); + ref1.compile(ref1Stmt); + ref1.assemble(); + + IndexStmt ref2Stmt = makeReductionNotation(ref2.getAssignment()); + ref2Stmt = makeConcreteNotation(ref2Stmt); + ref2Stmt = ref2Stmt + .split(i, i0, i1, 32) + .split(k, k0, k1, 32) + .split(l, l0, l1, 32) + .reorder({i0, k0, l0, i1, k1, l1}); + ref2Stmt = insertTemporaries(ref2Stmt); + ref2Stmt = parallelizeOuterLoop(ref2Stmt); + ref2.compile(ref2Stmt); + ref2.assemble(); + + std::cout << "compute start\n"; + taco::util::TimeResults timevalue; + bool time = true; + + TOOL_BENCHMARK_TIMER(A.compute(statfile), "\n\nFused Kernel: ", timevalue); + if (statfile.is_open()) { + statfile << "fused time: "; + statfile << timevalue.mean << std::endl; + } else { std::cout << " stat file is not open\n"; } + + // // std::string sofile_sddmm = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/csr_dense_spmm.so"; + // std::string sofile_sddmm = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/csr_dense_dense_sddmm.so"; + TOOL_BENCHMARK_TIMER(ref1.compute(statfile), "\n\nHadamard Kernel: ", timevalue); + if (statfile.is_open()) { + statfile << "hadamard 
time: "; + statfile << timevalue.mean << std::endl; + } else { std::cout << " stat file is not open\n"; } + + // std::string sofile_sddmm_ryan = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/sddmm_ryan.so"; + // TOOL_BENCHMARK_TIMER(ref1.compute(statfile, sofile_sddmm_ryan), "\n\nSDDMM Kernel: ", timevalue); + // if (statfile.is_open()) { + // statfile << "sddmm time: "; + // statfile << timevalue.mean << std::endl; + // } else { std::cout << " stat file is not open\n"; } + + // std::string sofile_spmm = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/csr_dense_spmm.so"; + TOOL_BENCHMARK_TIMER(ref2.compute(statfile), "\n\nGeMM Kernel: ", timevalue); + if (statfile.is_open()) { + statfile << "gemm time: "; + statfile << timevalue.mean << std::endl; + } else { std::cout << " stat file is not open\n"; } + + // std::string sofile_original = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/taco_original.so"; + TOOL_BENCHMARK_TIMER(ref.compute(statfile), "\n\nReference Kernel: ", timevalue); + if (statfile.is_open()) { + statfile << "taco reference time: "; + statfile << timevalue << std::endl; + } else { std::cout << " stat file is not open\n"; } + + double* A_vals = (double*) (A.getTacoTensorT()->vals); + double* ref_vals = (double*) (ref.getTacoTensorT()->vals); + double* ref2_vals = (double*) (ref2.getTacoTensorT()->vals); + + // // int* A2_pos = (double*) (ref.getTacoTensorT()->vals); + + for (int q=0; q < A.getDimension(0)* A.getDimension(1); q++) { + if ( abs(A_vals[q] - ref_vals[q])/abs(ref_vals[q]) > ERROR_MARGIN) { + std::cout << "error: results don't match i: " << q << ", avals: " << A_vals[q] << " " + << "refvals: " << ref_vals[q] << std::endl; + ASSERT_TRUE(false); + } + } + + for (int q=0; q < A.getDimension(0)* A.getDimension(1); q++) { + if ( abs(A_vals[q] - ref2_vals[q])/abs(ref2_vals[q]) > ERROR_MARGIN) { + std::cout << "error: results don't match i: " << q << ", avals: " << A_vals[q] << " 
" + << "refvals: " << ref2_vals[q] << std::endl; + ASSERT_TRUE(false); + } + } + + } // end of for loop + + if (statfile.is_open()) { + statfile.close(); + } + +} + + + + + + +TEST(scheduling_eval, mttkrpFusedWithSyntheticData) { + if (should_use_CUDA_codegen() || should_use_ISPC_codegen()) { + return; + } + taco_set_num_threads(NUM_THREADS_TO_USE); + + std::default_random_engine gen(0); + std::uniform_real_distribution unif(0.0, 1.0); + // Predeclare the storage formats that the inputs and output will be stored as. + // To define a format, you must specify whether each dimension is dense or + // sparse and (optionally) the order in which dimensions should be stored. The + // formats declared below correspond to compressed sparse fiber (csf) and + // row-major dense (rm). + Format csf({Sparse,Sparse,Sparse}); + Format rm({Dense,Dense}); + Format sd({Dense,Dense}); + + int NUM_I = 1021/20; + int NUM_J = 1039/20; + int NUM_K = 1057/20; + int NUM_L = 1232/20; + int NUM_M = 1231/20; + float SPARSITY = .1; + Tensor A("A", {NUM_I, NUM_M}, sd); + Tensor B("B", {NUM_I, NUM_K, NUM_L}, csf); + Tensor C("C", {NUM_K, NUM_J}, rm); + Tensor D("D", {NUM_L, NUM_J}, rm); + Tensor E("E", {NUM_J, NUM_M}, rm); + Tensor ref({NUM_I, NUM_M}, sd); + + srand(549694); + for (int i = 0; i < NUM_I; i++) { + for (int k = 0; k < NUM_K; k++) { + for (int l = 0; l < NUM_L; l++) { + float rand_float = (float) rand() / (float) (RAND_MAX); + if (rand_float < SPARSITY) { + B.insert({i, k, l}, (double) ((int) (rand_float * 3 / SPARSITY))); + } + } + } + } + B.pack(); + write("/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/synthetic.tns", B); + + // Generate a random dense matrix and store it in row-major (dense) format. + // Matrices correspond to order-2 tensors in taco. 
+ for (int k = 0; k < NUM_K; k++) { + for (int j = 0; j < NUM_J; j++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + C.insert({k, j}, (double) ((int) (rand_float*3))); + } + } + C.pack(); + + for (int l = 0; l < NUM_L; l++) { + for (int j = 0; j < NUM_J; j++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + D.insert({l, j}, (double) ((int) (rand_float*3))); + } + } + D.pack(); + + for (int i = 0; i < E.getDimension(0); ++i) { + for (int j = 0; j < E.getDimension(1); ++j) { + E.insert({i,j}, unif(gen)); + } + } + E.pack(); + + // Define the MTTKRP computation using index notation. + IndexVar i, k, l, j, m; + A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m); + + + IndexStmt stmt = makeReductionNotation(A.getAssignment()); + stmt = makeConcreteNotation(stmt); + printToFile("fusedMTTKRPConcrete", stmt); + + stmt = reorderLoopsTopologically(stmt); + printToFile("fusedMTTKRPOrdered", stmt); + + stmt = loopFusionOverFission(stmt, A.getAssignment(), "b", 1); + printToFile("fusedMTTKRPFused", stmt); + + stmt = insertTemporaries(stmt); + printToFile("fusedMTTKRPWithTemps", stmt); + stmt = parallelizeOuterLoop(stmt); + printToFile("fusedMTTKRPFusedPar", stmt); + + + // At this point, we have defined how entries in the output matrix should be + // computed from entries in the input tensor and matrices but have not actually + // performed the computation yet. To do so, we must first tell taco to generate + // code that can be executed to compute the MTTKRP operation. + A.compile(stmt); + // We can now call the functions taco generated to assemble the indices of the + // output matrix and then actually compute the MTTKRP. 
+ A.assemble(); + + + ref(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m); + IndexStmt refStmt = makeReductionNotation(ref.getAssignment()); + refStmt = makeConcreteNotation(refStmt); + refStmt = insertTemporaries(refStmt); + refStmt = parallelizeOuterLoop(refStmt); + ref.compile(refStmt); + ref.assemble(); + + // Tensor ref2({NUM_I, NUM_J}, sd); + // ref2(i,j) = B(i,k,l) * D(l,j) * C(k,j); + // IndexStmt ref2Stmt = makeReductionNotation(ref2.getAssignment()); + // ref2Stmt = makeConcreteNotation(ref2Stmt); + // ref2Stmt = insertTemporaries(ref2Stmt); + // ref2Stmt = parallelizeOuterLoop(ref2Stmt); + // ref2.compile(ref2Stmt); + // ref2.assemble(); + + // Tensor ref3({NUM_I, NUM_M}, sd); + // ref3(i,m) = ref2(i,j) * E(j,m); + // IndexStmt ref3Stmt = makeReductionNotation(ref3.getAssignment()); + // ref3Stmt = makeConcreteNotation(ref3Stmt); + // ref3Stmt = insertTemporaries(ref3Stmt); + // ref3Stmt = parallelizeOuterLoop(ref3Stmt); + // ref3.compile(ref3Stmt); + // ref3.assemble(); + + std::cout << "compute start\n"; + taco::util::TimeResults timevalue; + bool time = true; + // TOOL_BENCHMARK_TIMER(ref.compute(), "\n\nReference ISPC: ", timevalue); + TOOL_BENCHMARK_TIMER(A.compute(), "\n\nFused MTTKRP+SPMM: ", timevalue); + TOOL_BENCHMARK_TIMER(ref.compute(), "\n\nReference MTTKRP+SPMM: ", timevalue); + // TOOL_BENCHMARK_TIMER(ref2.compute(), "\n\nReference MTTKRP: ", timevalue); + // TOOL_BENCHMARK_TIMER(ref3.compute(), "\n\nReference SPMM: ", timevalue); + ASSERT_TENSOR_EQ(ref, A); + // ASSERT_TENSOR_EQ(ref, ref3); + +}
+
+
+// Benchmarks and cross-checks several schedules of
+//   A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j,m)
+// (an MTTKRP followed by a dense matrix product) on a tensor read from disk:
+//   ref2 / ref2_ryan : MTTKRP alone, default schedule vs. workspace schedule
+//   ref3             : dense product of ref2 with E
+//   ref              : the whole expression without loopFusionOverFission
+//   A                : the whole expression with loopFusionOverFission
+// Mean timings are appended to a stats file when it can be opened. The test
+// returns immediately for the CUDA and ISPC code generators.
+TEST(scheduling_eval, mttkrpFused) {
+  if (should_use_CUDA_codegen() || should_use_ISPC_codegen()) {
+    return;
+  }
+
+  taco_set_num_threads(NUM_THREADS_TO_USE);
+
+  // NOTE(review): machine-specific absolute path. When it cannot be opened the
+  // timings are only reported on stdout.
+  ofstream statfile;
+  statfile.open(
+    "/home/min/a/kadhitha/workspace/my_taco/taco/test/stats/mttkrp-spmm.txt", std::ios::app);
+  if (statfile.is_open()) {
+    statfile << "\nmttkrp-spmm execution\n";
+    statfile << "\n-----------------------------------------\n";
+  }
+
+  std::default_random_engine gen(0);
+  std::uniform_real_distribution<double> unif(0.0, 1.0);
+  // Storage formats for the inputs and outputs. A format states, per dimension,
+  // whether it is dense or sparse (and optionally the storage order). csf is
+  // used for the order-3 input; rm and sd are both dense/dense matrices.
+  Format csf({Dense,Sparse,Sparse});
+  Format rm({Dense,Dense});
+  Format sd({Dense,Dense});
+  int jDim = 32;
+  int mDim = 64;
+
+  // Index into matfiles selecting the input tensor.
+  int matfilenum = 3;
+
+  // Candidate order-3 input tensors (FROSTT .tns files). The nell-2 tensor can
+  // be downloaded from: http://frostt.io/tensors/nell-2/
+  std::vector<std::string> matfiles = {
+    "/home/min/a/kadhitha/ispc-examples/data/tns/matmul_5-5-5.tns",
+    "/home/min/a/kadhitha/ispc-examples/data/tns/delicious-3d.tns",
+    "/home/min/a/kadhitha/ispc-examples/data/tns/flickr-3d.tns", // 2
+    "/home/min/a/kadhitha/ispc-examples/data/tns/nell-2.tns", // 3
+    "/home/min/a/kadhitha/ispc-examples/data/tns/nell-1.tns", // 4
+    "/home/min/a/kadhitha/ispc-examples/data/tns/vast-2015-mc1-3d.tns", // 5
+    "/home/min/a/kadhitha/ispc-examples/data/tns/darpa1998.tns", // 6
+    "/home/min/a/kadhitha/ispc-examples/data/tns/freebase_music.tns",
+    "/home/min/a/kadhitha/ispc-examples/data/tns/freebase_sampled.tns" // 8
+  };
+  // Output paths used only by the commented-out write() below.
+  std::vector<std::string> matfilesrw = {
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/matmul_5-5-5.tns",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/delicious-3d.tns",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/flickr-3d.tns", // 2
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/nell-2.tns", // 3
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/nell-1.tns", // 4
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/vast-2015-mc1-3d.tns", // 5
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/darpa1998.tns", // 6
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/freebase_music.tns",
+    "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/freebase_sampled.tns"
+  };
+  std::string matfile = matfiles[matfilenum];
+  Tensor<double> B = read(matfile, csf, true);
+  // write(matfilesrw[matfilenum], B);
+
+  // Random dense factor matrix C: one row per index of B's second mode.
+  Tensor<double> C({B.getDimension(1), jDim}, rm);
+  for (int i = 0; i < C.getDimension(0); ++i) {
+    for (int j = 0; j < C.getDimension(1); ++j) {
+      C.insert({i,j}, unif(gen));
+    }
+  }
+  C.pack();
+
+  // Random dense factor matrix D: one row per index of B's third mode.
+  Tensor<double> D({B.getDimension(2), jDim}, rm);
+  for (int i = 0; i < D.getDimension(0); ++i) {
+    for (int j = 0; j < D.getDimension(1); ++j) {
+      D.insert({i,j}, unif(gen));
+    }
+  }
+  D.pack();
+
+  // Random dense matrix E (jDim x mDim) applied after the MTTKRP.
+  Tensor<double> E({jDim, mDim}, rm);
+  for (int i = 0; i < E.getDimension(0); ++i) {
+    for (int j = 0; j < E.getDimension(1); ++j) {
+      E.insert({i,j}, unif(gen));
+    }
+  }
+  E.pack();
+
+  // Record the input file, the expression and every operand's dimensions.
+  if (statfile.is_open()) {
+    statfile
+      << matfile << std::endl
+      << "A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m)" << std::endl
+      << "B1_dimension: " << B.getDimension(0) << ", B2_dimension: " << B.getDimension(1) << ", B3_dimension: " << B.getDimension(2) << ", vals: " << B.getStorage().getValues().getSize() << std::endl
+      << "C1_dimension: " << C.getDimension(0) << ", C2_dimension: " << C.getDimension(1) << ", vals: " << C.getStorage().getValues().getSize() << std::endl
+      << "D1_dimension: " << D.getDimension(0) << ", D2_dimension: " << D.getDimension(1) << ", vals: " << D.getStorage().getValues().getSize() << std::endl
+      << "E1_dimension: " << E.getDimension(0) << ", E2_dimension: " << E.getDimension(1) << ", vals: " << E.getStorage().getValues().getSize() << std::endl
+      << std::endl;
+  }
+
+  // Outputs are dense matrices with mDim columns and as many rows as B has
+  // slices along its first dimension.
+  Tensor<double> A({B.getDimension(0), mDim}, sd);
+  Tensor<double> ref({B.getDimension(0), mDim}, sd);
+
+  // Index variables of the expression, plus the split variables used by the
+  // schedules below.
+  IndexVar i, k, l, j, m;
+  IndexVar i1("i1"), i2("i2"), j1("j1"), j2("j2"), m1("m1"), m2("m2");
+
+  // Fused schedule under test.
+  A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m);
+
+  IndexStmt stmt = makeReductionNotation(A.getAssignment());
+  stmt = makeConcreteNotation(stmt);
+  stmt = reorderLoopsTopologically(stmt);
+  // stmt = stmt.reorder({i,j,k,l,m});
+  stmt = loopFusionOverFission(stmt, A.getAssignment(), "b", 1);
+  stmt = stmt.split(i, i1, i2, 16);
+  stmt = insertTemporaries(stmt);
+  stmt = parallelizeOuterLoop(stmt);
+  printToFile("fusedMTTKRPFusedPar", stmt);
+  A.compile(stmt);
+  A.assemble();
+
+  // Same expression without the fusion transformation.
+  ref(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m);
+  IndexStmt refStmt = makeReductionNotation(ref.getAssignment());
+  refStmt = makeConcreteNotation(refStmt);
+  refStmt = refStmt
+    .split(i, i1, i2, 16);
+  refStmt = insertTemporaries(refStmt);
+  refStmt = parallelizeOuterLoop(refStmt);
+  ref.compile(refStmt);
+  ref.assemble();
+
+  // MTTKRP alone with the default schedule.
+  Tensor<double> ref2({B.getDimension(0), jDim}, sd);
+  ref2(i,j) = B(i,k,l) * D(l,j) * C(k,j);
+  IndexStmt ref2Stmt = makeReductionNotation(ref2.getAssignment());
+  ref2Stmt = makeConcreteNotation(ref2Stmt);
+  ref2Stmt = ref2Stmt
+    .split(i, i1, i2, 16);
+  ref2Stmt = insertTemporaries(ref2Stmt);
+  ref2Stmt = parallelizeOuterLoop(ref2Stmt);
+  ref2.compile(ref2Stmt);
+  ref2.assemble();
+
+  // MTTKRP alone with a precomputed workspace over j.
+  Tensor<double> ref2_ryan({B.getDimension(0), jDim}, sd);
+  ref2_ryan(i,j) = B(i,k,l) * D(l,j) * C(k,j);
+
+  IndexStmt ref2RyanStmt = makeReductionNotation(ref2_ryan.getAssignment());
+  ref2RyanStmt = makeConcreteNotation(ref2RyanStmt);
+
+  // NOTE(review): assumes the concrete statement is four nested Forall nodes
+  // around an Assignment whose right-hand side is a Mul; getA() is then the
+  // sub-expression handed to precompute(). Confirm against the concrete
+  // notation generated for this statement.
+  IndexExpr precomputeExpr = ref2RyanStmt.as<Forall>().getStmt().as<Forall>().getStmt()
+                                         .as<Forall>().getStmt().as<Forall>().getStmt()
+                                         .as<Assignment>().getRhs().as<Mul>().getA();
+  TensorVar w("w", Type(Float64, {Dimension(j)}), taco::dense);
+  ref2RyanStmt = ref2RyanStmt.split(i, i1, i2, 16)
+    .reorder({i1, i2, k, l, j})
+    .precompute(precomputeExpr, j, j, w)
+    .parallelize(i1, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces);
+  ref2RyanStmt = insertTemporaries(ref2RyanStmt);
+  // ref2RyanStmt = parallelizeOuterLoop(ref2RyanStmt);
+  ref2_ryan.compile(ref2RyanStmt);
+  ref2_ryan.assemble();
+
+  // Dense product of the MTTKRP result (ref2) with E, tiled 16x16x16.
+  // NOTE(review): i1 is parallelized explicitly and parallelizeOuterLoop is
+  // applied afterwards as well; confirm the second call is intended.
+  Tensor<double> ref3({B.getDimension(0), mDim}, sd);
+  ref3(i,m) = ref2(i,j) * E(j,m);
+  IndexStmt ref3Stmt = makeReductionNotation(ref3.getAssignment());
+  ref3Stmt = makeConcreteNotation(ref3Stmt);
+  ref3Stmt = ref3Stmt
+    .split(i, i1, i2, 16)
+    .split(j, j1, j2, 16)
+    .split(m, m1, m2, 16)
+    .reorder({i1, j1, m1, i2, j2, m2})
+    .parallelize(i1, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces);
+  ref3Stmt = insertTemporaries(ref3Stmt);
+  ref3Stmt = parallelizeOuterLoop(ref3Stmt);
+  ref3.compile(ref3Stmt);
+  ref3.assemble();
+
+
+  std::cout << "compute start\n";
+  taco::util::TimeResults timevalue;
+  // NOTE(review): `time` is not referenced directly in this test; presumably
+  // TOOL_BENCHMARK_TIMER reads it. Confirm before removing.
+  bool time = true;
+
+  // Writes `label` followed by the mean of the most recent timing to the stats
+  // file, or reports on stdout that the file is not open.
+  auto logMeanTime = [&statfile, &timevalue](const char* label) {
+    if (statfile.is_open()) {
+      statfile << label;
+      statfile << timevalue.mean << std::endl;
+    } else {
+      std::cout << " stat file is not open\n";
+    }
+  };
+
+  // Compares two value arrays entry by entry using the relative error against
+  // `want`. Prints the first mismatch and returns false; returns true when all
+  // `count` entries are within ERROR_MARGIN.
+  auto valuesMatch = [](const double* got, const double* want, int count) -> bool {
+    for (int q = 0; q < count; q++) {
+      if ( abs(got[q] - want[q])/abs(want[q]) > ERROR_MARGIN) {
+        std::cout << "error: results don't match i: " << q << ", avals: " << got[q] << " "
+                  << "refvals: " << want[q] << std::endl;
+        return false;
+      }
+    }
+    return true;
+  };
+
+  TOOL_BENCHMARK_TIMER(ref2.compute(statfile), "\n\nDefault MTTKRP: ", timevalue);
+  logMeanTime("default mttkrp time: ");
+
+  TOOL_BENCHMARK_TIMER(ref2_ryan.compute(statfile), "\n\nRyan MTTKRP workspace: ", timevalue);
+  logMeanTime("ryan mttkrp workspace time: ");
+
+  // Default MTTKRP vs. workspace MTTKRP.
+  double* ref2_vals = (double*) (ref2.getTacoTensorT()->vals);
+  double* ref2_ryan_vals = (double*) (ref2_ryan.getTacoTensorT()->vals);
+  ASSERT_TRUE(valuesMatch(ref2_vals, ref2_ryan_vals, B.getDimension(0) * jDim));
+
+  // ref3 reads ref2, so it is computed only after ref2 above.
+  TOOL_BENCHMARK_TIMER(ref3.compute(statfile), "\n\nGeMM time: ", timevalue);
+  logMeanTime("GeMM time: ");
+
+  TOOL_BENCHMARK_TIMER(ref.compute(statfile), "\n\nReference MTTKRP+GEMM: ", timevalue);
+  logMeanTime("reference asymptotic blowup time: ");
+
+  // Two-kernel pipeline (ref2 then ref3) vs. the single unfused kernel.
+  double* ref3_vals = (double*) (ref3.getTacoTensorT()->vals);
+  double* ref_vals = (double*) (ref.getTacoTensorT()->vals);
+  ASSERT_TRUE(valuesMatch(ref3_vals, ref_vals, B.getDimension(0) * mDim));
+
+  TOOL_BENCHMARK_TIMER(A.compute(statfile), "\n\nFused MTTKRP+GEMM: ", timevalue);
+  logMeanTime("fused mttkrp+gemm time: ");
+
+  if (statfile.is_open()) {
+    statfile.close();
+  }
+
+  // Fused kernel vs. the single unfused kernel.
+  double* A_vals = (double*) (A.getTacoTensorT()->vals);
+  ASSERT_TRUE(valuesMatch(A_vals, ref_vals, B.getDimension(0) * mDim));
+}
+
+TEST(scheduling_eval, ttmFusedWithSyntheticData) { + if (should_use_CUDA_codegen()) { + return; + } + + taco_set_num_threads(NUM_THREADS_TO_USE); + + std::default_random_engine gen(0); + std::uniform_real_distribution unif(0.0, 1.0); + Format csf({Sparse,Sparse,Sparse}); + Format custom({Sparse,Sparse,Dense}); + Format rm({Dense,Dense}); + + int NUM_I = 5; + int NUM_J = 5; + int NUM_K = 5; + int NUM_L = 64; + int
NUM_M = 1024; + float SPARSITY = .1; + + Tensor B("B", {NUM_I, NUM_J, NUM_K}, csf); + srand(549694); + for (int i = 0; i < NUM_I; i++) { + for (int j = 0; j < NUM_J; j++) { + for (int k = 0; k < NUM_K; k++) { + float rand_float = (float) rand() / (float) (RAND_MAX); + if (rand_float < SPARSITY) { + B.insert({i, j, k}, (double) ((int) (rand_float * 3 / SPARSITY))); + } + } + } + } + B.pack(); + write("/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/synthetic.tns", B); + + // Generate a random dense matrix and store it in row-major (dense) format. + // Matrices correspond to order-2 tensors in taco. + Tensor C({B.getDimension(2), NUM_L}, rm); + for (int i = 0; i < C.getDimension(0); ++i) { + for (int j = 0; j < C.getDimension(1); ++j) { + C.insert({i,j}, unif(gen)); + } + } + C.pack(); + + // Generate another random dense matrix and store it in row-major format. + Tensor D({NUM_L, NUM_M}, rm); + for (int i = 0; i < D.getDimension(0); ++i) { + for (int j = 0; j < D.getDimension(1); ++j) { + D.insert({i,j}, unif(gen)); + } + } + D.pack(); + + Tensor A({B.getDimension(0), B.getDimension(1), NUM_M}, custom); + Tensor ref({B.getDimension(0), B.getDimension(1), NUM_M}, custom); + + // Define the MTTKRP computation using index notation. 
+ IndexVar i, j, k, l, m; + A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m); + + IndexStmt stmt = makeReductionNotation(A.getAssignment()); + stmt = makeConcreteNotation(stmt); + printToFile("fusedTTMTTKRPConcrete", stmt); + + stmt = reorderLoopsTopologically(stmt); + printToFile("fusedTTMOrdered", stmt); + + stmt = loopFusionOverFission(stmt, A.getAssignment(), "b", 1); + printToFile("fusedTTMFused", stmt); + + stmt = insertTemporaries(stmt); + printToFile("fusedTTMWithTemps", stmt); + stmt = parallelizeOuterLoop(stmt); + printToFile("fusedTTMFinal", stmt); + + + // At this point, we have defined how entries in the output matrix should be + // computed from entries in the input tensor and matrices but have not actually + // performed the computation yet. To do so, we must first tell taco to generate + // code that can be executed to compute the MTTKRP operation. + A.compile(stmt); + // We can now call the functions taco generated to assemble the indices of the + // output matrix and then actually compute the MTTKRP. 
+ A.assemble(); + + + ref(i,j,m) = B(i,j,k) * C(k,l) * D(l,m); + IndexStmt refStmt = makeReductionNotation(ref.getAssignment()); + refStmt = makeConcreteNotation(refStmt); + refStmt = insertTemporaries(refStmt); + refStmt = parallelizeOuterLoop(refStmt); + printToFile("tacoFusedTTM", refStmt); + ref.compile(refStmt); + ref.assemble(); + + Tensor ref1({B.getDimension(0), B.getDimension(1), NUM_L}, custom); + ref1(i,j,l) = B(i,j,k) * C(k,l); + IndexStmt ref1Stmt = makeReductionNotation(ref1.getAssignment()); + ref1Stmt = makeConcreteNotation(ref1Stmt); + ref1Stmt = insertTemporaries(ref1Stmt); + ref1Stmt = parallelizeOuterLoop(ref1Stmt); + ref1.compile(ref1Stmt); + ref1.assemble(); + + Tensor ref2({B.getDimension(0), B.getDimension(1), NUM_M}, custom); + ref2(i,j,m) = ref1(i,j,l) * D(l,m); + IndexStmt ref2Stmt = makeReductionNotation(ref2.getAssignment()); + ref2Stmt = makeConcreteNotation(ref2Stmt); + ref2Stmt = insertTemporaries(ref2Stmt); + ref2Stmt = parallelizeOuterLoop(ref2Stmt); + ref2.compile(ref2Stmt); + ref2.assemble(); + + Tensor ref3({B.getDimension(2), NUM_M}, rm); + ref3(k,m) = C(k,l) * D(l,m); + IndexStmt ref3Stmt = makeReductionNotation(ref3.getAssignment()); + ref3Stmt = makeConcreteNotation(ref3Stmt); + ref3Stmt = insertTemporaries(ref3Stmt); + ref3Stmt = parallelizeOuterLoop(ref3Stmt); + ref3.compile(ref3Stmt); + ref3.assemble(); + + Tensor ref4({B.getDimension(0), B.getDimension(1), NUM_M}, custom); + ref4(i,j,m) = B(i,j,k) * ref3(k,m); + IndexStmt ref4Stmt = makeReductionNotation(ref4.getAssignment()); + ref4Stmt = makeConcreteNotation(ref4Stmt); + ref4Stmt = insertTemporaries(ref4Stmt); + ref4Stmt = parallelizeOuterLoop(ref4Stmt); + ref4.compile(ref4Stmt); + ref4.assemble(); + + std::cout << "compute start\n"; + taco::util::TimeResults timevalue; + bool time = true; + // TOOL_BENCHMARK_TIMER(ref.compute(), "\n\nReference ISPC: ", timevalue); + TOOL_BENCHMARK_TIMER(A.compute(), "\n\nFused TTM->TTM: ", timevalue); + 
TOOL_BENCHMARK_TIMER(ref.compute(), "\n\nReference TTM->TTM: ", timevalue); + TOOL_BENCHMARK_TIMER(ref1.compute(), "\n\nTTM1: ", timevalue); + TOOL_BENCHMARK_TIMER(ref2.compute(), "\n\nTTM1: ", timevalue); + TOOL_BENCHMARK_TIMER(ref3.compute(), "\n\ndense: ", timevalue); + TOOL_BENCHMARK_TIMER(ref4.compute(), "\n\nTTM after dense: ", timevalue); + ASSERT_TENSOR_EQ(ref, A); + ASSERT_TENSOR_EQ(ref, ref2); + ASSERT_TENSOR_EQ(ref, ref4); + + for (int q = 0; q < A.getDimension(0); ++q) { + for (int w = 0; w < A.getDimension(1); ++w) { + for (int z = 0; z < A.getDimension(2); ++z) { + // std::cout << "(" << q << "," << w << "," << z << ")" + // << "a: " << A(q,w,z) << ", ref: " << ref(q,w,z) << std::endl; + if ( abs(A(q,w,z) - ref(q,w,z))/abs(ref(q,w,z)) > ERROR_MARGIN) { + std::cout << "error: results don't match A: " + << A(q,w,z) << ", ref: " << ref(q,w,z) << std::endl; + ASSERT_TRUE(false); + } + } + } + } + +} + +TEST(scheduling_eval, ttmFused) { + if (should_use_CUDA_codegen()) { + return; + } + + int retval, EventSet = PAPI_NULL; + retval = PAPI_hl_region_begin("dummy"); + if ( retval != PAPI_OK ) handle_error(1); + + retval = PAPI_hl_region_end("dummy"); + if ( retval != PAPI_OK ) handle_error(1); + + taco_set_num_threads(NUM_THREADS_TO_USE); + + ofstream statfile; + statfile.open( + "/home/min/a/kadhitha/workspace/my_taco/taco/test/stats/ttm-ttm.txt", std::ios::app); + if (statfile.is_open()) { + statfile << "\nttm-ttm execution\n"; + statfile << "\n-----------------------------------------\n"; + } + + std::default_random_engine gen(0); + std::uniform_real_distribution unif(0.0, 1.0); + Format csf({Dense,Sparse,Sparse}); + Format custom({Dense,Sparse,Dense}); + Format rm({Dense,Dense}); + int ldim = 32; + int mdim = 64; + + int64_t dummy_array_size = 2e6; + int64_t* dummy_array_to_flush_cache = (int64_t*) malloc(dummy_array_size*sizeof(int64_t)); + + vector matfilenums = {5}; + + for (auto matfilenum : matfilenums) { + + // int matfilenum = 0; + + + + // Load a 
sparse order-3 tensor from file (stored in the FROSTT format) and + // store it as a compressed sparse fiber tensor. The tensor in this example + // can be download from: http://frostt.io/tensors/nell-2/ + std::vector matfiles = { + "/home/min/a/kadhitha/ispc-examples/data/tns/matmul_5-5-5.tns", + "/home/min/a/kadhitha/ispc-examples/data/tns/delicious-3d.tns", + "/home/min/a/kadhitha/ispc-examples/data/tns/flickr-3d.tns", // 2 + "/home/min/a/kadhitha/ispc-examples/data/tns/nell-2.tns", // 3 + "/home/min/a/kadhitha/ispc-examples/data/tns/nell-1.tns", // 4 + "/home/min/a/kadhitha/workspace/my_taco/tns/vast-2015-mc1-3d.tns", // 5 + "/home/min/a/kadhitha/workspace/my_taco/tns/darpa1998.tns", // 6 + "/home/min/a/kadhitha/ispc-examples/data/tns/freebase_music.tns", + "/home/min/a/kadhitha/ispc-examples/data/tns/freebase_sampled.tns" + }; + std::vector matfilesrw = { + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/matmul_5-5-5.tns", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/delicious-3d.tns", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/flickr-3d.tns", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/nell-2.tns", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/nell-1.tns", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/vast-2015-mc1-3d.tns", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/darpa1998.tns", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/freebase_music.tns", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/freebase_sampled.tns" + }; + statfile << "\nfile: " << matfiles[matfilenum] << std::endl; + statfile << "----------------------------------------------------------------\n"; + + std::string matfile = matfiles[matfilenum]; + Tensor B = read(matfile, csf); + B.setName("B"); + B.pack(); + // write(matfilesrw[matfilenum], B); + + // Generate a random dense matrix and store it in row-major (dense) format. + // Matrices correspond to order-2 tensors in taco. 
+ Tensor C("C", {B.getDimension(2), ldim}, rm); + for (int i = 0; i < C.getDimension(0); ++i) { + for (int j = 0; j < C.getDimension(1); ++j) { + C.insert({i,j}, unif(gen)); + } + } + C.pack(); + + // Generate another random dense matrix and store it in row-major format. + Tensor D("D", {ldim, mdim}, rm); + for (int i = 0; i < D.getDimension(0); ++i) { + for (int j = 0; j < D.getDimension(1); ++j) { + D.insert({i,j}, unif(gen)); + } + } + D.pack(); + + if (statfile.is_open()) { + statfile + << matfile << std::endl + << "A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m)" << std::endl + << "B1_dimension: " << B.getDimension(0) << ", B2_dimension: " << B.getDimension(1) << ", B3_dimension: " << B.getDimension(2) << ", vals: " << B.getStorage().getValues().getSize() << std::endl + << "C1_dimension: " << C.getDimension(0) << ", C2_dimension: " << C.getDimension(1) << ", vals: " << C.getStorage().getValues().getSize() << std::endl + << "D1_dimension: " << D.getDimension(0) << ", D2_dimension: " << D.getDimension(1) << ", vals: " << D.getStorage().getValues().getSize() << std::endl + << std::endl; + } + + Tensor A({B.getDimension(0), B.getDimension(1), mdim}, custom); + Tensor ref({B.getDimension(0), B.getDimension(1), mdim}, custom); + Tensor refn({B.getDimension(0), B.getDimension(1), mdim}, custom); + + // Define the MTTKRP computation using index notation. 
+ IndexVar i, j, k, l, m; + IndexVar i0,i1, j0, j1, k0, k1, l0, l1, m0, m1; + A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m); + + + IndexStmt stmt = makeReductionNotation(A.getAssignment()); + stmt = makeConcreteNotation(stmt); + stmt = reorderLoopsTopologically(stmt); + stmt = loopFusionOverFission(stmt, A.getAssignment(), "b", 1); + stmt = stmt.split(i, i0, i1, 16); + stmt = insertTemporaries(stmt); + stmt = parallelizeOuterLoop(stmt); + printToFile("fusedTTMFinal", stmt); + + A.compile(stmt); + A.assemble(); + + + ref(i,j,m) = B(i,j,k) * C(k,l) * D(l,m); // TTM->TTM TACO + IndexStmt refStmt = makeReductionNotation(ref.getAssignment()); + refStmt = makeConcreteNotation(refStmt); + refStmt = refStmt + .split(i, i0, i1, 16); + refStmt = insertTemporaries(refStmt); + refStmt = parallelizeOuterLoop(refStmt); + printToFile("tacoFusedTTM", refStmt); + ref.compile(refStmt); + ref.assemble(); + + refn(i,j,m) = B(i,j,k) * C(k,l) * D(l,m); // TTM->TTM TACO + IndexStmt refnStmt = makeReductionNotation(refn.getAssignment()); + refnStmt = makeConcreteNotation(refnStmt); + refnStmt = refnStmt + .split(i, i0, i1, 16) + .reorder({i0, i1, j, k, l, m}); + refnStmt = insertTemporaries(refnStmt); + refnStmt = parallelizeOuterLoop(refnStmt); + printToFile("tacoFusedTTM", refnStmt); + refn.compile(refnStmt); + refn.assemble(); + + Tensor ref1({B.getDimension(0), B.getDimension(1), ldim}, custom); + ref1(i,j,l) = B(i,j,k) * C(k,l); // TTM1 + IndexStmt ref1Stmt = makeReductionNotation(ref1.getAssignment()); + ref1Stmt = makeConcreteNotation(ref1Stmt); + // ref1Stmt = ref1Stmt.split(i, i0, i1, 16); + ref1Stmt = insertTemporaries(ref1Stmt); + ref1Stmt = parallelizeOuterLoop(ref1Stmt); + ref1.compile(ref1Stmt); + ref1.assemble(); + + Tensor ref2({B.getDimension(0), B.getDimension(1), mdim}, custom); + ref2(i,j,m) = ref1(i,j,l) * D(l,m); // TTM2 + IndexStmt ref2Stmt = makeReductionNotation(ref2.getAssignment()); + ref2Stmt = makeConcreteNotation(ref2Stmt); + // ref2Stmt = ref2Stmt.split(i, i0, i1, 
16); + ref2Stmt = insertTemporaries(ref2Stmt); + ref2Stmt = parallelizeOuterLoop(ref2Stmt); + ref2.compile(ref2Stmt); + ref2.assemble(); + + Tensor ref3({B.getDimension(2), mdim}, rm); + ref3(k,m) = C(k,l) * D(l,m); // GeMM + IndexStmt ref3Stmt = makeReductionNotation(ref3.getAssignment()); + ref3Stmt = makeConcreteNotation(ref3Stmt); + ref3Stmt = ref3Stmt + .split(k, k0, k1, 32) + .split(l, l0, l1, 32) + .split(m, m0, m1, 32) + .reorder({k0, l0, m0, k1, l1, m1}); + ref3Stmt = insertTemporaries(ref3Stmt); + ref3Stmt = parallelizeOuterLoop(ref3Stmt); + ref3.compile(ref3Stmt); + ref3.assemble(); + + Tensor ref4({B.getDimension(0), B.getDimension(1), mdim}, custom); + ref4(i,j,m) = B(i,j,k) * ref3(k,m); // TTM1 + IndexStmt ref4Stmt = makeReductionNotation(ref4.getAssignment()); + ref4Stmt = makeConcreteNotation(ref4Stmt); + // ref4Stmt = ref4Stmt + // .split(i, i0, i1, 16); + // // .split(k, k0, k1, 16) + // .split(m, m0, m1, 16) + // .reorder({i0, i1, j, m0, k, m1}); + ref4Stmt = insertTemporaries(ref4Stmt); + ref4Stmt = parallelizeOuterLoop(ref4Stmt); + ref4.compile(ref4Stmt); + ref4.assemble(); + + std::cout << "compute start\n"; + taco::util::TimeResults timevalue; + bool time = true; + + int r = rand(); + for (int64_t i=0; iTTM: ", timevalue); + retval = PAPI_hl_region_end("fusedTTM"); if ( retval != PAPI_OK ) handle_error(1); + if (statfile.is_open()) { + statfile << "fused time: "; + statfile << timevalue.mean << std::endl; + } else { std::cout << " stat file is not open\n"; } + + r = rand(); + for (int64_t i=0; iTTM: ", timevalue); + retval = PAPI_hl_region_end("referenceTTM"); if ( retval != PAPI_OK ) handle_error(1); + if (statfile.is_open()) { + statfile << "reference time: "; + statfile << timevalue.mean << std::endl; + } else { std::cout << " stat file is not open\n"; } + + r = rand(); + for (int64_t i=0; iTTM: ", timevalue); + retval = PAPI_hl_region_end("ref2TTM"); if ( retval != PAPI_OK ) handle_error(1); + if (statfile.is_open()) { + statfile << 
"reference new time: "; + statfile << timevalue.mean << std::endl; + } else { std::cout << " stat file is not open\n"; } + + statfile << "\nschedule 1\n"; + + r = rand(); + for (int64_t i=0; ivals); + double* ref_vals = (double*) (ref.getTacoTensorT()->vals); + double* ref2_vals = (double*) (ref2.getTacoTensorT()->vals); + double* ref4_vals = (double*) (ref4.getTacoTensorT()->vals); + + // int* A2_pos = (double*) (ref.getTacoTensorT()->vals); + + // for (size_t q=0; q < B.getStorage().getValues().getSize(); q++) { + // if ( abs(A_vals[q] - ref_vals[q])/abs(ref_vals[q]) > ERROR_MARGIN) { + // std::cout << "error: results don't match i: " << q << ", avals: " << A_vals[q] << " " + // << "refvals: " << ref_vals[q] << std::endl; + // ASSERT_TRUE(false); + // } + // } + + // std::cout << "our fused vs taco original fused check\n"; + // for (size_t q=0; q < A.getStorage().getValues().getSize(); q++) { + // if ( abs(A_vals[q] - ref_vals[q])/abs(ref_vals[q]) > ERROR_MARGIN) { + // std::cout << "error: results don't match i: " << q << ", avals: " << A_vals[q] << " " + // << "refvals: " << ref_vals[q] << std::endl; + // ASSERT_TRUE(false); + // } + // } + // std::cout << "taco original fused vs TTM1, TTM2 check\n"; + // for (size_t q=0; q < A.getStorage().getValues().getSize(); q++) { + // if ( abs(ref_vals[q] - ref2_vals[q])/abs(ref2_vals[q]) > ERROR_MARGIN) { + // std::cout << "error: results don't match i: " << q << ", avals: " << ref_vals[q] << " " + // << "refvals: " << ref2_vals[q] << std::endl; + // ASSERT_TRUE(false); + // } + // } + // std::cout << "taco original fused vs GeMM, TTM1 check\n"; + // for (size_t q=0; q < A.getStorage().getValues().getSize(); q++) { + // if ( abs(ref_vals[q] - ref4_vals[q])/abs(ref4_vals[q]) > ERROR_MARGIN) { + // std::cout << "error: results don't match i: " << q << ", avals: " << ref_vals[q] << " " + // << "refvals: " << ref4_vals[q] << std::endl; + // ASSERT_TRUE(false); + // } + // } + + } // end of forloop + + if 
(statfile.is_open()) { + statfile.close(); + } + +}
+
+
+// Checks the loopFusionOverFission schedule of
+//   A(i,l) = B(i,j) * C(j,k) * D(k,l)
+// against the same expression compiled without it, on synthetic inputs:
+// B (NUM_I x NUM_J) and C (NUM_J x NUM_K) are CSR matrices filled from rand()
+// with a fixed seed, and D is a fully populated matrix drawn from unif(gen).
+// The test returns immediately for the CUDA and ISPC code generators.
+TEST(scheduling_eval, spmmFusedWithSyntheticData) {
+  if (should_use_CUDA_codegen() || should_use_ISPC_codegen()) {
+    return;
+  }
+
+  taco_set_num_threads(NUM_THREADS_TO_USE);
+
+  std::default_random_engine gen(0);
+  std::uniform_real_distribution<double> unif(0.0, 1.0);
+
+  Format csr({dense, sparse});
+  Format rm({dense, dense});
+  int ldim = 32;
+  // Referenced only by the commented-out experiment further down.
+  int kdim = 64;
+
+  // The inputs are generated below rather than read from an .mtx file; the
+  // progress messages are kept as they were.
+  std::cout << "reading B mat mtx\n";
+
+  int NUM_I = 128;
+  int NUM_J = 96;
+  int NUM_K = 64;
+  float SPARSITY = .3;
+
+  // Sparse operand B: each entry is kept with probability SPARSITY and holds a
+  // small integer value derived from the same random draw.
+  Tensor<double> B("B", {NUM_I, NUM_J}, csr);
+  srand(75883);
+  for (int i = 0; i < NUM_I; i++) {
+    for (int j = 0; j < NUM_J; j++) {
+      float rand_float = (float)rand()/(float)(RAND_MAX);
+      if (rand_float < SPARSITY) {
+        B.insert({i, j}, (double) ((int) (rand_float*3/SPARSITY)));
+      }
+    }
+  }
+  B.pack();
+
+  // Sparse operand C, filled with the same scheme so that both sparse inputs
+  // carry data.
+  Tensor<double> C("C", {NUM_J, NUM_K}, csr);
+  for (int j = 0; j < NUM_J; j++) {
+    for (int k = 0; k < NUM_K; k++) {
+      float rand_float = (float)rand()/(float)(RAND_MAX);
+      if (rand_float < SPARSITY) {
+        C.insert({j, k}, (double) ((int) (rand_float*3/SPARSITY)));
+      }
+    }
+  }
+  C.pack();
+  // write("/home/min/a/kadhitha/ispc-examples/data/suitesparse/synthetic/synthetic.mtx", B);
+
+  std::cout << "B dim0: " << B.getDimension(0) << ", dim1: " << B.getDimension(1) << std::endl;
+  std::cout << "adding c mat\n";
+  // Dense operand D: one row per column of C, ldim columns.
+  Tensor<double> D({C.getDimension(1), ldim}, rm);
+  for (int i = 0; i < D.getDimension(0); ++i) {
+    for (int j = 0; j < D.getDimension(1); ++j) {
+      D.insert({i,j}, unif(gen));
+    }
+  }
+  std::cout << "packing C mat\n";
+  D.pack();
+
+  // Tensor E({B.getDimension(1), kdim}, rm);
+  // for (int i = 0; i < D.getDimension(0); ++i) {
+  //   for (int j = 0; j < D.getDimension(1); ++j) {
+  //     D.insert({i,j}, unif(gen));
+  //   }
+  // }
+  // std::cout << "packing D mat\n";
+  // D.pack();
+
+  // Tensor F({B.getDimension(1), ldim}, rm);
+  // for (int i = 0; i < F.getDimension(0); ++i) {
+  //   for (int j = 0; j < F.getDimension(1); ++j) {
+  //     F.insert({i,j}, unif(gen));
+  //   }
+  // }
+  // std::cout << "packing F mat\n";
+  // F.pack();
+
+  Tensor<double> A({B.getDimension(0), ldim}, rm);
+  Tensor<double> ref({B.getDimension(0), ldim}, rm);
+  IndexVar i, j, k, l;
+
+  // Fused schedule under test; every intermediate statement is dumped through
+  // printToFile.
+  A(i,l)=B(i,j)*C(j,k)*D(k,l);
+
+  // IndexStmt stmt = A.getAssignment().concretize();
+  IndexStmt stmt = makeReductionNotation(A.getAssignment());
+  stmt = makeConcreteNotation(stmt);
+  printToFile("fusedMMConcrete", stmt);
+
+  stmt = reorderLoopsTopologically(stmt);
+  printToFile("fusedMMOrdered", stmt);
+
+  stmt = loopFusionOverFission(stmt, A.getAssignment(), "b", 1);
+  printToFile("fusedMMFused", stmt);
+
+  stmt = insertTemporaries(stmt);
+  printToFile("fusedMMWithTemps", stmt);
+  stmt = parallelizeOuterLoop(stmt);
+  printToFile("fusedMMFusedPar", stmt);
+
+  // Generate the kernel, then assemble the output's index structures.
+  A.compile(stmt);
+  A.assemble();
+
+
+  // Same expression without the fusion transformation.
+  ref(i,l)=B(i,j)*C(j,k)*D(k,l);
+  IndexStmt refStmt = makeReductionNotation(ref.getAssignment());
+  refStmt = makeConcreteNotation(refStmt);
+  refStmt = insertTemporaries(refStmt);
+  refStmt = parallelizeOuterLoop(refStmt);
+  ref.compile(refStmt);
+  ref.assemble();
+
+  // Tensor ref1({B.getDimension(0), B.getDimension(1)}, csr);
+  // Tensor ref2({B.getDimension(0), ldim}, rm);
+  // ref1(i,j)=B(i,j)*C(i,k)*D(j,k);
+  // ref2(i,l)=ref1(i,j)*F(j,l);
+
+  // IndexStmt ref1Stmt = makeReductionNotation(ref1.getAssignment());
+  // ref1Stmt = makeConcreteNotation(ref1Stmt);
+  // ref1Stmt = insertTemporaries(ref1Stmt);
+  // ref1Stmt = parallelizeOuterLoop(ref1Stmt);
+  // ref1.compile(ref1Stmt);
+  // ref1.assemble();
+
+  // IndexStmt ref2Stmt = makeReductionNotation(ref2.getAssignment());
+  // ref2Stmt = makeConcreteNotation(ref2Stmt);
+  // ref2Stmt = insertTemporaries(ref2Stmt);
+  // ref2Stmt = parallelizeOuterLoop(ref2Stmt);
+  // ref2.compile(ref2Stmt);
+  // ref2.assemble();
+
+  std::cout << "compute start\n";
+  taco::util::TimeResults timevalue;
+  // NOTE(review): `time` is not referenced directly in this test; presumably
+  // TOOL_BENCHMARK_TIMER reads it. Confirm before removing.
+  bool time = true;
+  TOOL_BENCHMARK_TIMER(ref.compute(), "\n\nReference Kernel: ", timevalue);
+  TOOL_BENCHMARK_TIMER(A.compute(), "\n\nFused Kernel: ", timevalue);
+
+  // Compare every output entry by relative error against the reference.
+  // Where both values are exactly zero the ratio is NaN, which never exceeds
+  // the margin, so such entries count as matching.
+  for (int q = 0; q < A.getDimension(0); ++q) {
+    for (int w = 0; w < A.getDimension(1); ++w) {
+      if ( abs(A(q,w) - ref(q,w))/abs(ref(q,w)) > ERROR_MARGIN) {
+        std::cout << "error: results don't match A("<< q << "," << w << "): "
+                  << A(q,w) << ", ref: " << ref(q,w) << std::endl;
+        ASSERT_TRUE(false);
+      }
+    }
+  }
+  // // ASSERT_TENSOR_EQ(A, ref);
+  // TOOL_BENCHMARK_TIMER(ref1.compute(), "\n\nSDDMM Kernel: ", timevalue);
+  // TOOL_BENCHMARK_TIMER(ref2.compute(), "\n\nSpMM Kernel: ", timevalue);
+
+  // for (int q = 0; q < ref2.getDimension(0); ++q) {
+  //   for (int w = 0; w < ref2.getDimension(1); ++w) {
+  //     if ( abs(ref2(q,w) - ref(q,w))/abs(ref(q,w)) > ERROR_MARGIN) {
+  //       std::cout << "error: results don't match A("<< q << "," << w << "): "
+  //                 << ref2(q,w) << ", ref: " << ref(q,w) << std::endl;
+  //       ASSERT_TRUE(false);
+  //     }
+  //   }
+  // }
+
+}
+
+
+TEST(scheduling_eval, spmmFused) { + if (should_use_CUDA_codegen() || should_use_ISPC_codegen()) { + return; + } + + // int retval, EventSet = PAPI_NULL; + // retval = PAPI_hl_region_begin("dummy"); + // if ( retval != PAPI_OK ) handle_error(1); + + /* Do some computation */ + + // retval = PAPI_hl_region_end("dummy"); + // if ( retval != PAPI_OK ) handle_error(1); + + taco_set_num_threads(NUM_THREADS_TO_USE); + + ofstream statfile; + statfile.open( + "/home/min/a/kadhitha/workspace/my_taco/taco/test/stats/spmm-gemm.txt", std::ios::app); + if (statfile.is_open()) { + statfile << "\nspmm-spmm execution\n"; + statfile << "\n-----------------------------------------\n"; + } + + std::default_random_engine gen(0); + std::uniform_real_distribution unif(0.0, 1.0); + + Format csr({dense, sparse}); + Format rm({dense, dense}); + int kdim = 128; + int ldim = 64; + + // vector filenums =
{2,3,4,5,6,7,8,9,10,12,15}; + vector filenums = {0}; + + for (auto filenum : filenums) { + + + statfile << "filenum: " << filenum << std::endl; + statfile << "---------------------------------\n"; + // int filenum = 7; + + std::vector matfiles = { + "/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx", + "/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/amazon.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/synthetic/synthetic.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx", // 2 + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rma10/rma10.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/cant/cant.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/consph/consph.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/shipsec1/shipsec1.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/scircuit/scircuit.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx", // 10 + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/wtk/pwtk.mtx", + "/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx", // 12 + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/wiki-Talk/wiki-Talk.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/com-Orkut/com-Orkut.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx", // 15 + "/home/min/a/kadhitha/workspace/my_taco/FusedMM/dataset/harvard.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/twitter7/twitter7.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k.mtx", + }; + std::vector matfilesrw = { + 
"/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/rw/cora.mtx", + "/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/rw/amazon.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/synthetic.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/cage3.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/bcsstk17.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/pdb1HYS.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/rma10.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/cant.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/consph.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/cop20k_A.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/shipsec1.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/scircuit.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/wtk/pwtk.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/webbase-1M.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/wiki-Talk.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/com-Orkut.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/circuit5M.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/harvard.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/twitter7.mtx" + }; + + std::string matfile = matfiles[filenum]; + std::cout << "reading B mat mtx\n"; + Tensor B = read(matfile, csr); + B.pack(); + // write(matfilesrw[filenum], B); + + if (statfile.is_open()) { + statfile << matfile << std::endl; + } + + std::cout << "B dim0: " << B.getDimension(0) << ", dim1: " << B.getDimension(1) << std::endl; + std::cout << "adding c mat\n"; + // Tensor C = read(matfiles2[filenum], csr, true); + // std::cout << "packing C mat\n"; + + std::cout << "B dim0: " << B.getDimension(0) << ", dim1: " << 
B.getDimension(1) << std::endl; + std::cout << "adding c mat\n"; + Tensor C("C", {B.getDimension(1), kdim}, rm); + for (int i = 0; i < C.getDimension(0); ++i) { + for (int j = 0; j < C.getDimension(1); ++j) { + C.insert({i,j}, unif(gen)); + } + } + std::cout << "packing C mat\n"; + C.pack(); + + Tensor D({C.getDimension(1), ldim}, rm); + for (int i = 0; i < D.getDimension(0); ++i) { + for (int j = 0; j < D.getDimension(1); ++j) { + D.insert({i,j}, unif(gen)); + } + } + std::cout << "packing D mat\n"; + D.pack(); + + // Tensor F({B.getDimension(1), ldim}, rm); + // for (int i = 0; i < F.getDimension(0); ++i) { + // for (int j = 0; j < F.getDimension(1); ++j) { + // F.insert({i,j}, unif(gen)); + // } + // } + // std::cout << "packing F mat\n"; + // F.pack(); + + Tensor A({B.getDimension(0), ldim}, rm); + Tensor ref({B.getDimension(0), ldim}, rm); + Tensor refn({B.getDimension(0), ldim}, rm); + IndexVar i, j, k, l; + IndexVar i0, i1, j0, j1, k0, k1, l0, l1; + + A(i,l)=B(i,j)*C(j,k)*D(k,l); + if (statfile.is_open()) { + statfile /* label mirrors the expression assigned to A/ref in this test */ + << "ref(i,l)=B(i,j)*C(j,k)*D(k,l);" << std::endl + << "B1_dimension: " << B.getDimension(0) << ", B2_dimension: " << B.getDimension(1) << ", vals: " << B.getStorage().getValues().getSize() << std::endl + << "C1_dimension: " << C.getDimension(0) << ", C2_dimension: " << C.getDimension(1) << ", vals: " << C.getStorage().getValues().getSize() << std::endl + << "D1_dimension: " << D.getDimension(0) << ", D2_dimension: " << D.getDimension(1) << ", vals: " << D.getStorage().getValues().getSize() << std::endl + // << "E1_dimension: " << F.getDimension(0) << ", E2_dimension: " << F.getDimension(1) << ", vals: " << F.getStorage().getValues().getSize() << std::endl + << std::endl; + } + + // IndexStmt stmt = A.getAssignment().concretize(); + IndexStmt stmt = makeReductionNotation(A.getAssignment()); + stmt = makeConcreteNotation(stmt); + stmt = reorderLoopsTopologically(stmt); + stmt = loopFusionOverFission(stmt, A.getAssignment(), "b", 1); + stmt =
stmt.split(i, i0, i1, 16); + stmt = insertTemporaries(stmt); + stmt = parallelizeOuterLoop(stmt); + + A.compile(stmt); + A.assemble(); + + + ref(i,l)=B(i,j)*C(j,k)*D(k,l); + refn(i,l)=B(i,j)*C(j,k)*D(k,l); + // IndexStmt refStmt = ref.getAssignment().concretize(); + + // ref1Stmt = ref1Stmt.split(i, i0, i1, 16); + // .pos(j, jpos, B(i,j)); + // .split(k, k0, k1, 8); + // .reorder({i0, i1, jpos0, k, jpos1}); + // .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces) + // .parallelize(jpos1, ParallelUnit::CPUVector, OutputRaceStrategy::ParallelReduction); + IndexStmt refStmt = makeReductionNotation(ref.getAssignment()); + refStmt = makeConcreteNotation(refStmt); + refStmt = refStmt + .split(i, i0, i1, 16) + .split(k, k0, k1, 32) + .split(l, l0, l1, 32) + .reorder({i0, i1, j, k0, l0, k1, l1}); + refStmt = insertTemporaries(refStmt); + refStmt = parallelizeOuterLoop(refStmt); + ref.compile(refStmt); + ref.assemble(); + + IndexStmt refnStmt = makeReductionNotation(refn.getAssignment()); + refnStmt = makeConcreteNotation(refnStmt); + refnStmt = refnStmt + .split(i, i0, i1, 16); + refnStmt = insertTemporaries(refnStmt); + refnStmt = parallelizeOuterLoop(refnStmt); + refn.compile(refnStmt); + refn.assemble(); + + // SpMM , GEMM + + Tensor ref1({B.getDimension(0), kdim}, rm); + Tensor ref2({B.getDimension(0), ldim}, rm); + Tensor ref2_2({B.getDimension(0), ldim}, rm); + + ref1(i,k)=B(i,j)*C(j,k); + ref2(i,l)=ref1(i,k)*D(k,l); + ref2_2(i,l)=ref1(i,k)*D(k,l); + + IndexStmt ref1Stmt = makeReductionNotation(ref1.getAssignment()); + ref1Stmt = makeConcreteNotation(ref1Stmt); + ref1Stmt = insertTemporaries(ref1Stmt); + ref1Stmt = parallelizeOuterLoop(ref1Stmt); + ref1.compile(ref1Stmt); + ref1.assemble(); + + IndexStmt ref2Stmt = makeReductionNotation(ref2.getAssignment()); + ref2Stmt = makeConcreteNotation(ref2Stmt); + ref2Stmt = insertTemporaries(ref2Stmt); + ref2Stmt = ref2Stmt.split(i, i0, i1, 16); + ref2Stmt = parallelizeOuterLoop(ref2Stmt); + 
ref2.compile(ref2Stmt); + ref2.assemble(); + + IndexStmt ref2Stmt2 = makeReductionNotation(ref2_2.getAssignment()); + ref2Stmt2 = makeConcreteNotation(ref2Stmt2); + ref2Stmt2 = ref2Stmt2 + .split(i, i0, i1, 32) + .split(k,k0,k1, 32) + .split(l, l0, l1, 32) + .reorder({i0, k0, l0, i1, k1, l1}) + .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces); /* i0 is the outermost loop after the reorder; j0 is not produced by any split of this statement */ + ref2Stmt2 = insertTemporaries(ref2Stmt2); + // ref2Stmt2 = parallelizeOuterLoop(ref2Stmt2); + ref2_2.compile(ref2Stmt2); + ref2_2.assemble(); + + + // -------------- GeMM and SpMM + + Tensor ref3({C.getDimension(0), ldim}, rm); + Tensor ref4({B.getDimension(0), ldim}, rm); /* ref4(i,l) is indexed by B's row variable i, so its first dimension is B's */ + ref3(j,l)=C(j,k)*D(k,l); // GEMM + ref4(i,l) = B(i,j)*ref3(j,l); // SpMM + + IndexStmt ref3Stmt = ref3.getAssignment().concretize(); + ref3Stmt = ref3Stmt + .split(j, j0, j1, 32) // changed to 32 + .split(k, k0, k1, 32) + .split(l, l0, l1, 32) + .reorder({j0, k0, l0, j1, k1, l1}) + .parallelize(j0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces); + ref3Stmt = insertTemporaries(ref3Stmt); /* operate on the statement that is compiled next, not on the already-compiled ref2Stmt2 */ + ref3.compile(ref3Stmt); + ref3.assemble(); + + IndexStmt ref4Stmt = makeReductionNotation(ref4.getAssignment()); // SpMM operation + ref4Stmt = makeConcreteNotation(ref4Stmt); + ref4Stmt = ref4Stmt.split(i, i0, i1, 16); + ref4Stmt = insertTemporaries(ref4Stmt); + ref4Stmt = parallelizeOuterLoop(ref4Stmt); + ref4.compile(ref4Stmt); + ref4.assemble(); + + + std::cout << "compute start\n"; + taco::util::TimeResults timevalue; + bool time = true; + + statfile << "\n--------- 1st pattern computation TTM, GEMM\n"; + + // retval = PAPI_hl_region_begin("spmm"); + // if ( retval != PAPI_OK ) handle_error(1); + TOOL_BENCHMARK_TIMER(ref1.compute(statfile), "\n\nSpMM Kernel: ", timevalue); + // retval = PAPI_hl_region_end("spmm"); + // if ( retval != PAPI_OK ) handle_error(1); + if (statfile.is_open()) { + statfile << "SpMM time: "; + statfile << timevalue.mean << std::endl; + } else { std::cout << " stat file is not open\n"; } + + std::string
sofile_spmm_template = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/csr_dense_spmm.so"; + // retval = PAPI_hl_region_begin("spmmtemplate"); + // if ( retval != PAPI_OK ) handle_error(1); + TOOL_BENCHMARK_TIMER(ref1.compute(statfile, sofile_spmm_template), "\n\nSpMM template Kernel: ", timevalue); + // retval = PAPI_hl_region_end("spmmtemplate"); + // if ( retval != PAPI_OK ) handle_error(1); + if (statfile.is_open()) { + statfile << "SpMM template time: "; + statfile << timevalue.mean << std::endl; + } else { std::cout << " stat file is not open\n"; } + + // retval = PAPI_hl_region_begin("gemm"); + // if ( retval != PAPI_OK ) handle_error(1); + TOOL_BENCHMARK_TIMER(ref2.compute(statfile), "\n\nGeMM Kernel: ", timevalue); + // retval = PAPI_hl_region_end("gemm"); + // if ( retval != PAPI_OK ) handle_error(1); + if (statfile.is_open()) { + statfile << "GeMM time: "; + statfile << timevalue.mean << std::endl; + } else { std::cout << " stat file is not open\n"; } + + // retval = PAPI_hl_region_begin("gemmtemplate"); + // if ( retval != PAPI_OK ) handle_error(1); + TOOL_BENCHMARK_TIMER(ref2_2.compute(statfile), "\n\nref GeMM template Kernel: ", timevalue); + // retval = PAPI_hl_region_end("gemmtemplate"); + // if ( retval != PAPI_OK ) handle_error(1); + if (statfile.is_open()) { + statfile << "ref 2 GeMM template time: "; + statfile << timevalue.mean << std::endl; + } else { std::cout << " stat file is not open\n"; } + + // std::string sofile_gemm_template = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/spmm_gemm/spmm_template.so"; + statfile << "\n--------- 2nd pattern computation GEMM, SpMM\n"; + // retval = PAPI_hl_region_begin("gemmtemplate2"); + // if ( retval != PAPI_OK ) handle_error(1); + TOOL_BENCHMARK_TIMER(ref3.compute(statfile), "\n\nGeMM template ref3 Kernel: ", timevalue); + // retval = PAPI_hl_region_end("gemmtemplate2"); + // if ( retval != PAPI_OK ) handle_error(1); + if (statfile.is_open()) { + statfile << "ref3 GeMM 
template time: "; + statfile << timevalue.mean << std::endl; + } else { std::cout << " stat file is not open\n"; } + + // retval = PAPI_hl_region_begin("spmm2"); + // if ( retval != PAPI_OK ) handle_error(1); + TOOL_BENCHMARK_TIMER(ref4.compute(statfile, sofile_spmm_template), "\n\nSpMM template Kernel ref4: ", timevalue); + // retval = PAPI_hl_region_end("spmm2"); + // if ( retval != PAPI_OK ) handle_error(1); + if (statfile.is_open()) { + statfile << "SpMM template time ref4: "; + statfile << timevalue.mean << std::endl; + } else { std::cout << " stat file is not open\n"; } + + + statfile << "\n-------- reference pattern computation\n"; + + // retval = PAPI_hl_region_begin("ref"); + // if ( retval != PAPI_OK ) handle_error(1); + TOOL_BENCHMARK_TIMER(ref.compute(statfile), "\n\nReference Kernel: ", timevalue); + // retval = PAPI_hl_region_end("ref"); + // if ( retval != PAPI_OK ) handle_error(1); + if (statfile.is_open()) { + statfile << "taco reference time: "; + statfile << timevalue << std::endl; + } else { std::cout << " stat file is not open\n"; } + + // retval = PAPI_hl_region_begin("refnew"); + // if ( retval != PAPI_OK ) handle_error(1); + TOOL_BENCHMARK_TIMER(refn.compute(statfile), "\n\nReference new Kernel: ", timevalue); + // retval = PAPI_hl_region_end("refnew"); + // if ( retval != PAPI_OK ) handle_error(1); + if (statfile.is_open()) { + statfile << "taco reference new time: "; + statfile << timevalue << std::endl; + } else { std::cout << " stat file is not open\n"; } + + + // retval = PAPI_hl_region_begin("sparselnr"); + // if ( retval != PAPI_OK ) handle_error(1); + TOOL_BENCHMARK_TIMER(A.compute(statfile), "\n\nFused Kernel: ", timevalue); + // retval = PAPI_hl_region_end("sparselnr"); + // if ( retval != PAPI_OK ) handle_error(1); + if (statfile.is_open()) { + statfile << "fused time: "; + statfile << timevalue.mean << std::endl; + } else { std::cout << " stat file is not open\n"; } + + + double* A_vals = (double*) (A.getTacoTensorT()->vals); + 
double* ref_vals = (double*) (ref.getTacoTensorT()->vals); + double* ref2_vals = (double*) (ref2.getTacoTensorT()->vals); + double* ref4_vals = (double*) (ref4.getTacoTensorT()->vals); /* verify the GEMM-then-SpMM result (ref4), not ref2 a second time */ + + // int* A2_pos = (double*) (ref.getTacoTensorT()->vals); + + // for (size_t q=0; q < B.getStorage().getValues().getSize(); q++) { + // if ( abs(A_vals[q] - ref_vals[q])/abs(ref_vals[q]) > ERROR_MARGIN) { + // std::cout << "error: results don't match i: " << q << ", avals: " << A_vals[q] << " " + // << "refvals: " << ref_vals[q] << std::endl; + // ASSERT_TRUE(false); + // } + // } + + for (int q=0; q < A.getDimension(0)* A.getDimension(1); q++) { + if ( abs(A_vals[q] - ref_vals[q])/abs(ref_vals[q]) > ERROR_MARGIN) { + std::cout << "error: results don't match i: " << q << ", avals: " << A_vals[q] << " " + << "refvals: " << ref_vals[q] << std::endl; + ASSERT_TRUE(false); + } + } + for (int q=0; q < A.getDimension(0)* A.getDimension(1); q++) { + if ( abs(A_vals[q] - ref2_vals[q])/abs(ref2_vals[q]) > ERROR_MARGIN) { + std::cout << "error: results don't match i: " << q << ", avals: " << A_vals[q] << " " + << "refvals: " << ref2_vals[q] << std::endl; + ASSERT_TRUE(false); + } + } + for (int q=0; q < A.getDimension(0)* A.getDimension(1); q++) { + if ( abs(A_vals[q] - ref4_vals[q])/abs(ref4_vals[q]) > ERROR_MARGIN) { + std::cout << "error: results don't match i: " << q << ", avals: " << A_vals[q] << " " + << "refvals: " << ref4_vals[q] << std::endl; + ASSERT_TRUE(false); + } + } + + } // end of file num for loop + + if (statfile.is_open()) { + statfile.close(); + } + + + // unsigned int native = 0x0; + + // retval = PAPI_library_init(PAPI_VER_CURRENT); + + // if (retval != PAPI_VER_CURRENT) { + // printf("PAPI library init error!\n"); + // exit(1); + // } else { + // printf("PAPI library init success\n"); + // } + + // if (PAPI_create_eventset(&EventSet) != PAPI_OK) { + // handle_error(1); + // } + + // /* Add the native event */ + // native = () + + // retval =
PAPI_hl_region_begin("computation1"); + // if ( retval != PAPI_OK ) + // handle_error(1); + + // /* Do some computation */ + + // retval = PAPI_hl_region_end("computation1"); + // if ( retval != PAPI_OK ) + // handle_error(1); + + // retval = PAPI_hl_region_begin("computation2"); + // if ( retval != PAPI_OK ) + // handle_error(1); + + // /* Do some computation */ + + // retval = PAPI_hl_region_end("computation2"); + // if ( retval != PAPI_OK ) + // handle_error(1); +} + + + + + + +TEST(scheduling_eval, sddmmspmmFused) { + if (should_use_CUDA_codegen() || should_use_ISPC_codegen()) { + return; + } + + taco_set_num_threads(NUM_THREADS_TO_USE); + + ofstream statfile; + statfile.open( + "/home/min/a/kadhitha/workspace/my_taco/taco/test/stats/sddmm-spmm-gemm.txt", std::ios::app); + if (statfile.is_open()) { + statfile << "\nsddmm-spmm-gemm execution\n"; + statfile << "\n-----------------------------------------\n"; + } + + std::default_random_engine gen(0); + std::uniform_real_distribution unif(0.0, 1.0); + + Format csr({dense, sparse}); + Format rm({dense, dense}); + + int kdim = 64; + int ldim = 64; + int mdim = 64; + + // vector filenums{2, 3,4,5,6,7,8,9,10,12,15}; + vector filenums{0}; + + for (auto filenum : filenums) { + + + std::vector matfiles = { + "/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx", + "/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/amazon.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/synthetic/synthetic.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rma10/rma10.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/cant/cant.mtx", // 5 + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/consph/consph.mtx", + 
"/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/shipsec1/shipsec1.mtx", // 8 + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/scircuit/scircuit.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx", // 10 + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/wtk/pwtk.mtx", + "/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx", // 12 + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/wiki-Talk/wiki-Talk.mtx", // 13 + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/com-Orkut/com-Orkut.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx", // 15 + "/home/min/a/kadhitha/workspace/my_taco/FusedMM/dataset/harvard.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/twitter7/twitter7.mtx" + }; + std::vector matfilesrw = { + "/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/rw/cora.mtx", + "/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/rw/amazon.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/synthetic.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/cage3.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/bcsstk17.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/pdb1HYS.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/rma10.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/cant.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/consph.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/cop20k_A.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/shipsec1.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/scircuit.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/wtk/pwtk.mtx", + 
"/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/webbase-1M.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/wiki-Talk.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/com-Orkut.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/circuit5M.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/harvard.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/twitter7.mtx" + }; + + std::string matfile = matfiles[filenum]; + std::cout << "reading B mat mtx\n"; + Tensor B = read(matfile, csr, true); + B.setName("B"); + B.pack(); + // write(matfilesrw[filenum], B); + + if (statfile.is_open()) { + statfile << matfile << std::endl; + } + + std::cout << "B dim0: " << B.getDimension(0) << ", dim1: " << B.getDimension(1) << std::endl; + std::cout << "adding c mat\n"; + Tensor C({B.getDimension(0), kdim}, rm); + for (int i = 0; i < C.getDimension(0); ++i) { + for (int j = 0; j < C.getDimension(1); ++j) { + C.insert({i,j}, unif(gen)); + } + } + std::cout << "packing C mat\n"; + C.pack(); + + Tensor D({B.getDimension(1), kdim}, rm); + for (int i = 0; i < D.getDimension(0); ++i) { + for (int j = 0; j < D.getDimension(1); ++j) { + D.insert({i,j}, unif(gen)); + } + } + std::cout << "packing D mat\n"; + D.pack(); + + Tensor F({B.getDimension(1), ldim}, rm); + for (int i = 0; i < F.getDimension(0); ++i) { + for (int j = 0; j < F.getDimension(1); ++j) { + F.insert({i,j}, unif(gen)); + } + } + std::cout << "packing F mat\n"; + F.pack(); + + Tensor G({ldim, mdim}, rm); + for (int i = 0; i < G.getDimension(0); ++i) { + for (int j = 0; j < G.getDimension(1); ++j) { + G.insert({i,j}, unif(gen)); + } + } + std::cout << "packing F mat\n"; + G.pack(); + + Tensor A({B.getDimension(0), mdim}, rm); + Tensor ref({B.getDimension(0), mdim}, rm); + IndexVar i, j, k, l, m; + IndexVar i0("i0"), i1("i1"), jpos("jpos"), jpos0("jpos0"), jpos1("jpos1"), k0("k0"), k1("k1"); + IndexVar l0("l0"), l1("l1"), m0("m0"), m1("m1"); + + 
A(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); + + if (statfile.is_open()) { + statfile + << "ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m);" << std::endl + << "B1_dimension: " << B.getDimension(0) << ", B2_dimension: " << B.getDimension(1) << ", vals: " << B.getStorage().getValues().getSize() << std::endl + << "C1_dimension: " << C.getDimension(0) << ", C2_dimension: " << C.getDimension(1) << ", vals: " << C.getStorage().getValues().getSize() << std::endl + << "D1_dimension: " << D.getDimension(0) << ", D2_dimension: " << D.getDimension(1) << ", vals: " << D.getStorage().getValues().getSize() << std::endl + << "E1_dimension: " << F.getDimension(0) << ", E2_dimension: " << F.getDimension(1) << ", vals: " << F.getStorage().getValues().getSize() << std::endl + << "G1_dimension: " << G.getDimension(0) << ", G2_dimension: " << G.getDimension(1) << ", vals: " << G.getStorage().getValues().getSize() << std::endl + << std::endl; + } + + // IndexStmt stmt = A.getAssignment().concretize(); + IndexStmt stmt = makeReductionNotation(A.getAssignment()); + stmt = makeConcreteNotation(stmt); + stmt = reorderLoopsTopologically(stmt); + stmt = loopFusionOverFission(stmt, A.getAssignment(), "b", 2); + stmt = stmt.split(i, i0, i1, 16); + + stmt = insertTemporaries(stmt); + stmt = parallelizeOuterLoop(stmt); + printToFile("sddmmSpMMGeMM", stmt); + + A.compile(stmt); + A.assemble(); + + + ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); + IndexStmt refStmt = makeReductionNotation(ref.getAssignment()); + refStmt = makeConcreteNotation(refStmt); + refStmt = refStmt.split(i, i0, i1, 16); + refStmt = insertTemporaries(refStmt); + refStmt = parallelizeOuterLoop(refStmt); + ref.compile(refStmt); + ref.assemble(); + + Tensor ref1({B.getDimension(0), B.getDimension(1)}, csr); + Tensor ref2({B.getDimension(0), ldim}, rm); + Tensor ref3({B.getDimension(0), mdim}, rm); + ref1(i,j)=B(i,j)*C(i,k)*D(j,k); + ref2(i,l)=ref1(i,j)*F(j,l); + ref3(i,m)=ref2(i,l)*G(l,m); + + IndexStmt ref1Stmt =
ref1.getAssignment().concretize(); + + ref1Stmt = ref1Stmt.split(i, i0, i1, 16); + // // .pos(j, jpos, B(i,j)); + // // .split(k, k0, k1, 8); + // // .reorder({i0, i1, jpos0, k, jpos1}); + // // .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces) + // // .parallelize(jpos1, ParallelUnit::CPUVector, OutputRaceStrategy::ParallelReduction); + // // ref1Stmt.split(i, ); + // // stmt = scheduleSDDMMCPU_forfuse(ref1Stmt, B); + // IndexStmt ref1Stmt = makeReductionNotation(ref1.getAssignment()); + // ref1Stmt = makeConcreteNotation(ref1Stmt); + ref1Stmt = insertTemporaries(ref1Stmt); + ref1Stmt = parallelizeOuterLoop(ref1Stmt); + ref1.compile(ref1Stmt); + ref1.assemble(); + + IndexStmt ref2Stmt = makeReductionNotation(ref2.getAssignment()); + ref2Stmt = makeConcreteNotation(ref2Stmt); + ref2Stmt = insertTemporaries(ref2Stmt); + ref2Stmt = parallelizeOuterLoop(ref2Stmt); + ref2.compile(ref2Stmt); + ref2.assemble(); + + // ref3(i,m)=ref2(i,l)*G(l,m); + IndexStmt ref3Stmt = makeReductionNotation(ref3.getAssignment()); + ref3Stmt = makeConcreteNotation(ref3Stmt); + ref3Stmt = ref3Stmt + .split(i, i0, i1, 32) + .split(l, l0, l1, 32) + .split(m, m0, m1, 32) + .reorder({i0, l0, m0, i1, l1, m1}); + ref3Stmt = insertTemporaries(ref3Stmt); + ref3Stmt = parallelizeOuterLoop(ref3Stmt); + ref3.compile(ref3Stmt); + ref3.assemble(); + + std::cout << "compute start\n"; + taco::util::TimeResults timevalue; + bool time = true; + + // std::string sofile_fused = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/fused_kernel.so"; + TOOL_BENCHMARK_TIMER(A.compute(statfile), "\n\nFused Kernel: ", timevalue); + if (statfile.is_open()) { + statfile << "fused time: "; + statfile << timevalue.mean << std::endl; + } else { std::cout << " stat file is not open\n"; } + + // std::string sofile_sddmm = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/csr_dense_spmm.so"; + std::string sofile_sddmm = 
"/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/csr_dense_dense_sddmm.so"; + TOOL_BENCHMARK_TIMER(ref1.compute(statfile, sofile_sddmm), "\n\nSDDMM Kernel: ", timevalue); + if (statfile.is_open()) { + statfile << "sddmm time: "; + statfile << timevalue.mean << std::endl; + } else { std::cout << " stat file is not open\n"; } + + std::string sofile_sddmm_ryan = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/sddmm_ryan.so"; + TOOL_BENCHMARK_TIMER(ref1.compute(statfile, sofile_sddmm_ryan), "\n\nSDDMM ryan Kernel: ", timevalue); + if (statfile.is_open()) { + statfile << "sddmm ryan time: "; + statfile << timevalue.mean << std::endl; + } else { std::cout << " stat file is not open\n"; } + + std::string sofile_spmm = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/csr_dense_spmm.so"; + TOOL_BENCHMARK_TIMER(ref2.compute(statfile, sofile_spmm), "\n\nSpMM ryan Kernel: ", timevalue); + if (statfile.is_open()) { + statfile << "spmm ryan time: "; + statfile << timevalue.mean << std::endl; + } else { std::cout << " stat file is not open\n"; } + + // std::string sofile_spmm = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/csr_dense_spmm.so"; + TOOL_BENCHMARK_TIMER(ref3.compute(statfile), "\n\nGeMM Kernel: ", timevalue); + if (statfile.is_open()) { + statfile << "gemm time: "; + statfile << timevalue.mean << std::endl; + } else { std::cout << " stat file is not open\n"; } + + // std::string sofile_original = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/taco_original.so"; + TOOL_BENCHMARK_TIMER(ref.compute(statfile), "\n\nReference Kernel: ", timevalue); + if (statfile.is_open()) { + statfile << "taco reference time: "; + statfile << timevalue << std::endl; + } else { std::cout << " stat file is not open\n"; } + + double* A_vals = (double*) (A.getTacoTensorT()->vals); + double* ref_vals = (double*) (ref.getTacoTensorT()->vals); + double* ref3_vals = (double*) 
(ref3.getTacoTensorT()->vals); + + // int* A2_pos = (double*) (ref.getTacoTensorT()->vals); + + for (int q=0; q < A.getDimension(0)* A.getDimension(1); q++) { + if ( abs(A_vals[q] - ref_vals[q])/abs(ref_vals[q]) > ERROR_MARGIN) { + std::cout << "error: results don't match i: " << q << ", avals: " << A_vals[q] << " " + << "refvals: " << ref_vals[q] << std::endl; + ASSERT_TRUE(false); + } + } + + for (int q=0; q < A.getDimension(0)* A.getDimension(1); q++) { + if ( abs(ref3_vals[q] - ref_vals[q])/abs(ref_vals[q]) > ERROR_MARGIN) { + std::cout << "error: results don't match i: " << q << ", avals: " << ref3_vals[q] << " " + << "refvals: " << ref_vals[q] << std::endl; + ASSERT_TRUE(false); + } + } + + + + } + + // int filenum = 3; + + + // for (size_t q=0; q < A.getDimension(0)* A.getDimension(1); q++) { + // if ( abs(A_vals[q] - ref3_vals[q])/abs(ref3_vals[q]) > ERROR_MARGIN) { + // std::cout << "error: results don't match i: " << q << ", avals: " << A_vals[q] << " " + // << "refvals: " << ref3_vals[q] << std::endl; + // ASSERT_TRUE(false); + // } + // } + // for (int q= 0; q< A_vals + // for (int q = 0; q < A.getDimension(0); ++q) { + // for (int w = 0; w < A.getDimension(1); ++w) { + // if ( abs(A(q,w) - ref(q,w))/abs(ref(q,w)) > ERROR_MARGIN) { + // std::cout << "error: results don't match A("<< q << "," << w << "): " + // << A(q,w) << ", ref: " << ref(q,w) << std::endl; + // ASSERT_TRUE(false); + // } + // } + // } + // ASSERT_TENSOR_EQ(A, ref); + + if (statfile.is_open()) { + statfile.close(); + } + +} \ No newline at end of file diff --git a/test/tests-scheduling-ispc-eval.cpp b/test/tests-scheduling-ispc-eval.cpp new file mode 100644 index 000000000..139597f9c --- /dev/null +++ b/test/tests-scheduling-ispc-eval.cpp @@ -0,0 +1,2 @@ + + diff --git a/test/tests-transformation.cpp b/test/tests-transformation.cpp index abfec3d45..9a472906f 100644 --- a/test/tests-transformation.cpp +++ b/test/tests-transformation.cpp @@ -255,6 +255,8 @@ 
INSTANTIATE_TEST_CASE_P(parallelize, apply, struct reorderLoopsTopologically : public TestWithParam {}; + +// TEST_P(reorderLoopsTopologically, test) { IndexStmt actual = taco::reorderLoopsTopologically(GetParam().actual); ASSERT_NOTATION_EQ(GetParam().expected, actual); diff --git a/test/util.h b/test/util.h new file mode 100644 index 000000000..f96087ba1 --- /dev/null +++ b/test/util.h @@ -0,0 +1,113 @@ +#ifndef __SCHEDULE_UTIL_HH__ +#define __SCHEDULE_UTIL_HH__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "taco/cuda.h" +#include "test.h" +#include "test_tensors.h" +#include "taco/tensor.h" +#include "taco/index_notation/index_notation.h" +#include "taco/index_notation/transformations.h" +#include "codegen/codegen.h" +#include "taco/lower/lower.h" +#include "taco/util/timers.h" + +using namespace taco; + +#define ERROR_MARGIN (1.0e-2) + +#define TOOL_BENCHMARK_TIMER(CODE,NAME,TIMER) { \ + if (time) { \ + taco::util::Timer timer; \ + timer.start(); \ + CODE; \ + timer.stop(); \ + taco::util::TimeResults result = timer.getResult(); \ + cout << NAME << " " << result << " ms" << endl; \ + TIMER=result; \ + } \ + else { \ + CODE; \ + } \ +} + +#define TOOL_BENCHMARK_TIMER2(CODE,NAME,TIMER) { \ + if (time) { \ + taco::util::Timer timer; \ + timer.start(); \ + CODE; \ + timer.stop(); \ + taco::util::TimeResults result = timer.getResult(); \ + if (statfile.is_open()) { \ + statfile << NAME << " " << result << " ms" << endl; \ + } else { \ + cout << NAME << " " << result << " ms" << endl; \ + } \ + TIMER=result; \ + } \ + else { \ + CODE; \ + } \ +} + +static void printToCout(IndexStmt stmt); +static void printToFile(string filename, IndexStmt stmt); +static void printToFile(string filename, string additional_filename, IndexStmt stmt); + + +static void printToCout(IndexStmt stmt) { + std::shared_ptr codegen = ir::CodeGen::init_default(cout, ir::CodeGen::ImplementationGen); + ir::Stmt compute = lower(stmt, "compute", false, 
true); + codegen->compile(compute, true); +} + +void printToFile(string filename, IndexStmt stmt) { + stringstream source; + + string file_path = "eval_generated/"; + mkdir(file_path.c_str(), 0777); + + std::shared_ptr codegen = ir::CodeGen::init_default(source, ir::CodeGen::ImplementationGen); + ir::Stmt compute = lower(stmt, "compute", false, true); + codegen->compile(compute, true); + + ofstream source_file; + string file_ending = should_use_CUDA_codegen() ? ".cu" : ".c"; + source_file.open(file_path + filename + file_ending); + source_file << source.str(); + source_file.close(); +} + +void printToFile(string filename, string additional_filename, IndexStmt stmt) { + stringstream source1; + stringstream source2; + + string file_path = "eval_generated/"; + mkdir(file_path.c_str(), 0777); + + std::shared_ptr codegen = ir::CodeGen::init_default(source1, source2, ir::CodeGen::ImplementationGen); + ir::Stmt compute = lower(stmt, "compute", false, true); + codegen->compile(compute, true); + + ofstream source_file; + string file_ending = should_use_CUDA_codegen() ? 
".cu" : ".c"; + source_file.open(file_path+filename+file_ending); + source_file << source1.str(); + source_file.close(); + + ofstream additional_source_file; + string additional_file_ending = ".ispc"; + additional_source_file.open(file_path+additional_filename+additional_file_ending); + additional_source_file << source2.str(); + additional_source_file.close(); + +} + +#endif // __SCHEDULE_UTIL_HH__ \ No newline at end of file diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt index 922f7e52e..41699d3fd 100644 --- a/tools/CMakeLists.txt +++ b/tools/CMakeLists.txt @@ -4,6 +4,7 @@ foreach(TOOL_SOURCE ${TOOL_SOURCES}) get_filename_component(TOOL ${TOOL_SOURCE} NAME_WE) add_executable("${TOOL}-tool" ${TOOL_SOURCE}) target_link_libraries("${TOOL}-tool" taco) + target_link_libraries("${TOOL}-tool" papi) target_include_directories("${TOOL}-tool" PRIVATE "${CMAKE_BINARY_DIR}/include") SET_TARGET_PROPERTIES("${TOOL}-tool" PROPERTIES OUTPUT_NAME ${TOOL}) install(TARGETS "${TOOL}-tool" DESTINATION bin) diff --git a/tools/taco.cpp b/tools/taco.cpp index cd351a203..7384874ec 100644 --- a/tools/taco.cpp +++ b/tools/taco.cpp @@ -9,6 +9,7 @@ #include "taco.h" #include "taco/error.h" +#include "taco/index_notation/index_notation.h" #include "taco/parser/lexer.h" #include "taco/parser/parser.h" #include "taco/parser/schedule_parser.h" @@ -20,6 +21,7 @@ #include "taco/lower/lower.h" #include "taco/codegen/module.h" #include "codegen/codegen_c.h" +#include "codegen/codegen_ispc.h" #include "codegen/codegen_cuda.h" #include "codegen/codegen.h" #include "taco/util/strings.h" @@ -188,6 +190,8 @@ static void printUsageInfo() { cout << endl; printFlag("print-nocolor", "Print without colors."); cout << endl; + printFlag("ispc", "Generate ISPC code for Intel CPUs"); + cout << endl; printFlag("cuda", "Generate CUDA code for NVIDIA GPUs"); cout << endl; printFlag("schedule", "Specify parallel execution schedule"); @@ -262,7 +266,7 @@ static void printSchedulingHelp() { "an output race 
strategy `strat`. Since the other transformations " "expect serial code, parallelize must come last in a series of " "transformations. Possible parallel hardware units are: " - "NotParallel, GPUBlock, GPUWarp, GPUThread, CPUThread, CPUVector. " + "NotParallel, GPUBlock, GPUWarp, GPUThread, CPUThread, CPUVector, CPUSimd, CPUSpmd. " "Possible output race strategies are: " "IgnoreRaces, NoRaces, Atomics, Temporary, ParallelReduction."); } @@ -279,6 +283,8 @@ static void printVersionInfo() { cout << "Built with Python support." << endl; if(TACO_FEATURE_CUDA) cout << "Built with CUDA support." << endl; + if(TACO_FEATURE_ISPC) + cout << "Built with ISPC support." << endl; cout << endl; cout << "Built on: " << TACO_BUILD_DATE << endl; cout << "CMake build type: " << TACO_BUILD_TYPE << endl; @@ -308,7 +314,10 @@ static void printCommandLine(ostream& os, int argc, char* argv[]) { } } -static bool setSchedulingCommands(vector<vector<string>> scheduleCommands, parser::Parser& parser, IndexStmt& stmt) { +static int setSchedulingCommands(vector<vector<string>> scheduleCommands, + parser::Parser& parser, IndexStmt& stmt, Assignment assignment) { + + std::cout << "setting scheduling commands\n"; auto findVar = [&stmt](string name) { ProvenanceGraph graph(stmt); for (auto v : graph.getAllIndexVars()) { @@ -321,9 +330,15 @@ static bool setSchedulingCommands(vector<vector<string>> scheduleCommands, parse abort(); // to silence a warning: control reaches end of non-void function }; - bool isGPU = false; + int isGPU = 0; + int isISPC = 0; for(vector<string> scheduleCommand : scheduleCommands) { + std::cout << "running scheduling command: "; + for (auto &command : scheduleCommand) { + std::cout << command << " "; + } + std::cout << std::endl; string command = scheduleCommand[0]; scheduleCommand.erase(scheduleCommand.begin()); @@ -352,6 +367,16 @@ static bool setSchedulingCommands(vector<vector<string>> scheduleCommands, parse IndexVar fused(f); stmt = stmt.fuse(findVar(i), findVar(j), fused); + } else if (command == "loopfuse") { + 
taco_uassert(scheduleCommand.size() == 2) + << "'loopfuse' scheduling directive takes 2 parameters: loopfuse(b, 2)"; + std::string side = scheduleCommand[0]; + taco_uassert(side == "b" || side == "f") + << "first parameter must be either 'f' or 'b'"; + + int iters = std::stoi(scheduleCommand[1]); + + stmt = loopFusionOverFission(stmt, assignment, side, iters); } else if (command == "split") { taco_uassert(scheduleCommand.size() == 4) << "'split' scheduling directive takes 4 parameters: split(i, i1, i2, splitFactor)"; @@ -536,7 +561,15 @@ static bool setSchedulingCommands(vector<vector<string>> scheduleCommands, parse parallel_unit = ParallelUnit::CPUThread; } else if (unit == "CPUVector") { parallel_unit = ParallelUnit::CPUVector; - } else { + } else if (unit == "CPUSimd") { + isISPC = true; + parallel_unit = ParallelUnit::CPUSimd; + } + else if (unit == "CPUSpmd") { + parallel_unit = ParallelUnit::CPUSpmd; + isISPC = true; + } + else { taco_uerror << "Parallel hardware not defined."; goto end; } @@ -557,6 +590,8 @@ static bool setSchedulingCommands(vector<vector<string>> scheduleCommands, parse goto end; } + std::cout << "stmt before parallelizing the statement: " << stmt << endl; + std::cout << "ParallelUnit: " << ParallelUnit_NAMES[(int) parallel_unit] << ", outputRaceStrategy: " << OutputRaceStrategy_NAMES[(int) output_race_strategy] << std::endl; stmt = stmt.parallelize(findVar(i), parallel_unit, output_race_strategy); } else if (command == "assemble") { @@ -612,7 +647,13 @@ static bool setSchedulingCommands(vector<vector<string>> scheduleCommands, parse end:; } - return isGPU; + if (isGPU) { + return 1; + } + else if (isISPC) { + return 2; + } + return 0; } int main(int argc, char* argv[]) { @@ -641,6 +682,7 @@ int main(int argc, char* argv[]) { bool color = true; bool readKernels = false; bool cuda = false; + bool ispc = false; bool setSchedule = false; @@ -949,6 +991,10 @@ int main(int argc, char* argv[]) { else if ("-cuda" == argName) { cuda = true; } + else if ("-ispc" == argName) { + std::cout << "ispc 
true\n"; + ispc = true; + } else if ("-schedule" == argName) { vector<string> descriptor = util::split(argValue, ","); if (descriptor.size() > 2 || descriptor.empty()) { @@ -1001,6 +1047,8 @@ int main(int argc, char* argv[]) { } } + std::cout << "cuda: " << cuda << ", ispc: " << ispc << std::endl; + // Print compute is the default if nothing else was asked for if (!printAssemble && !printEvaluate && !printIterationGraph && !writeCompute && !writeAssemble && !writeKernels && !readKernels && @@ -1009,9 +1057,11 @@ int main(int argc, char* argv[]) { } // pre-parse expression, to determine existence and order of loaded tensors + std::cout << "pre-parse expression, to determine existence and order of loaded tensors\n"; map<string,TensorBase> loadedTensors; TensorBase temp_tensor; parser::Parser temp_parser(exprStr, formats, dataTypes, tensorsDimensions, loadedTensors, 42); + std::cout << exprStr << std::endl; try { temp_parser.parse(); temp_tensor = temp_parser.getResultTensor(); @@ -1112,33 +1162,61 @@ int main(int argc, char* argv[]) { taco_set_parallel_schedule(sched, chunkSize); taco_set_num_threads(nthreads); - IndexStmt stmt = - makeConcreteNotation(makeReductionNotation(tensor.getAssignment())); + Assignment assignment = tensor.getAssignment(); + std::cout << "tensor.getAssignment(): " << assignment << std::endl; + + IndexStmt stmt2 = makeReductionNotation(tensor.getAssignment()); + std::cout << "reducedNotation: " << stmt2 << std::endl; + // IndexStmt stmt = + // makeConcreteNotation(makeReductionNotation(tensor.getAssignment())); + IndexStmt stmt = makeConcreteNotation(stmt2); + std::cout << "concrete index statement: " << stmt << std::endl; stmt = reorderLoopsTopologically(stmt); + std::cout << "topologically reordered loops statement: " << stmt << std::endl; + if (setSchedule) { - cuda |= setSchedulingCommands(scheduleCommands, parser, stmt); + int val = setSchedulingCommands(scheduleCommands, parser, stmt, tensor.getAssignment()); + // stmt = loopFusionOverFission(stmt, 
tensor.getAssignment()); + cuda |= (val==1); + ispc |= (val==2); } else { + // stmt = loopFusionOverFission(stmt, tensor.getAssignment()); stmt = insertTemporaries(stmt); stmt = parallelizeOuterLoop(stmt); } + std::cout << "after setting the scheduling commands\n"; + std::cout << stmt << std::endl; if (cuda) { if (!CUDA_BUILT && benchmark) { return reportError("TACO must be built for CUDA (cmake -DCUDA=ON ..) to benchmark", 2); } set_CUDA_codegen_enabled(true); + set_ISPC_codegen_enabled(false); + } + else if (ispc) { + if (!ISPC_BUILT && benchmark) { + return reportError("TACO must be built for ISPC (cmake -DISPC=ON ..) to benchmark", 2); + } + set_CUDA_codegen_enabled(false); + set_ISPC_codegen_enabled(true); } else { set_CUDA_codegen_enabled(false); + set_ISPC_codegen_enabled(false); } + std::cout << "running scalar promote\n" << std::endl; // stmt = scalarPromote(stmt); + std::cout << "\nafter scalar promote: \n" << stmt << std::endl << std::endl; + if (printConcrete) { cout << stmt << endl; } + // lower index statement to ir statement Kernel kernel; if (benchmark) { if (time) cout << endl; @@ -1221,9 +1299,15 @@ int main(int argc, char* argv[]) { } } else { + std::cout << "lowering stmt: " << stmt << std::endl; compute = lower(stmt, prefix+"compute", computeWithAssemble, true); assemble = lower(stmt, prefix+"assemble", true, false); evaluate = lower(stmt, prefix+"evaluate", true, true); + + std::cout << "\n\ncompute kernel\n------------\n" << compute << std::endl << std::endl; + // compute kernel is the most basic kernel after lowering phase + + std::cout << "\n\nevaluate kernel\n------------\n" << evaluate << std::endl << std::endl; } string packComment = @@ -1278,6 +1362,7 @@ int main(int argc, char* argv[]) { } bool hasPrinted = false; + std::shared_ptr<ir::CodeGen> codegen = ir::CodeGen::init_default(cout, ir::CodeGen::ImplementationGen); codegen->setColor(color); if (printAssemble) { @@ -1298,6 +1383,7 @@ int main(int argc, char* argv[]) { } if (compute.defined()) { + 
std::cout << "Code generation\n"; codegen->compile(compute, false); } else { @@ -1355,7 +1441,7 @@ int main(int argc, char* argv[]) { } IterationGraph iterationGraph; - if (printIterationGraph) { + if (printIterationGraph) { // print iteration graph iterationGraph = IterationGraph::make(tensor.getAssignment()); }