diff --git a/.gitignore b/.gitignore index 16389f34e..215b56e9a 100644 --- a/.gitignore +++ b/.gitignore @@ -12,3 +12,7 @@ CMakeCache.txt doc apps/tensor_times_vector/tensor_times_vector + +.cache +.vscode +compile_commands.json diff --git a/CMakeLists.txt b/CMakeLists.txt index a6a80d9d1..4f8b54eee 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -11,10 +11,10 @@ project(taco ) option(CUDA "Build for NVIDIA GPU (CUDA must be preinstalled)" OFF) option(PYTHON "Build TACO for python environment" OFF) -option(OPENMP "Build with OpenMP execution support" OFF) +option(OPENMP "Build with OpenMP execution support" ON) option(COVERAGE "Build with code coverage analysis" OFF) set(TACO_FEATURE_CUDA 0) -set(TACO_FEATURE_OPENMP 0) +set(TACO_FEATURE_OPENMP 1) set(TACO_FEATURE_PYTHON 0) if(CUDA) message("-- Searching for CUDA Installation") diff --git a/include/taco/codegen/module.h b/include/taco/codegen/module.h index 36eb34f1a..44431ef46 100644 --- a/include/taco/codegen/module.h +++ b/include/taco/codegen/module.h @@ -17,7 +17,7 @@ class Module { public: /// Create a module for some target Module(Target target=getTargetFromEnvironment()) - : lib_handle(nullptr), moduleFromUserSource(false), target(target) { + : lib_handle(nullptr), so_lib_handle(nullptr), moduleFromUserSource(false), target(target) { setJITLibname(); setJITTmpdir(); } @@ -44,11 +44,16 @@ class Module { /// before calling. If there's no function of this name then a nullptr is /// returned. 
void* getFuncPtr(std::string name); + void* getFuncPtr(std::string& sofile, std::string name); /// Call a raw function in this module and return the result + int callFuncPackedRaw(std::string name, std::string& sofile, void** args); int callFuncPackedRaw(std::string name, void** args); /// Call a raw function in this module and return the result + int callFuncPackedRaw(std::string name, std::string& sofile, std::vector args) { + return callFuncPackedRaw(name, sofile, args.data()); + } int callFuncPackedRaw(std::string name, std::vector args) { return callFuncPackedRaw(name, args.data()); } @@ -57,6 +62,10 @@ class Module { int callFuncPacked(std::string name, void** args) { return callFuncPackedRaw("_shim_"+name, args); } + + int callFuncPacked(std::string name, std::string& sofile, void** args) { + return callFuncPackedRaw("_shim_"+name, sofile,args); + } /// Call a function using the taco_tensor_t interface and return the result int callFuncPacked(std::string name, std::vector args) { @@ -72,6 +81,7 @@ class Module { std::string libname; std::string tmpdir; void* lib_handle; + void* so_lib_handle; std::vector funcs; // true iff the module was created from user-provided source diff --git a/include/taco/index_notation/transformations.h b/include/taco/index_notation/transformations.h index 7aa2579ad..4d6ec6830 100644 --- a/include/taco/index_notation/transformations.h +++ b/include/taco/index_notation/transformations.h @@ -223,6 +223,9 @@ IndexStmt parallelizeOuterLoop(IndexStmt stmt); */ IndexStmt reorderLoopsTopologically(IndexStmt stmt); +IndexStmt loopFusionOverFission(IndexStmt stmt, Assignment assignment, + std::string side, int iters); + /** * Performs scalar promotion so that reductions are done by accumulating into * scalar temporaries whenever possible. 
diff --git a/include/taco/tensor.h b/include/taco/tensor.h index b91782256..883718fb6 100644 --- a/include/taco/tensor.h +++ b/include/taco/tensor.h @@ -413,6 +413,8 @@ class TensorBase { /// Compile the tensor expression. void compile(); + void compute(std::ofstream& statfile); + void compute(std::ofstream& statfile, std::string& sofile); void compile(IndexStmt stmt, bool assembleWhileCompute=false); diff --git a/out/taco-uml/._taco.svg b/out/taco-uml/._taco.svg new file mode 100755 index 000000000..e88dbd51b Binary files /dev/null and b/out/taco-uml/._taco.svg differ diff --git a/out/taco-uml/taco.svg b/out/taco-uml/taco.svg new file mode 100644 index 000000000..57f7a18d1 --- /dev/null +++ b/out/taco-uml/taco.svg @@ -0,0 +1,878 @@ +IntrusivePtrT *ptrUncopyableIRNodevirtual void accept(IRVisitorStrict *v) const = 0virtual IRNodeType type_info() const = 0;BaseStmtNodeBaseExprNodeDatatype typeStmtNodevoid accept(IRVisitorStrict *v) constExprNodevoid accept(IRVisitorStrict *v) constIRHandlevoid accept(IRVisitorStrict *v) constExprStmtIRVisitorStrictvirtual void visit(const IRNode*) const = 0IRVisitorvirtual void visit(const IRNode*)IRRewriterExpr exprStmt stmtvirtual void visit(const ExprNode* op)virtual void visit(const StmtNode* op)Expr rewrite(Expr)Stmt rewrite(Stmt)IRPrinterstd::ostream &streamstd::ostream &stream2int indentbool colorbool simplifyenum PrecedencePrecedence parentPrecedence = BOTTOMNameGenerator varNameGeneratorscopedMap<Expr, std::String> varNamesvoid doIndent()void printBinOp(Expr a, Expr b, std::string op, Precedence precedence)void fewMoreMethods()virtual void visit(const ExprNode*)virtual void visit(const StmtNode*)setColor(bool color)print(Stmt)IRVerifierExpressionSimplifierRemoveRedundantStatementsRemoveRedundantLoopsRemoveDuplicateBodyCodeGenCodeGen_CCodeGen_CUDACodeGen_ISPCManageableIndexStmtNodevirtual void accept(IndexStmtVisitorStrict*) const = 0IndexExprNodevirtual void accept(IndexStmtVisitorStrict*) const = 
0IndexStmtIndexExprIndexExprVisitorStrictvoid visit(const IndexStmt&)virtual void visit(const AccessNode*) = 0virtual void visit(const LiteralNode*) = 0virtual void visit(const NegNode*) = 0virtual void visit(const AddNode*) = 0virtual void visit(const SubNode*) = 0virtual void visit(const MulNode*) = 0virtual void visit(const DivNode*) = 0virtual void visit(const SqrtNode*) = 0virtual void visit(const CastNode*) = 0virtual void visit(const CallIntrinsicNode*) = 0virtual void visit(const ReductionNode*) = 0IndexStmtVisitorStrictvoid visit(const IndexStmt&)virtual void visit(const AssignmentNode*) = 0virtual void visit(const YieldNode*) = 0virtual void visit(const ForallNode*) = 0virtual void visit(const WhereNode*) = 0virtual void visit(const SequenceNode*) = 0virtual void visit(const AssembleNode*) = 0virtual void visit(const MultiNode*) = 0virtual void visit(const SuchThatNode*) = 0IndexNotationVisitorStrictIndexNotationPrintervoid print(const IndexExpr& expr)void print(const IndexStmt& expr)void visit(const AccessNode* node)void visit(const LiteralNode* node)void visit(const NegNode* node)void visit(const AddNode* node)void visit(const SubNode* node)void visit(const MulNode* node)void visit(const DivNode* node)void visit(const SqrtNode* node)void visit(const CastNode* node)void visit(const CallIntrinsicNode* node)void visit(const UnaryExprNode* node)void visit(const BinaryExprNode* node)void visit(const ReductionNode* node)void visit(const AssignmentNode* node)void visit(const YieldNode* node)void visit(const ForallNode* node)void visit(const WhereNode* node)void visit(const SequenceNode* node)void visit(const AssembleNode* node)void visit(const MultiNode* node)void visit(const SuchThatNode* node)IndexNotationVisitorvirtual void visit(const AccessNode* node)virtual void visit(const LiteralNode* node)virtual void visit(const NegNode* node)virtual void visit(const AddNode* node)virtual void visit(const SubNode* node)virtual void visit(const MulNode* node)virtual 
void visit(const DivNode* node)virtual void visit(const SqrtNode* node)virtual void visit(const CastNode* node)virtual void visit(const CallIntrinsicNode* node)virtual void visit(const UnaryExprNode* node)virtual void visit(const BinaryExprNode* node)virtual void visit(const ReductionNode* node)virtual void visit(const AssignmentNode* node)virtual void visit(const YieldNode* node)virtual void visit(const ForallNode* node)virtual void visit(const WhereNode* node)virtual void visit(const SequenceNode* node)virtual void visit(const AssembleNode* node)virtual void visit(const MultiNode* node)virtual void visit(const SuchThatNode* node)MatcherIndexExprRewriterStrictIndexExpr exprIndexExpr rewrite(IndexExpr)virtual void visit(const AccessNode* op) = 0virtual void visit(const LiteralNode* op) = 0virtual void visit(const NegNode* op) = 0virtual void visit(const SqrtNode* op) = 0virtual void visit(const AddNode* op) = 0virtual void visit(const SubNode* op) = 0virtual void visit(const MulNode* op) = 0virtual void visit(const DivNode* op) = 0virtual void visit(const CastNode* op) = 0virtual void visit(const CallIntrinsicNode* op) = 0virtual void visit(const ReductionNode* op) = 0IndexStmtRewriterStrictIndexStmt stmtIndexStmt rewrite(IndexStmt)virtual void visit(const AssignmentNode* op) = 0virtual void visit(const YieldNode* op) = 0virtual void visit(const ForallNode* op) = 0virtual void visit(const WhereNode* op) = 0virtual void visit(const SequenceNode* op) = 0virtual void visit(const AssembleNode* op) = 0virtual void visit(const MultiNode* op) = 0virtual void visit(const SuchThatNode* op) = 0IndexNotationRewriterStrictIndexNotationRewritervirtual void visit(const AccessNode* node)virtual void visit(const LiteralNode* node)virtual void visit(const NegNode* node)virtual void visit(const AddNode* node)virtual void visit(const SubNode* node)virtual void visit(const MulNode* node)virtual void visit(const DivNode* node)virtual void visit(const SqrtNode* node)virtual void 
visit(const CastNode* node)virtual void visit(const CallIntrinsicNode* node)virtual void visit(const UnaryExprNode* node)virtual void visit(const BinaryExprNode* node)virtual void visit(const ReductionNode* node)virtual void visit(const AssignmentNode* node)virtual void visit(const YieldNode* node)virtual void visit(const ForallNode* node)virtual void visit(const WhereNode* node)virtual void visit(const SequenceNode* node)virtual void visit(const AssembleNode* node)virtual void visit(const MultiNode* node)virtual void visit(const SuchThatNode* node)Lowererstd::shared_ptr<LowererImpl> impl;LowererImplclass Visitor;friend class Visitor;std::shared_ptr<Visitor> visitor;virtual ir::Stmt lower(IndexStmt stmt);virtual ir::Expr lower(IndexExpr expr);virtual ir::Expr lowerExpr(IndexExpr expr) = 0;virtual ir::Stmt lowerStmt(IndexStmt stmt) = 0;virtual ir::Stmt lower(IndexStmt stmt, std::string name,bool assemble, bool compute, bool pack, bool unpack) = 0;LowererImplImperativeclass Visitorfiend class Visitorstd::shared_ptr<Visitor> visitorbool assemblebool computevars a_bunch_of_other_fieldsvirtual ir::Stmt lowerExpr(IndexExpr expr);virtual ir::Stmt lowerStmt(IndexStmt stmt);ir::Stmt lower(IndexStmt stmt, std::string name,bool assemble, bool compute, bool pack, bool unpack)Stmt LowererImplImperative::lower(IndexStmt stmt) {return visitor->lower(stmt);}VisitorLowererImpl* implExpr exprStmt stmtvoid visit(const AssignmentNode* node)void visit(const YieldNode* node)void visit(const ForallNode* node)void visit(const WhereNode* node)void visit(const MultiNode* node)void visit(const SuchThatNode* node)void visit(const SequenceNode* node)void visit(const AssembleNode* node)void visit(const AccessNode* node)void visit(const LiteralNode* node)void visit(const NegNode* node)void visit(const AddNode* node)void visit(const SubNode* node)void visit(const MulNode* node)void visit(const DivNode* node)void visit(const SqrtNode* node)void visit(const CastNode* node)void visit(const 
CallIntrinsicNode* node)void visit(const ReductionNode* node)Visitor(LowererImplImperative* impl)Stmt lower(IndexStmt stmt)Expr lower(IndexExpr expr)Stmt lower(IndexStmt stmt) {this->stmt = Stmt();impl->accessibleIterators.scope();IndexStmtVisitorStrict::visit(stmt);impl->accessibleIterators.unscope();return this->stmt;}contains111111contains11contains11contains11contains11contains11 \ No newline at end of file diff --git a/src/codegen/codegen.cpp b/src/codegen/codegen.cpp index f0c09d98a..64c8b3f02 100644 --- a/src/codegen/codegen.cpp +++ b/src/codegen/codegen.cpp @@ -229,6 +229,49 @@ string CodeGen::printTensorProperty(string varname, const GetProperty* op, bool return ret.str(); } +string CodeGen::getUnpackedTensorArgument(string varname, const GetProperty* op, + bool is_output_prop) { + stringstream ret; + ret << ""; + + auto tensor = op->tensor.as(); + if (op->property == TensorProperty::Values) { + // for the values, it's in the last slot + ret << "uniform " << printType(tensor->type, false) << " " << varname << "[]"; + return ret.str(); + } else if (op->property == TensorProperty::ValuesSize) { + ret << "int32 " << varname; + return ret.str(); + } + + // for a Dense level, nnz is an int + // for a Fixed level, ptr is an int + // all others are int* + if (op->property == TensorProperty::Dimension) { + if (op->type == Int32) { + ret << "uniform int32 "; + } else if (op->type == Int64) { + ret << "uniform int64 "; + } else { + ret << "int "; + } + ret << varname; + + } else { + taco_iassert(op->property == TensorProperty::Indices); + if (op->type == Int32) { + ret << "uniform int32 "; + } else if (op->type == Int64) { + ret << "uniform int64 "; + } else { + ret << "uniform int "; + } + ret << varname << "[]"; + } + + return ret.str(); +} + string CodeGen::unpackTensorProperty(string varname, const GetProperty* op, bool is_output_prop) { stringstream ret; @@ -310,13 +353,9 @@ string CodeGen::pointTensorProperty(std::string varname) { return ret.str(); } -// 
helper to print declarations -string CodeGen::printDecls(map varMap, - vector inputs, vector outputs) { - stringstream ret; - unordered_set propsAlreadyGenerated; - - vector sortedProps; +void CodeGen::getSortedProps(map &varMap, + vector &sortedProps, vector &inputs, + vector &outputs) { for (auto const& p: varMap) { if (p.first.as()) @@ -355,6 +394,17 @@ string CodeGen::printDecls(map varMap, return a->index < b->index; }); +} + +// helper to print declarations +string CodeGen::printDecls(map varMap, + vector inputs, vector outputs) { + stringstream ret; + unordered_set propsAlreadyGenerated; + + vector sortedProps; + getSortedProps(varMap, sortedProps, inputs, outputs); + for (auto prop: sortedProps) { bool isOutputProp = (find(outputs.begin(), outputs.end(), prop->tensor) != outputs.end()); @@ -375,7 +425,6 @@ string CodeGen::printDecls(map varMap, return ret.str(); } - string CodeGen::printPack(map, string> outputProperties, vector outputs) { stringstream ret; diff --git a/src/codegen/codegen.h b/src/codegen/codegen.h index cc25c80d6..48540904e 100644 --- a/src/codegen/codegen.h +++ b/src/codegen/codegen.h @@ -16,7 +16,8 @@ class CodeGen : public IRPrinter { enum CodeGenType { C, CUDA }; CodeGen(std::ostream& stream, CodeGenType type) : IRPrinter(stream), codeGenType(type) {}; - CodeGen(std::ostream& stream, bool color, bool simplify, CodeGenType type) : IRPrinter(stream, color, simplify), codeGenType(type) {}; + CodeGen(std::ostream& stream, bool color, bool simplify, CodeGenType type) + : IRPrinter(stream, color, simplify), codeGenType(type) {}; /// Initialize the default code generator static std::shared_ptr init_default(std::ostream &dest, OutputKind outputKind); @@ -26,6 +27,9 @@ class CodeGen : public IRPrinter { protected: static bool checkForAlloc(const Function *func); static int countYields(const Function *func); + void getSortedProps(std::map &varMap, + std::vector &sortedProps, std::vector &inputs, + std::vector &outputs); static std::string 
printCType(Datatype type, bool is_ptr); static std::string printCUDAType(Datatype type, bool is_ptr); @@ -52,6 +56,10 @@ class CodeGen : public IRPrinter { std::string printFuncName(const Function *func, std::map inputMap={}, std::map outputMap={}); + + std::string printTensorProperty(std::string varname, const GetProperty* op, bool is_ptr); + std::string getUnpackedTensorArgument(std::string varname, const GetProperty* op, + bool is_output_prop); void resetUniqueNameCounters(); std::string genUniqueName(std::string name); @@ -61,9 +69,8 @@ class CodeGen : public IRPrinter { private: virtual std::string restrictKeyword() const { return ""; } - std::string printTensorProperty(std::string varname, const GetProperty* op, bool is_ptr); std::string unpackTensorProperty(std::string varname, const GetProperty* op, - bool is_output_prop); + bool is_output_prop); std::string packTensorProperty(std::string varname, Expr tnsr, TensorProperty property, int mode, int index); std::string pointTensorProperty(std::string varname); diff --git a/src/codegen/codegen_c.cpp b/src/codegen/codegen_c.cpp index 2ade9d7f6..d55adbe58 100644 --- a/src/codegen/codegen_c.cpp +++ b/src/codegen/codegen_c.cpp @@ -34,6 +34,7 @@ const string cHeaders = "#include \n" "#include \n" "#include \n" + "#include \n" "#if _OPENMP\n" "#include \n" "#endif\n" @@ -308,6 +309,7 @@ void CodeGen_C::visit(const Function* func) { // output body print(func->body); + // output repack only if we allocated memory if (checkForAlloc(func)) out << endl << printPack(varFinder.outputProperties, func->outputs); @@ -403,6 +405,7 @@ static string getAtomicPragma() { // Docs for vectorization pragmas: // http://clang.llvm.org/docs/LanguageExtensions.html#extensions-for-loop-hint-optimizations void CodeGen_C::visit(const For* op) { + switch (op->kind) { case LoopKind::Vectorized: doIndent(); diff --git a/src/codegen/codegen_c.h b/src/codegen/codegen_c.h index 55c9d01a8..37bda6046 100644 --- a/src/codegen/codegen_c.h +++ 
b/src/codegen/codegen_c.h @@ -28,23 +28,24 @@ class CodeGen_C : public CodeGen { protected: using IRPrinter::visit; - void visit(const Function*); - void visit(const VarDecl*); - void visit(const Yield*); - void visit(const Var*); - void visit(const For*); - void visit(const While*); - void visit(const GetProperty*); - void visit(const Min*); - void visit(const Max*); - void visit(const Allocate*); - void visit(const Sqrt*); - void visit(const Store*); - void visit(const Assign*); + virtual void visit(const Function*); + virtual void visit(const VarDecl*); + virtual void visit(const Yield*); + virtual void visit(const Var*); + virtual void visit(const For*); + virtual void visit(const While*); + virtual void visit(const GetProperty*); + virtual void visit(const Min*); + virtual void visit(const Max*); + virtual void visit(const Allocate*); + virtual void visit(const Sqrt*); + virtual void visit(const Store*); + virtual void visit(const Assign*); std::map varMap; std::vector localVars; std::ostream &out; + int count = 0; OutputKind outputKind; diff --git a/src/codegen/module.cpp b/src/codegen/module.cpp index bd0f487b1..c0192f243 100644 --- a/src/codegen/module.cpp +++ b/src/codegen/module.cpp @@ -42,6 +42,7 @@ void Module::addFunction(Stmt func) { void Module::compileToSource(string path, string prefix) { if (!moduleFromUserSource) { + std::cout << "module not from user source\n"; // create a codegen instance and add all the funcs bool didGenRuntime = false; @@ -109,6 +110,7 @@ void writeShims(vector funcs, string path, string prefix) { } // anonymous namespace string Module::compile() { + std::cout << "Module::compile\n"; string prefix = tmpdir+libname; string fullpath = prefix + ".so"; @@ -137,12 +139,24 @@ string Module::compile() { string cmd = cc + " " + cflags + " " + prefix + file_ending + " " + shims_file + " " + "-o " + fullpath + " -lm"; + std::cout << "--------------------------------------------------------------------------------tmpdir: " << tmpdir << 
std::endl; + std::cout << "--------------------------------------------------------------------------------libname: " << libname << std::endl; + std::cout << "--------------------------------------------------------------------------------prefix: " << prefix << std::endl; + std::cout << "--------------------------------------------------------------------------------fullpath: " << fullpath << std::endl; + std::cout << "--------------------------------------------------------------------------------cmd: " << cmd << std::endl; // open the output file & write out the source compileToSource(tmpdir, libname); + // write out the shims writeShims(funcs, tmpdir, libname); + for (auto &statement : funcs) { + std::cout << "----- statement --------" << std::endl; + // std::cout << statement; + std::cout << std::endl; + } + std::cout << tmpdir << std::endl << libname << std::endl; // now compile it int err = system(cmd.data()); @@ -168,10 +182,61 @@ string Module::getSource() { return source.str(); } +void* Module::getFuncPtr(std::string& sofile, std::string name) { + std::cout << "opening shared object 1\n"; + if (so_lib_handle) { + dlclose(so_lib_handle); + } + std::cout << "opening shared object 2\n"; + so_lib_handle = dlopen(sofile.data(), RTLD_NOW | RTLD_LOCAL); + std::cout << "opening shared object : " << sofile << std::endl; + return dlsym(so_lib_handle, name.data()); +} + void* Module::getFuncPtr(std::string name) { return dlsym(lib_handle, name.data()); } +int Module::callFuncPackedRaw(std::string name, std::string& sofile, void** args) { + typedef int (*fnptr_t)(void**); + static_assert(sizeof(void*) == sizeof(fnptr_t), + "Unable to cast dlsym() returned void pointer to function pointer"); + void* v_func_ptr = getFuncPtr(sofile, name); + fnptr_t func_ptr; + *reinterpret_cast(&func_ptr) = v_func_ptr; + +#if USE_OPENMP + omp_sched_t existingSched; + ParallelSchedule tacoSched; + int existingChunkSize, tacoChunkSize; + int existingNumThreads = omp_get_max_threads(); + 
omp_get_schedule(&existingSched, &existingChunkSize); + taco_get_parallel_schedule(&tacoSched, &tacoChunkSize); + switch (tacoSched) { + case ParallelSchedule::Static: + omp_set_schedule(omp_sched_static, tacoChunkSize); + break; + case ParallelSchedule::Dynamic: + omp_set_schedule(omp_sched_dynamic, tacoChunkSize); + break; + default: + break; + } + omp_set_num_threads(taco_get_num_threads()); +#endif + + std::cout << "calling the function\n"; + int ret = func_ptr(args); + std::cout << "function call completed\n"; + +#if USE_OPENMP + omp_set_schedule(existingSched, existingChunkSize); + omp_set_num_threads(existingNumThreads); +#endif + + return ret; +} + int Module::callFuncPackedRaw(std::string name, void** args) { typedef int (*fnptr_t)(void**); static_assert(sizeof(void*) == sizeof(fnptr_t), @@ -200,7 +265,9 @@ int Module::callFuncPackedRaw(std::string name, void** args) { omp_set_num_threads(taco_get_num_threads()); #endif + std::cout << "calling the function\n"; int ret = func_ptr(args); + std::cout << "function call completed\n"; #if USE_OPENMP omp_set_schedule(existingSched, existingChunkSize); diff --git a/src/index_notation/index_notation.cpp b/src/index_notation/index_notation.cpp index 51fb8770c..2e26460c7 100644 --- a/src/index_notation/index_notation.cpp +++ b/src/index_notation/index_notation.cpp @@ -2438,6 +2438,7 @@ bool isConcreteNotation(IndexStmt stmt, std::string* reason) { return isConcrete; } +// make reduction notation Assignment makeReductionNotation(Assignment assignment) { IndexExpr expr = assignment.getRhs(); std::vector free = assignment.getLhs().getIndexVars(); @@ -2513,7 +2514,10 @@ IndexStmt makeReductionNotation(IndexStmt stmt) { return makeReductionNotation(to(stmt)); } +// make concrete notation IndexStmt makeConcreteNotation(IndexStmt stmt) { + // std::cout << "concrete notation original assignment: " << stmt << std::endl; + std::string reason; taco_iassert(isReductionNotation(stmt, &reason)) << "Not reduction notation: " << 
stmt << std::endl << reason; @@ -2521,6 +2525,7 @@ IndexStmt makeConcreteNotation(IndexStmt stmt) { // Free variables and reductions covering the whole rhs become top level loops vector freeVars = to(stmt).getFreeVars(); + std::cout << "free vars: " << freeVars << std::endl; struct RemoveTopLevelReductions : IndexNotationRewriter { using IndexNotationRewriter::visit; @@ -2535,12 +2540,17 @@ IndexStmt makeConcreteNotation(IndexStmt stmt) { topLevelReductions.push_back(reduction.getVar()); rhs = reduction.getExpr(); } + // std::cout << "top level reductions: " << topLevelReductions << std::endl; if (rhs != node->rhs) { - stmt = Assignment(node->lhs, rhs, Add()); + stmt = Assignment(node->lhs, rhs, Add()); // write with add + int idx = 0; for (auto& i : util::reverse(topLevelReductions)) { + std::cout << idx << ": " << stmt << std::endl; + idx++; stmt = forall(i, stmt); } + std::cout << idx << ": " << stmt << std::endl; } else { stmt = node; @@ -2548,11 +2558,18 @@ IndexStmt makeConcreteNotation(IndexStmt stmt) { } }; stmt = RemoveTopLevelReductions().rewrite(stmt); + // std::cout << "after remove top level reductions: " << stmt << std::endl; + // now we form the stmt in reverse order of freeVars + int idx = 0; for (auto& i : util::reverse(freeVars)) { + std::cout << idx << ": " << stmt << std::endl; stmt = forall(i, stmt); + idx++; } + std::cout << idx << ": " << stmt << std::endl; + std::cout << "replacing reductions with whereas statements\n"; // Replace other reductions with where and forall statements struct ReplaceReductionsWithWheres : IndexNotationRewriter { using IndexNotationRewriter::visit; diff --git a/src/index_notation/index_notation_printer.cpp b/src/index_notation/index_notation_printer.cpp index 0b41615ad..d7ee998ae 100644 --- a/src/index_notation/index_notation_printer.cpp +++ b/src/index_notation/index_notation_printer.cpp @@ -224,9 +224,9 @@ void IndexNotationPrinter::visit(const YieldNode* op) { void IndexNotationPrinter::visit(const ForallNode* 
op) { os << "forall(" << op->indexVar << ", "; op->stmt.accept(this); - if (op->parallel_unit != ParallelUnit::NotParallel) { + // if (op->parallel_unit != ParallelUnit::NotParallel) { os << ", " << ParallelUnit_NAMES[(int) op->parallel_unit] << ", " << OutputRaceStrategy_NAMES[(int) op->output_race_strategy]; - } + // } os << ")"; } diff --git a/src/index_notation/transformations.cpp b/src/index_notation/transformations.cpp index 47fc1dd55..3846da6a8 100644 --- a/src/index_notation/transformations.cpp +++ b/src/index_notation/transformations.cpp @@ -1,9 +1,16 @@ #include "taco/index_notation/transformations.h" +#include "lower/iteration_graph.h" +#include "lower/tensor_path.h" +#include "taco/cuda.h" #include "taco/index_notation/index_notation.h" +#include "taco/index_notation/index_notation_nodes_abstract.h" #include "taco/index_notation/index_notation_rewriter.h" #include "taco/index_notation/index_notation_nodes.h" +#include "taco/index_notation/index_notation_printer.h" #include "taco/error/error_messages.h" +#include "taco/index_notation/intrinsic.h" +#include "taco/type.h" #include "taco/util/collections.h" #include "taco/lower/iterator.h" #include "taco/lower/merge_lattice.h" @@ -305,6 +312,7 @@ IndexStmt Precompute::apply(IndexStmt stmt, std::string* reason) const { IndexExpr e = precompute.getExpr(); IndexVar iw = precompute.getiw(); + // these lines of code looks interesting when creating the producer consumer relationship IndexStmt consumer = forall(i, replace(s, {{e, ws(i)}})); IndexStmt producer = forall(iw, Assignment(ws(iw), replace(e, {{i,iw}}), assign.getOperator())); @@ -592,7 +600,10 @@ IndexStmt Parallelize::apply(IndexStmt stmt, std::string* reason) const { std::string reason = ""; IndexStmt rewriteParallel(IndexStmt stmt) { + std::cout << "1 rewriting IndexStmt to support parallelize schedule directive\n--------------------------------------------\n"; + // std::cout << stmt << std::endl; provGraph = ProvenanceGraph(stmt); + std::cout << "2 
rewriting IndexStmt to support parallelize schedule directive\n--------------------------------------------\n"; const auto reductionVars = getReductionVars(stmt); reductionIndexVars.clear(); @@ -607,15 +618,22 @@ IndexStmt Parallelize::apply(IndexStmt stmt, std::string* reason) const { tensorVars = createIRTensorVars(stmt); assembledByUngroupedInsert.clear(); + std::cout << "3 rewriting IndexStmt to support parallelize schedule directive\n--------------------------------------------\n"; for (const auto& result : getAssembledByUngroupedInsertion(stmt)) { assembledByUngroupedInsert.push_back(tensorVars[result]); } + std::cout << "4 rewriting IndexStmt to support parallelize schedule directive\n--------------------------------------------\n"; + // std::cout << stmt << std::endl; return rewrite(stmt); } void visit(const ForallNode* node) { + std::cout << "transformations.cpp void visit(const ForallNode* node)\n"; + std::cout << "node: \n" << node << std::endl; Forall foralli(node); + std::cout << "foralli: \n" << foralli << std::endl; + std::cout << "before stmt update stmt: \n" << stmt << std::endl; IndexVar i = parallelize.geti(); definedIndexVars.insert(foralli.getIndexVar()); @@ -632,6 +650,7 @@ IndexStmt Parallelize::apply(IndexStmt stmt, std::string* reason) const { Iterators iterators(foralli, tensorVars); MergeLattice lattice = MergeLattice::make(foralli, iterators, provGraph, definedIndexVars); + std::cout << "iter: " << i << ", lattice: \n" << lattice << std::endl; // Precondition 2: No coiteration of modes (i.e., merge lattice has // only one iterator) @@ -660,6 +679,7 @@ IndexStmt Parallelize::apply(IndexStmt stmt, std::string* reason) const { MergeLattice underivedLattice = MergeLattice::make(underivedForall, iterators, provGraph, definedIndexVars); + std::cout << "iter: " << i << ", underivedLattice: \n" << lattice << std::endl; // Precondition 3: Every result iterator must have insert capability for (Iterator iterator : underivedLattice.results()) { @@ 
-721,6 +741,7 @@ IndexStmt Parallelize::apply(IndexStmt stmt, std::string* reason) const { // build consumer that writes from temporary to output, mark consumer as parallel reduction ParallelUnit reductionUnit = ParallelUnit::CPUThreadGroupReduction; if (should_use_CUDA_codegen()) { + std::cout << "should_use_CUDA_codegen() true\n"; if (parentParallelUnits.count(ParallelUnit::GPUWarp)) { reductionUnit = ParallelUnit::GPUWarpReduction; } @@ -728,6 +749,9 @@ IndexStmt Parallelize::apply(IndexStmt stmt, std::string* reason) const { reductionUnit = ParallelUnit::GPUBlockReduction; } } + else { + std::cout << "should_use_CUDA_codegen() false\n"; + } IndexStmt consumer = forall(i, Assignment(assignment->lhs, w(i), assignment->op), reductionUnit, OutputRaceStrategy::ParallelReduction); precomputed_stmt = where(consumer, producer); } @@ -746,8 +770,9 @@ IndexStmt Parallelize::apply(IndexStmt stmt, std::string* reason) const { return; } - + std::cout << "updated stmt: \n"; stmt = forall(i, foralli.getStmt(), parallelize.getParallelUnit(), parallelize.getOutputRaceStrategy(), foralli.getUnrollFactor()); + std::cout << stmt << std::endl; return; } @@ -1181,6 +1206,7 @@ std::ostream& operator<<(std::ostream& os, IndexStmt parallelizeOuterLoop(IndexStmt stmt) { // get outer ForAll + std::cout << "get outer ForAll ----------------- \n"; Forall forall; bool matched = false; match(stmt, @@ -1216,6 +1242,7 @@ IndexStmt parallelizeOuterLoop(IndexStmt stmt) { return parallelized256; } else { + std::cout << "outer loop parallelization for CPU codgen index statement\n"; IndexStmt parallelized = Parallelize(forall.getIndexVar(), ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces).apply(stmt, &reason); if (parallelized == IndexStmt()) { // can't parallelize @@ -1274,6 +1301,7 @@ static vector topologicallySort(map> hardDeps, map> softDeps, vector originalOrder) { + std::cout << "originalOrder: " << std::endl; vector sortedVars; unsigned long countVars = originalOrder.size(); while 
(sortedVars.size() < countVars) { @@ -1295,6 +1323,9 @@ topologicallySort(map> hardDeps, } // No free var found there is a cycle + std::cout << "this is where the assert fails\n"; + std::cout << "freeVarPos: " << freeVarPos << std::endl; + std::cout << "limit: " << std::numeric_limits::max() << std::endl; taco_iassert(freeVarPos != std::numeric_limits::max()) << "Cycles in iteration graphs must be resolved, through transpose, " << "before the expression is passed to the topological sorting " @@ -1320,8 +1351,674 @@ topologicallySort(map> hardDeps, return sortedVars; } +bool checkFromBack(const TensorPath& resultTensorPath, + const vector& tensorPaths, + string& removedAccessNode, + vector& producerVars, + vector& consumerVars, + vector& modifiedResultIndexesAccessed, + vector& sortedAllIndexes) { + + std::cout << "check from back function execution\n"; + + const std::vector& resultIndexesVisited = resultTensorPath.getVariables(); + IndexVar lastVisitedIndexVar = resultIndexesVisited.back(); + + std::cout << "last visited index variable: " << lastVisitedIndexVar << std::endl; + + bool onlyLastTensorContainLastIndexOfOutput = true; + bool fissionFromBack = false; + + // check from the back + for (unsigned long i=0; i& indexesVisited = otherIndexPaths.getVariables(); + cout << "index paths: " << otherIndexPaths << endl; + + // if (i < tensorPaths.size()-1) { + // check if other tensors also contain last index of output tensor + for (auto index : indexesVisited) { + cout << "checking " << index << " " << lastVisitedIndexVar << endl; + if (index == lastVisitedIndexVar) { + onlyLastTensorContainLastIndexOfOutput = false; + } + } + // } + } + + if (onlyLastTensorContainLastIndexOfOutput) { // last accessed tensorVariable + const TensorPath& otherIndexPaths = tensorPaths.back(); + const vector& indexesVisited = otherIndexPaths.getVariables(); + cout << "index paths: " << otherIndexPaths << endl; + + cout << "index variable maybe removed from the back\n"; + auto 
lastTensorLastVisited = indexesVisited.back(); + cout << "last index last visited " << lastTensorLastVisited << endl; + + if (lastTensorLastVisited == lastVisitedIndexVar) { + cout << "we can diffuse from the back\n"; + fissionFromBack = true; + removedAccessNode = otherIndexPaths.getAccess().getTensorVar().getName(); + cout << "removed access node " << removedAccessNode << endl; + + // mark producer accessed index variables + for (auto indexVar : sortedAllIndexes) { + if (indexVar != lastVisitedIndexVar) { // add everything except the last accessed index + std::cout << "producer vars: " << indexVar << std::endl; + producerVars.push_back(indexVar); + } + } + + for (auto indexVar : sortedAllIndexes) { + if (indexVar != lastVisitedIndexVar) { + if ( + find(resultIndexesVisited.begin(), resultIndexesVisited.end(), indexVar) + != resultIndexesVisited.end() || + find(indexesVisited.begin(), indexesVisited.end(), indexVar) + != indexesVisited.end() + ) { + modifiedResultIndexesAccessed.push_back(indexVar); + } + } + } + + // // get modified index for the intermediate calculated tensor expression + // for (unsigned long j=0; j& tensorPaths, + string& removedAccessNode, + vector& producerVars, + vector& consumerVars, + vector& modifiedResultIndexesAccessed, + vector& sortedAllIndexes) { + + std::cout << "check from front function execution\n"; + + const std::vector& resultIndexesVisited = resultTensorPath.getVariables(); + IndexVar firstVisitedIndexVar = resultIndexesVisited.front(); + + std::cout << "first fisited index variable: " << firstVisitedIndexVar << std::endl; + std::cout << "tensor path size: " << tensorPaths.size() << std::endl; + + bool onlyFirstTensorContainFirstIndexOfOutput = true; + bool fissionFromFront = false; + + // check from the front + for (long i=tensorPaths.size()-1; i>0; i--) { // change tensor paths to recursively use the functionality + std::cout << "i: " << i << std::endl; + const TensorPath& otherIndexPaths = tensorPaths.at(i); + const 
vector& indexesVisited = otherIndexPaths.getVariables(); + cout << "index paths: " << otherIndexPaths << endl; + + if (i != 0) { // check if other tensors also contain last index of output tensor + for (auto index : indexesVisited) { + cout << "checking " << index << " " << firstVisitedIndexVar << endl; + if (index == firstVisitedIndexVar) { + onlyFirstTensorContainFirstIndexOfOutput = false; + } + } + } + } + + + if (onlyFirstTensorContainFirstIndexOfOutput) { // last accessed tensorVariable + const TensorPath& otherIndexPaths = tensorPaths.front(); + const vector& indexesVisited = otherIndexPaths.getVariables(); + cout << "index paths: " << otherIndexPaths << endl; + + cout << "index variable maybe removed from the front\n"; + auto firstTensorFirstVisited = indexesVisited.front(); + cout << "first index first visited " << firstTensorFirstVisited << endl; + + if (firstTensorFirstVisited == firstVisitedIndexVar) { + cout << "we can diffuse from the front\n"; + fissionFromFront = true; + removedAccessNode = otherIndexPaths.getAccess().getTensorVar().getName(); + cout << "removed access node " << removedAccessNode << endl; + + // mark producer accessed index variables + for (auto indexVar : sortedAllIndexes) { + if (indexVar != firstVisitedIndexVar) { // add everything except the first accessed index + producerVars.emplace_back(indexVar); + } + } + + for (auto indexVar : sortedAllIndexes) { + if (indexVar != firstVisitedIndexVar) { + if ( + find(resultIndexesVisited.begin(), resultIndexesVisited.end(), indexVar) + != resultIndexesVisited.end() || + find(indexesVisited.begin(), indexesVisited.end(), indexVar) + != indexesVisited.end() + ) { + modifiedResultIndexesAccessed.push_back(indexVar); + } + } + } + + std::cout << "printing modifiedResultIndexesAccessed\n"; + for (auto& idx : modifiedResultIndexesAccessed) { + std::cout << "modifiedResultIndexesAccessed: " << idx << std::endl; + } + std::cout << "printed modifiedResultIndexesAccessed\n"; + + // get modified 
index for the intermediate calculated tensor expression + // for (unsigned long j=0; j forallParallelUnit; + map forallOutputRaceStrategy; + vector sortedIndexes; + Assignment innerBody; + + SortedIndexVars() {}; + + void visit(const ForallNode* node) { + Forall forallNode(node); + IndexVar i = forallNode.getIndexVar(); + std::cout << forallNode << std::endl; + + sortedIndexes.push_back(i); + forallParallelUnit[i] = forallNode.getParallelUnit(); + forallOutputRaceStrategy[i] = forallNode.getOutputRaceStrategy(); + + if (isa(forallNode.getStmt())) { + cout << "assignment node found: " << forallNode.getStmt() << endl;; + innerBody = to(forallNode.getStmt()); + return; // Only reorder first contiguous section of ForAlls + } + + IndexNotationVisitor::visit(node); + } + }; + + std::cout << "traversing through the index statement\n"; + SortedIndexVars sortedIndexVars; + stmt.accept(&sortedIndexVars); + std::cout << std::endl; + + struct IndexExprBuilder : public IndexNotationVisitor { + + using IndexNotationVisitor::visit; + vector accessLeftToRight; + map>> indexDimensionsMap; + + void visit(const AccessNode* node) { + Access accessNode(node); + std::cout << "access node: " << accessNode << std::endl; + accessLeftToRight.push_back(accessNode); + + TensorVar tensorVar = accessNode.getTensorVar(); + + for (unsigned long i=0; i < accessNode.getIndexVars().size(); i++) { + auto var = accessNode.getIndexVars()[i]; + + if (indexDimensionsMap.find(var) != indexDimensionsMap.end()) { + indexDimensionsMap[var].emplace_back( + pair(tensorVar.getType().getShape().getDimension(i), + tensorVar.getType())); + } + else { + indexDimensionsMap[var] = { + pair( + tensorVar.getType().getShape().getDimension(i), + tensorVar.getType()) + }; + } + } + + } + + }; + + IndexExpr rhsExpr = assignment.getRhs(); + Access lhsAccess = to(assignment.getLhs()); + std::cout << "right hand side expression: " << rhsExpr << std::endl; + IndexExprBuilder indexExprBuilder; + 
rhsExpr.accept(&indexExprBuilder); + TensorVar resultVar = lhsAccess.getTensorVar(); + + for (auto item : indexExprBuilder.indexDimensionsMap) { + auto indexVar = item.first; + cout << "var: " << indexVar << " "; + for (auto elem : item.second) { + cout << elem.first << " " << elem.second << " " ; + } + cout << endl; + } + + + // now I have the iteration graph + IterationGraph iterationGraph = IterationGraph::make(assignment); + std::cout << "/*******************************************/\n"; + std::cout << "/********** ITERATION GRAPH ****************/\n"; + std::cout << "/*******************************************/\n"; + std::cout << iterationGraph << std::endl; + + const TensorPath& resultTensorPath = iterationGraph.getResultTensorPath(); + const std::vector& tensorPaths = iterationGraph.getTensorPaths(); + + + string removedAccessNode; + vector producerVars; // producer accessed index variables + vector consumerVars; // consumer accessed index variables + vector fusedVars; + vector modifiedResultIndexesAccessed; + bool fissionFromBack = false; + if (side == "b") { + fissionFromBack = true; + } + + if (fissionFromBack) { + fissionFromBack = checkFromBack(resultTensorPath, tensorPaths, + removedAccessNode, producerVars, consumerVars, + modifiedResultIndexesAccessed, sortedIndexVars.sortedIndexes + ); + } + + bool fissionFromFront = false; + if (side == "f") { + fissionFromFront = true; + } + if (fissionFromBack == false && fissionFromFront) { + fissionFromFront = checkFromFront(resultTensorPath, tensorPaths, + removedAccessNode, producerVars, consumerVars, + modifiedResultIndexesAccessed, sortedIndexVars.sortedIndexes + ); + } + + if (!fissionFromBack && !fissionFromFront) { + cout << "fission operation cannot be performed from the back\n"; + return stmt; + } + + vector newAccessDims{}; + for (auto var : modifiedResultIndexesAccessed) { + auto item = indexExprBuilder.indexDimensionsMap[var]; + cout << "shared vars: " << var << endl; + 
newAccessDims.emplace_back(item[0].first); + } + TensorVar newAccessVar(resultVar.getName() + "_inner", + Type(resultVar.getType().getDataType(), newAccessDims)); + cout << "new inner assignment statement: " << modifiedResultIndexesAccessed << std::endl; + Access newResultAccess(newAccessVar, modifiedResultIndexesAccessed); + cout << "new access variable for iterative apply: " << newResultAccess << std::endl; + + if (fissionFromBack) { + std::cout << "fission from the back is possible\n"; + } + if (fissionFromFront) { + std::cout << "fission from the front is possible\n"; + } + + // // check from the front + // struct IndexExprSeparator : public IndexNotationVisitor { + + // using IndexNotationVisitor::visit; + // vector accessLeftToRight; + + // void visit(const MulNode* node) { + // Mul mulNode(node); + // IndexExpr lhs = mulNode.getA(); + // IndexExpr rhs = mulNode.getB(); + // std::cout << "access node: " << accessNode << std::endl; + // accessLeftToRight.push_back(accessNode); + // } + + // }; + + + cout << "\n\nProducer accessed index variables\n"; + auto it = producerVars.begin(); + for (; it != producerVars.end(); it++) { + cout << *it << endl; + } + cout << "\n\nConsumer accessed index variables\n"; + it = consumerVars.begin(); + for (; it != consumerVars.end(); it++) { + cout << *it << endl; + } + cout << endl << endl; + + // check common vars that can be fused + for (auto var : sortedIndexVars.sortedIndexes) { + if (find(producerVars.begin(), producerVars.end(), var) != producerVars.end() && + find(consumerVars.begin(), consumerVars.end(), var) != consumerVars.end()) { + fusedVars.emplace_back(var); + } + else { + break; + } + } + + for (auto& fv : fusedVars) { + std::cout << "fusable vars: " << fv << std::endl; + } + + vector sharedVars; + for (auto var : sortedIndexVars.sortedIndexes) { + if (find(fusedVars.begin(), fusedVars.end(), var) == fusedVars.end() && + find(producerVars.begin(), producerVars.end(), var) != producerVars.end() && + 
find(consumerVars.begin(), consumerVars.end(), var) != consumerVars.end() + ) { + sharedVars.emplace_back(var); + } + } + + for (auto& sv : sharedVars) { + std::cout << "shared vars: " << sv << std::endl; + } + + vector sharedDims{}; + for (auto var : sharedVars) { + auto item = indexExprBuilder.indexDimensionsMap[var]; + cout << "shared vars: " << var << endl; + sharedDims.emplace_back(item[0].first); + } + + + // get removing tensorvars and workspace dimension + const Type& type = resultTensorPath.getAccess().getTensorVar().getType(); + const Format& format = resultTensorPath.getAccess().getTensorVar().getFormat(); + TensorVar intermediateTensor("ws", type, format); + cout << intermediateTensor << endl; + + // TensorVar A("A", Type(), taco::dense); + TensorVar tempVar("t" + resultVar.getName(), + Type(resultVar.getType().getDataType(), sharedDims)); + cout << "tensor order: " << tempVar.getOrder() << endl; + cout << "tensor format: " << tempVar.getFormat() << endl; + cout << "format order: " << tempVar.getFormat().getOrder() << endl; + + // TensorVar* a = new TensorVar("A", Type()); + // TensorVar ws("ws", Type(type(), {jdim}) ); + + // get removing indexExpr and the rest of the indexExpr + Access workspace(tempVar, sharedVars); + std::cout << "workspace access tensor: " << workspace << std::endl; + + + + // construct producer expression right hand side + cout << "generating consumer expression\n"; + IndexExpr producerExpr; + int num_muls = 0; + for (Access accessNode : indexExprBuilder.accessLeftToRight) { + std::cout << "accessNodes: " << accessNode << endl; + if (removedAccessNode != accessNode.getTensorVar().getName()) { + if (producerExpr == NULL) { + std::cout << "index expression is null"; + producerExpr = accessNode; + std::cout << "producerExpr: " << producerExpr << std::endl; + } else { + num_muls++; + producerExpr = producerExpr * accessNode; + std::cout << "producerExpr: " << producerExpr << std::endl; + } + } + } + std::cout << producerExpr << 
std::endl; + Assignment producerAssignment(newResultAccess, + producerExpr); + std::cout << "new inner assignment statement: " << producerAssignment << std::endl; + Assignment producerInnerBody(workspace, + producerExpr, + sortedIndexVars.innerBody.getOperator() + ); + std::cout << "producerInnerBody: " << producerInnerBody << std::endl; + + // construct consumer expression right hand side + IndexExpr consumerExpr; + if (fissionFromBack) { + consumerExpr = workspace; + } + cout << "generating consumer expression: " << consumerExpr << std::endl; + for (Access accessNode : indexExprBuilder.accessLeftToRight) { + TensorVar tv = accessNode.getTensorVar(); + std::cout << "accessNodes: " << accessNode << endl; + if (removedAccessNode == accessNode.getTensorVar().getName()) { + if (consumerExpr == NULL) { + std::cout << "index expression is null"; + consumerExpr = accessNode; + std::cout << "consumerExpr: " << consumerExpr << std::endl; + } else { + consumerExpr = consumerExpr * accessNode; + std::cout << "consumerExpr: " << consumerExpr << std::endl; + } + } + } + if (fissionFromFront) { + consumerExpr = consumerExpr * workspace; + } + Assignment consumerInnerBody(lhsAccess, + consumerExpr, + sortedIndexVars.innerBody.getOperator() + ); + + cout << "Producer inner body: " << producerInnerBody << endl; + cout << "Consumer inner body: " << consumerInnerBody << endl; + + // rewrite indexstmt + // Reorder Foralls use a rewriter in case new nodes introduced outside of Forall + struct ProducerConsumerRewriter : public IndexNotationRewriter { + using IndexNotationRewriter::visit; + + const vector& producerConsumerVars; + const vector& fusedVars; + IndexStmt innerBody; + const map forallParallelUnit; + const map forallOutputRaceStrategy; + + ProducerConsumerRewriter(const vector& producerConsumerVars, + const vector& fusedVars, IndexStmt innerBody, + const map forallParallelUnit, + const map forallOutputRaceStrategy) + : producerConsumerVars(producerConsumerVars), 
fusedVars(fusedVars), innerBody(innerBody), + forallParallelUnit(forallParallelUnit), forallOutputRaceStrategy(forallOutputRaceStrategy) { + } + + void visit(const ForallNode* node) { + Forall foralli(node); + IndexVar i = foralli.getIndexVar(); + cout << "going through var: " << i << endl; + + // first forall must be in collected variables + // taco_iassert(util::contains(producerVars, i)); + // std::cout << "\ninner body of the statement\n" << innerBody; + // // done in reverse order? + // for (auto it = sortedVars.rbegin(); it != sortedVars.rend(); ++it) { + // stmt = forall(*it, stmt, forallParallelUnit.at(*it), forallOutputRaceStrategy.at(*it), foralli.getUnrollFactor()); + // } + stmt = rewrite(foralli.getStmt()); + cout << "after rewrite statement: " << stmt << endl; + + // omit the index variables in the fusedVar list + if (find(fusedVars.begin(), fusedVars.end(), i) == fusedVars.end() && + find(producerConsumerVars.begin(), producerConsumerVars.end(), i) != producerConsumerVars.end()) { + stmt = forall(i, stmt, forallParallelUnit.at(i), forallOutputRaceStrategy.at(i), foralli.getUnrollFactor()); + } + } + + void visit (const AssignmentNode* node) { + cout << "assignment node: " << node << endl; + stmt = innerBody; + cout << "producerStmt: " << innerBody << endl; + cout << "stmt: " << stmt << endl; + } + + }; + ProducerConsumerRewriter producerRewriter(producerVars, fusedVars, + producerInnerBody, + sortedIndexVars.forallParallelUnit, + sortedIndexVars.forallOutputRaceStrategy); + IndexStmt producerStmt = producerRewriter.rewrite(stmt); + std::cout << "\nAfter Producer rewriter\n"; + std::cout << producerStmt << std::endl; + if (num_muls > 1) { + producerStmt = loopFusionOverFission(producerStmt, producerInnerBody, + side, iters-1); + } + + + ProducerConsumerRewriter consumerRewriter(consumerVars, fusedVars, + consumerInnerBody, + sortedIndexVars.forallParallelUnit, + sortedIndexVars.forallOutputRaceStrategy); + IndexStmt consumerStmt = 
consumerRewriter.rewrite(stmt); + std::cout << "\nAfter Consumer rewriter\n"; + std::cout << consumerStmt << std::endl; + + + struct CombineProducerConsumerRewriter : public IndexNotationRewriter { + + const vector& fusedVars; + IndexStmt consumerStmt; + IndexStmt producerStmt; + const map forallParallelUnit; + const map forallOutputRaceStrategy; + + CombineProducerConsumerRewriter(const vector& fusedVars, + IndexStmt producerStmt, IndexStmt consumerStmt, + const map forallParallelUnit, + const map forallOutputRaceStrategy) + : fusedVars(fusedVars), consumerStmt(consumerStmt), producerStmt(producerStmt), + forallParallelUnit(forallParallelUnit), + forallOutputRaceStrategy(forallOutputRaceStrategy) {} + + using IndexNotationRewriter::visit; + + void visit(const ForallNode* node) { + Forall foralli(node); + IndexVar i = foralli.getIndexVar(); + cout << "going through var: " << i << endl; + + // omit the index variables in the fusedVar list + if (find(fusedVars.begin(), fusedVars.end(), i) != fusedVars.end()) { + cout << "fused var in stmt\n"; + stmt = rewrite(foralli.getStmt()); + cout << "rewritten stmt: " << stmt << endl; + stmt = forall(i, stmt, forallParallelUnit.at(i), forallOutputRaceStrategy.at(i), foralli.getUnrollFactor()); + } + else { + cout << "fused var not in stmt\n"; + cout << "producerStmt: " << producerStmt << endl; + cout << "consumerStmt: " << consumerStmt << endl; + stmt = where(consumerStmt, producerStmt); + cout << "where stmt: " << stmt << endl; + } + + cout << "after rewrite statement: " << stmt << endl; + } + + }; + + CombineProducerConsumerRewriter combineRewriter(fusedVars, + producerStmt, consumerStmt, + sortedIndexVars.forallParallelUnit, + sortedIndexVars.forallOutputRaceStrategy); + IndexStmt combinedStmt = combineRewriter.rewrite(stmt); + std::cout << "\nAfter Combine rewriter\n"; + std::cout << combinedStmt << std::endl; + + + return combinedStmt; + +} + IndexStmt reorderLoopsTopologically(IndexStmt stmt) { + std::cout << "executing 
reorderLoopsTopologically\n"; // Collect tensorLevelVars which stores the pairs of IndexVar and tensor // level that each tensor is accessed at struct DAGBuilder : public IndexNotationVisitor { @@ -1382,8 +2079,11 @@ IndexStmt reorderLoopsTopologically(IndexStmt stmt) { }; Iterators iterators(stmt); + std::cout << "DAG builder with iterators" << std::endl; DAGBuilder dagBuilder(iterators); stmt.accept(&dagBuilder); + std::cout << "After DAGBuilder\n"; + std::cout << stmt << std::endl; // Construct tensor dependencies (sorted list of IndexVars) from tensorLevelVars map>> tensorVarOrders; @@ -1391,6 +2091,7 @@ IndexStmt reorderLoopsTopologically(IndexStmt stmt) { tensorVarOrders[tensorLevelVar.first] = varOrderFromTensorLevels(tensorLevelVar.second); } + // hard dependencies const auto hardDeps = depsFromVarOrders(tensorVarOrders); struct CollectSoftDependencies : public IndexNotationVisitor { @@ -1412,12 +2113,17 @@ IndexStmt reorderLoopsTopologically(IndexStmt stmt) { } } }; + // soft dependencies CollectSoftDependencies collectSoftDeps; stmt.accept(&collectSoftDeps); + std::cout << "After CollectSoftDependencies\n"; + std::cout << stmt << std::endl; + // topological sort const auto sortedVars = topologicallySort(hardDeps, collectSoftDeps.softDeps, dagBuilder.indexVarOriginalOrder); + // rewrite indexstmt // Reorder Foralls use a rewriter in case new nodes introduced outside of Forall struct TopoReorderRewriter : public IndexNotationRewriter { using IndexNotationRewriter::visit; @@ -1440,7 +2146,9 @@ IndexStmt reorderLoopsTopologically(IndexStmt stmt) { // first forall must be in collected variables taco_iassert(util::contains(sortedVars, i)); + std::cout << "\ninner body of the statement\n" << innerBody; stmt = innerBody; + // done in reverse order? 
for (auto it = sortedVars.rbegin(); it != sortedVars.rend(); ++it) { stmt = forall(*it, stmt, forallParallelUnit.at(*it), forallOutputRaceStrategy.at(*it), foralli.getUnrollFactor()); } @@ -1450,7 +2158,11 @@ IndexStmt reorderLoopsTopologically(IndexStmt stmt) { }; TopoReorderRewriter rewriter(sortedVars, dagBuilder.innerBody, dagBuilder.forallParallelUnit, dagBuilder.forallOutputRaceStrategy); - return rewriter.rewrite(stmt); + IndexStmt stmtChanged = rewriter.rewrite(stmt); + std::cout << "After TopoReorderRewriter\n"; + std::cout << stmtChanged << std::endl; + + return stmtChanged; } IndexStmt scalarPromote(IndexStmt stmt, ProvenanceGraph provGraph, @@ -1478,6 +2190,7 @@ IndexStmt scalarPromote(IndexStmt stmt, ProvenanceGraph provGraph, void visit(const ForallNode* node) { Forall foralli(node); + std::cout << "scalar promote: " << foralli << std::endl; IndexVar i = foralli.getIndexVar(); // Don't allow hoisting out of forall's for GPU warp and block reduction diff --git a/src/ir/ir_printer.cpp b/src/ir/ir_printer.cpp index a1997a9b7..0bc848148 100644 --- a/src/ir/ir_printer.cpp +++ b/src/ir/ir_printer.cpp @@ -1,6 +1,7 @@ #include #include +#include "taco/cuda.h" #include "taco/ir/ir.h" #include "taco/ir/ir_printer.h" #include "taco/ir/simplify.h" @@ -59,10 +60,13 @@ void IRPrinter::print(Stmt stmt) { } void IRPrinter::visit(const Literal* op) { + if (color) { stream << blue ; } + // It seems this is where all the types get printed in the final code generation. + // Come up with a way to generate different values if stream2 is used to generate ispc code switch (op->type.getKind()) { case Datatype::Bool: stream << op->getValue(); @@ -99,11 +103,11 @@ void IRPrinter::visit(const Literal* op) { break; case Datatype::Float32: stream << ((op->getValue() != 0.0) - ? util::toString(op->getValue()) : "0.0"); + ? util::toString(op->getValue()) : "0.0"); break; case Datatype::Float64: stream << ((op->getValue()!=0.0) - ? util::toString(op->getValue()) : "0.0"); + ? 
util::toString(op->getValue()) : "0.0"); break; case Datatype::Complex64: { std::complex val = op->getValue>(); @@ -123,6 +127,10 @@ void IRPrinter::visit(const Literal* op) { if (color) { stream << nc; } + + + + } void IRPrinter::visit(const Var* op) { @@ -132,6 +140,7 @@ void IRPrinter::visit(const Var* op) { else { stream << op->name; } + } void IRPrinter::visit(const Neg* op) { @@ -283,6 +292,7 @@ void IRPrinter::visit(const IfThenElse* op) { stream << "}"; } stream << endl; + } void IRPrinter::visit(const Case* op) { @@ -377,12 +387,13 @@ void IRPrinter::visit(const Store* op) { op->data.accept(this); stream << ";"; stream << endl; + } void IRPrinter::visit(const For* op) { doIndent(); stream << keywordString("for") << " (" - << keywordString(util::toString(op->var.type())) << " "; + << keywordString(util::toString(op->var.type())) << " "; op->var.accept(this); stream << " = "; op->start.accept(this); @@ -396,7 +407,7 @@ void IRPrinter::visit(const For* op) { auto lit = op->increment.as(); if (lit != nullptr && ((lit->type.isInt() && lit->equalsScalar(1)) || - (lit->type.isUInt() && lit->equalsScalar(1)))) { + (lit->type.isUInt() && lit->equalsScalar(1)))) { stream << "++"; } else { @@ -408,7 +419,8 @@ void IRPrinter::visit(const For* op) { op->contents.accept(this); doIndent(); stream << "}"; - stream << endl; + stream << endl; + } void IRPrinter::visit(const While* op) { @@ -452,6 +464,7 @@ void IRPrinter::visit(const Function* op) { doIndent(); stream << "}"; + } void IRPrinter::visit(const VarDecl* op) { @@ -470,6 +483,7 @@ void IRPrinter::visit(const VarDecl* op) { op->rhs.accept(this); stream << ";"; stream << endl; + } void IRPrinter::visit(const Assign* op) { @@ -483,7 +497,7 @@ void IRPrinter::visit(const Assign* op) { if (add->a == op->lhs) { const Literal* lit = add->b.as(); if (lit != nullptr && ((lit->type.isInt() && lit->equalsScalar(1)) || - (lit->type.isUInt() && lit->equalsScalar(1)))) { + (lit->type.isUInt() && lit->equalsScalar(1)))) { 
stream << "++"; } else { diff --git a/src/ir/ir_rewriter.cpp b/src/ir/ir_rewriter.cpp index eed6f2bab..2e4827497 100644 --- a/src/ir/ir_rewriter.cpp +++ b/src/ir/ir_rewriter.cpp @@ -292,6 +292,7 @@ void IRRewriter::visit(const Store* op) { } void IRRewriter::visit(const For* op) { + // std::cout << "This is IRRewriter::visit(const For* op) method: For: " << op << std::endl; Expr var = rewrite(op->var); Expr start = rewrite(op->start); Expr end = rewrite(op->end); diff --git a/src/lower/iteration_graph.cpp b/src/lower/iteration_graph.cpp index 77735a8d2..b25f820c1 100644 --- a/src/lower/iteration_graph.cpp +++ b/src/lower/iteration_graph.cpp @@ -64,8 +64,9 @@ IterationGraph IterationGraph::make(Assignment assignment) { oldToSplitVar.insert({indexVar, indexVar}); } + // access nodes of right hand side match(expr, - function([&](const AccessNode* op) { + function([&](const AccessNode* op) { auto type = op->tensorVar.getType(); taco_iassert((size_t)type.getShape().getOrder() == op->indexVars.size()) << "Tensor access " << IndexExpr(op) << " but tensor format only has " diff --git a/src/lower/lowerer_impl_imperative.cpp b/src/lower/lowerer_impl_imperative.cpp index b4c9ea710..e8947337d 100644 --- a/src/lower/lowerer_impl_imperative.cpp +++ b/src/lower/lowerer_impl_imperative.cpp @@ -421,7 +421,6 @@ Stmt LowererImplImperative::lowerAssignment(Assignment assignment) Expr var = getTensorVar(result); const bool needComputeAssign = util::contains(needCompute, result); - Expr rhs; if (needComputeAssign) { rhs = lower(assignment.getRhs()); @@ -817,7 +816,6 @@ Stmt LowererImplImperative::lowerForall(Forall forall) forall.getStmt(), reducedAccesses); } // taco_iassert(loops.defined()); - if (!generateComputeCode() && !hasStores(loops)) { // If assembly loop does not modify output arrays, then it can be safely // omitted. 
diff --git a/src/lower/tensor_path.h b/src/lower/tensor_path.h index 4f5dc49af..da52fb782 100644 --- a/src/lower/tensor_path.h +++ b/src/lower/tensor_path.h @@ -2,6 +2,7 @@ #define TACO_TENSOR_PATH_H #include +#include #include #include "taco/util/comparable.h" @@ -47,14 +48,13 @@ class TensorPath : public util::Comparable { friend bool operator==(const TensorPath&, const TensorPath&); friend bool operator<(const TensorPath&, const TensorPath&); + friend std::ostream& operator<<(std::ostream&, const TensorPath&); private: struct Content; std::shared_ptr content; }; -std::ostream& operator<<(std::ostream&, const TensorPath&); - /// A step along a tensor path. class TensorPathStep : public util::Comparable { diff --git a/src/tensor.cpp b/src/tensor.cpp index fab437ff1..30a821c9d 100644 --- a/src/tensor.cpp +++ b/src/tensor.cpp @@ -10,6 +10,7 @@ #include #include +#include "../test/util.h" #include "taco/cuda.h" #include "taco/format.h" #include "taco/taco_tensor_t.h" @@ -278,6 +279,7 @@ static size_t unpackTensorData(const taco_tensor_t& tensorData, /// Pack coordinates into a data structure given by the tensor format. 
void TensorBase::pack() { + std::cout << "TensorBase::Pack() method\n"; if (!needsPack()) { return; } @@ -346,6 +348,7 @@ void TensorBase::pack() { taco_iassert((content->coordinateBufferUsed % content->coordinateSize) == 0); const size_t numCoordinates = content->coordinateBufferUsed / content->coordinateSize; + std::cout << "call helperFuncs\n"; const auto helperFuncs = getHelperFunctions(getFormat(), getComponentType(), dimensions); @@ -619,10 +622,12 @@ void TensorBase::compile() { IndexStmt stmt = makeConcreteNotation(makeReductionNotation(assignment)); stmt = reorderLoopsTopologically(stmt); stmt = insertTemporaries(stmt); + std::cout << "calling parallelizeOuterLoop(stmt)\n"; stmt = parallelizeOuterLoop(stmt); compile(stmt, content->assembleWhileCompute); } void TensorBase::compile(taco::IndexStmt stmt, bool assembleWhileCompute) { + std::cout << "TensorBase::compile\n"; if (!needsCompile()) { return; } @@ -802,6 +807,63 @@ void TensorBase::assemble() { } } +void TensorBase::compute(std::ofstream& statfile, std::string& sofile) { + taco_uassert(!needsCompile()) << error::compute_without_compile; + // if (!needsCompute()) { + // return; + // } + setNeedsCompute(false); + // Sync operand tensors if needed. 
+ auto operands = getTensors(getAssignment().getRhs()); + for (auto& operand : operands) { + // std::cout << "operand: " << operand.second << std::endl; + operand.second.syncValues(); + operand.second.removeDependentTensor(*this); + } + + auto arguments = packArguments(*this); + + taco::util::TimeResults timevalue; + bool time = true; + TOOL_BENCHMARK_TIMER2(this->content->module->callFuncPacked("compute", sofile, arguments.data()), + "\nkernel execution time: ", timevalue); + // this->content->module->callFuncPacked("compute", arguments.data()); + + if (content->assembleWhileCompute) { + setNeedsAssemble(false); + taco_tensor_t* tensorData = ((taco_tensor_t*)arguments[0]); + content->valuesSize = unpackTensorData(*tensorData, *this); + } +} + +void TensorBase::compute(std::ofstream& statfile) { + taco_uassert(!needsCompile()) << error::compute_without_compile; + // if (!needsCompute()) { + // return; + // } + setNeedsCompute(false); + // Sync operand tensors if needed. + auto operands = getTensors(getAssignment().getRhs()); + for (auto& operand : operands) { + operand.second.syncValues(); + operand.second.removeDependentTensor(*this); + } + + auto arguments = packArguments(*this); + + taco::util::TimeResults timevalue; + bool time = true; + TOOL_BENCHMARK_TIMER2(this->content->module->callFuncPacked("compute", arguments.data()), + "\nkernel execution time: ", timevalue); + // this->content->module->callFuncPacked("compute", arguments.data()); + + if (content->assembleWhileCompute) { + setNeedsAssemble(false); + taco_tensor_t* tensorData = ((taco_tensor_t*)arguments[0]); + content->valuesSize = unpackTensorData(*tensorData, *this); + } +} + void TensorBase::compute() { taco_uassert(!needsCompile()) << error::compute_without_compile; if (!needsCompute()) { @@ -816,7 +878,9 @@ void TensorBase::compute() { } auto arguments = packArguments(*this); + std::cout << "running the compute function from the shared library\n"; this->content->module->callFuncPacked("compute", 
arguments.data()); + std::cout << "compute function executed\n"; if (content->assembleWhileCompute) { setNeedsAssemble(false); @@ -951,6 +1015,7 @@ TensorBase::getHelperFunctions(const Format& format, Datatype ctype, } // Lower packing and iterator code. + std::cout << "1 Lower packing and iterator code\n"; helperModule->addFunction(lower(packStmt, "pack", true, true)); helperModule->addFunction(lower(iterateStmt, "iterate", false, true)); } else { @@ -964,12 +1029,14 @@ TensorBase::getHelperFunctions(const Format& format, Datatype ctype, IndexVar indexVar; IndexStmt assignment = (packedScalar() = bufferVector(indexVar)); IndexStmt packStmt= makeConcreteNotation(makeReductionNotation(assignment)); + std::cout << "2 Lower packing and iterator code\n"; helperModule->addFunction(lower(packStmt, "pack", true, true)); // Define and lower iterator code. IndexStmt iterateStmt = Yield({}, packedScalar()); helperModule->addFunction(lower(iterateStmt, "iterate", false, true)); } + std::cout << "Compiling the helperModule\n"; helperModule->compile(); helperFunctionsMutex.lock(); diff --git a/taco-uml.wsd b/taco-uml.wsd new file mode 100644 index 000000000..4b8e39802 --- /dev/null +++ b/taco-uml.wsd @@ -0,0 +1,411 @@ +@startuml taco +scale 1 + + +class IntrusivePtr { + +T *ptr +} +class Uncopyable {} + +class IRNode { + +virtual void accept(IRVisitorStrict *v) const = 0 + +virtual IRNodeType type_info() const = 0; +} + +class BaseStmtNode {} +class BaseExprNode { + +Datatype type +} + +class StmtNode { + +void accept(IRVisitorStrict *v) const +} +class ExprNode { + +void accept(IRVisitorStrict *v) const +} + +Uncopyable <|-- IRNode +IRNode <|-- BaseStmtNode +IRNode <|-- BaseExprNode +BaseStmtNode <|-- StmtNode +BaseExprNode <|-- ExprNode + +class IRHandle { + +void accept(IRVisitorStrict *v) const +} +class Expr {} +class Stmt {} + +IntrusivePtr <|-- IRHandle +IRHandle <|-- Expr +IRHandle <|-- Stmt + +IRHandle "1" *-- "1" IRNode : contains + + + +' this class is abstract but 
plantuml version does not support interface keyword +interface IRVisitorStrict { + +virtual void visit(const IRNode*) const = 0 +} + +/' +IRVisitor is not an interface or abstract because it +has not pure virtual methods +'/ +class IRVisitor { + +virtual void visit(const IRNode*) +} + +class IRRewriter { + ' protected fields and methods + #Expr expr + #Stmt stmt + + #virtual void visit(const ExprNode* op) + #virtual void visit(const StmtNode* op) + + ' public fields and methods + +Expr rewrite(Expr) + +Stmt rewrite(Stmt) +} +class IRPrinter { + #std::ostream &stream + #std::ostream &stream2 + #int indent + #bool color + #bool simplify + #enum Precedence + #Precedence parentPrecedence = BOTTOM + #NameGenerator varNameGenerator + #scopedMap varNames + + #void doIndent() + #void printBinOp(Expr a, Expr b, std::string op, Precedence precedence) + #void fewMoreMethods() + + #virtual void visit(const ExprNode*) + #virtual void visit(const StmtNode*) + + +setColor(bool color) + +print(Stmt) +} +class IRVerifier {} + +IRVisitorStrict <|-- IRVisitor +IRVisitorStrict <|-- IRPrinter +IRVisitorStrict <|-- IRRewriter +IRVisitor <|-- IRVerifier + +' Inheritance from IRRewriter +' simplifier for ir::Expr +class ExpressionSimplifier {} +IRRewriter <|-- ExpressionSimplifier + +' simplifiers for ir::Stmt +class RemoveRedundantStatements {} +class RemoveRedundantLoops {} +class RemoveDuplicateBody {} + +IRRewriter <|-- RemoveRedundantStatements +IRRewriter <|-- RemoveRedundantLoops +IRRewriter <|-- RemoveDuplicateBody + + +' Inheritance from IRPrinter +class CodeGen {} +class CodeGen_C {} +class CodeGen_CUDA {} +class CodeGen_ISPC { + -class FindVars +} + +class FindVars {} + +IRPrinter <|-- CodeGen +CodeGen <|-- CodeGen_C +CodeGen <|-- CodeGen_ISPC +CodeGen <|-- CodeGen_CUDA + +IRVisitor <|-- FindVars +CodeGen_ISPC +-- FindVars + +class Manageable {} +class IndexStmtNode { + -virtual void accept(IndexStmtVisitorStrict*) const = 0 +} +class IndexExprNode { + -virtual void 
accept(IndexStmtVisitorStrict*) const = 0 +} + + +Manageable <|-- IndexStmtNode +Uncopyable <|-- IndexStmtNode +Manageable <|-- IndexExprNode +Uncopyable <|-- IndexExprNode + +class IndexStmt {} +class IndexExpr {} + +IntrusivePtr <|-- IndexStmt +IndexStmt "1" *-- "1" IndexStmtNode +IntrusivePtr <|-- IndexExpr +IndexExpr "1" *-- "1" IndexExprNode + + +abstract class IndexExprVisitorStrict { + +void visit(const IndexStmt&) + +virtual void visit(const AccessNode*) = 0 + +virtual void visit(const LiteralNode*) = 0 + +virtual void visit(const NegNode*) = 0 + +virtual void visit(const AddNode*) = 0 + +virtual void visit(const SubNode*) = 0 + +virtual void visit(const MulNode*) = 0 + +virtual void visit(const DivNode*) = 0 + +virtual void visit(const SqrtNode*) = 0 + +virtual void visit(const CastNode*) = 0 + +virtual void visit(const CallIntrinsicNode*) = 0 + +virtual void visit(const ReductionNode*) = 0 +} +abstract class IndexStmtVisitorStrict { + +void visit(const IndexStmt&) + +virtual void visit(const AssignmentNode*) = 0 + +virtual void visit(const YieldNode*) = 0 + +virtual void visit(const ForallNode*) = 0 + +virtual void visit(const WhereNode*) = 0 + +virtual void visit(const SequenceNode*) = 0 + +virtual void visit(const AssembleNode*) = 0 + +virtual void visit(const MultiNode*) = 0 + +virtual void visit(const SuchThatNode*) = 0 +} + +abstract class IndexNotationVisitorStrict {} +class IndexNotationPrinter { + +void print(const IndexExpr& expr) + +void print(const IndexStmt& expr) + + ' Index Expressions visit() + +void visit(const AccessNode* node) + +void visit(const LiteralNode* node) + + void visit(const NegNode* node) + + void visit(const AddNode* node) + + void visit(const SubNode* node) + + void visit(const MulNode* node) + + void visit(const DivNode* node) + + void visit(const SqrtNode* node) + + void visit(const CastNode* node) + + void visit(const CallIntrinsicNode* node) + + void visit(const UnaryExprNode* node) + + void visit(const BinaryExprNode* 
node) + + void visit(const ReductionNode* node) + + ' Index Statement visit() + + void visit(const AssignmentNode* node) + + void visit(const YieldNode* node) + + void visit(const ForallNode* node) + + void visit(const WhereNode* node) + + void visit(const SequenceNode* node) + + void visit(const AssembleNode* node) + + void visit(const MultiNode* node) + + void visit(const SuchThatNode* node) +} +class IndexNotationVisitor { + ' Index Expressions visit() + +virtual void visit(const AccessNode* node) + +virtual void visit(const LiteralNode* node) + +virtual void visit(const NegNode* node) + +virtual void visit(const AddNode* node) + +virtual void visit(const SubNode* node) + +virtual void visit(const MulNode* node) + +virtual void visit(const DivNode* node) + +virtual void visit(const SqrtNode* node) + +virtual void visit(const CastNode* node) + +virtual void visit(const CallIntrinsicNode* node) + +virtual void visit(const UnaryExprNode* node) + +virtual void visit(const BinaryExprNode* node) + +virtual void visit(const ReductionNode* node) + + ' Index Statement visit() + +virtual void visit(const AssignmentNode* node) + +virtual void visit(const YieldNode* node) + +virtual void visit(const ForallNode* node) + +virtual void visit(const WhereNode* node) + +virtual void visit(const SequenceNode* node) + +virtual void visit(const AssembleNode* node) + +virtual void visit(const MultiNode* node) + +virtual void visit(const SuchThatNode* node) +} +class Matcher { + +} + +abstract class IndexExprRewriterStrict { + +IndexExpr rewrite(IndexExpr) + + #IndexExpr expr + + #virtual void visit(const AccessNode* op) = 0 + #virtual void visit(const LiteralNode* op) = 0 + #virtual void visit(const NegNode* op) = 0 + #virtual void visit(const SqrtNode* op) = 0 + #virtual void visit(const AddNode* op) = 0 + #virtual void visit(const SubNode* op) = 0 + #virtual void visit(const MulNode* op) = 0 + #virtual void visit(const DivNode* op) = 0 + #virtual void visit(const CastNode* op) = 0 
+ #virtual void visit(const CallIntrinsicNode* op) = 0 + #virtual void visit(const ReductionNode* op) = 0 +} +abstract class IndexStmtRewriterStrict { + +IndexStmt rewrite(IndexStmt) + + #IndexStmt stmt + + #virtual void visit(const AssignmentNode* op) = 0 + #virtual void visit(const YieldNode* op) = 0 + #virtual void visit(const ForallNode* op) = 0 + #virtual void visit(const WhereNode* op) = 0 + #virtual void visit(const SequenceNode* op) = 0 + #virtual void visit(const AssembleNode* op) = 0 + #virtual void visit(const MultiNode* op) = 0 + #virtual void visit(const SuchThatNode* op) = 0 +} +abstract class IndexNotationRewriterStrict {} +class IndexNotationRewriter { + ' Index Expressions visit() + +virtual void visit(const AccessNode* node) + +virtual void visit(const LiteralNode* node) + +virtual void visit(const NegNode* node) + +virtual void visit(const AddNode* node) + +virtual void visit(const SubNode* node) + +virtual void visit(const MulNode* node) + +virtual void visit(const DivNode* node) + +virtual void visit(const SqrtNode* node) + +virtual void visit(const CastNode* node) + +virtual void visit(const CallIntrinsicNode* node) + +virtual void visit(const UnaryExprNode* node) + +virtual void visit(const BinaryExprNode* node) + +virtual void visit(const ReductionNode* node) + + ' Index Statement visit() + +virtual void visit(const AssignmentNode* node) + +virtual void visit(const YieldNode* node) + +virtual void visit(const ForallNode* node) + +virtual void visit(const WhereNode* node) + +virtual void visit(const SequenceNode* node) + +virtual void visit(const AssembleNode* node) + +virtual void visit(const MultiNode* node) + +virtual void visit(const SuchThatNode* node) +} + + +IndexExprVisitorStrict <|-- IndexNotationVisitorStrict +IndexStmtVisitorStrict <|-- IndexNotationVisitorStrict +IndexNotationVisitorStrict <|-- IndexNotationVisitor +IndexNotationVisitorStrict <|-- IndexNotationPrinter +IndexNotationVisitor <|-- Matcher + +IndexExprVisitorStrict 
<|-- IndexExprRewriterStrict +IndexStmtVisitorStrict <|-- IndexStmtRewriterStrict +IndexExprRewriterStrict <|-- IndexNotationRewriterStrict +IndexStmtRewriterStrict <|-- IndexNotationRewriterStrict + +IndexNotationRewriterStrict <|-- IndexNotationRewriter + +' - private +' # protected +' ~ package private +' + public + +' {static} +' {abstract} virtual methods + +' lowering part -- conversion from IndexExpr and IndexStmt to ir::Expr and ir::Stmt +class Lowerer { + +std::shared_ptr impl; +} +abstract class LowererImpl { + ' protected fields and methods + #class Visitor; + #friend class Visitor; + #std::shared_ptr visitor; + + #virtual ir::Stmt lower(IndexStmt stmt); + #virtual ir::Expr lower(IndexExpr expr); + + #virtual ir::Expr lowerExpr(IndexExpr expr) = 0; + #virtual ir::Stmt lowerStmt(IndexStmt stmt) = 0; + + ' public fields and methods + +virtual ir::Stmt lower(IndexStmt stmt, std::string name, + bool assemble, bool compute, bool pack, bool unpack) = 0; +} + +class LowererImplImperative { + ' private fields and methods + -class Visitor + -friend class Visitor + -std::shared_ptr visitor + -bool assemble + -bool compute + -vars a_bunch_of_other_fields + + ' protected fields and methods + #virtual ir::Stmt lowerExpr(IndexExpr expr); + #virtual ir::Stmt lowerStmt(IndexStmt stmt); + + ' public fields and methods + +ir::Stmt lower(IndexStmt stmt, std::string name, + bool assemble, bool compute, bool pack, bool unpack) + +} +note bottom of LowererImplImperative : Stmt LowererImplImperative::lower(IndexStmt stmt) {\n return visitor->lower(stmt);\n} + +Uncopyable <|-- LowererImpl +Lowerer "1" *-- "1" LowererImpl : contains + + +' visitor that does the lowering +class Visitor { + ' private fields and methods + -LowererImpl* impl + -Expr expr + -Stmt stmt + + -void visit(const AssignmentNode* node) + -void visit(const YieldNode* node) + -void visit(const ForallNode* node) + -void visit(const WhereNode* node) + -void visit(const MultiNode* node) + -void visit(const 
SuchThatNode* node) + -void visit(const SequenceNode* node) + -void visit(const AssembleNode* node) + -void visit(const AccessNode* node) + -void visit(const LiteralNode* node) + -void visit(const NegNode* node) + -void visit(const AddNode* node) + -void visit(const SubNode* node) + -void visit(const MulNode* node) + -void visit(const DivNode* node) + -void visit(const SqrtNode* node) + -void visit(const CastNode* node) + -void visit(const CallIntrinsicNode* node) + -void visit(const ReductionNode* node) + + ' public fields and methods + +Visitor(LowererImplImperative* impl) + +Stmt lower(IndexStmt stmt) + +Expr lower(IndexExpr expr) +} + +note bottom of Visitor: Stmt lower(IndexStmt stmt) {\n this->stmt = Stmt();\n impl->accessibleIterators.scope();\n IndexStmtVisitorStrict::visit(stmt);\n impl->accessibleIterators.unscope();\n return this->stmt;\n} + +IndexNotationVisitorStrict <|-- Visitor +LowererImpl "1" +-- "1" Visitor : contains +Visitor "1" *-- "1" LowererImpl : contains + +LowererImpl <|-- LowererImplImperative +LowererImplImperative "1" +-- "1" Visitor : contains +Visitor "1" *-- "1" LowererImplImperative : contains + +@enduml \ No newline at end of file diff --git a/test/kernels/mttkrp_gemm/mttkrp_ryan.c b/test/kernels/mttkrp_gemm/mttkrp_ryan.c new file mode 100644 index 000000000..9d0536b8c --- /dev/null +++ b/test/kernels/mttkrp_gemm/mttkrp_ryan.c @@ -0,0 +1,177 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? 
(_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = 
(taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +int assemble(taco_tensor_t *A1845, taco_tensor_t *matmul_5_5_5, taco_tensor_t *A1475, taco_tensor_t *A1416) { + int A18451_dimension = (int)(A1845->dimensions[0]); + int A18452_dimension = (int)(A1845->dimensions[1]); + double* restrict A1845_vals = (double*)(A1845->vals); + + A1845_vals = (double*)malloc(sizeof(double) * (A18451_dimension * A18452_dimension)); + + A1845->vals = (uint8_t*)A1845_vals; + return 0; +} + +int compute(taco_tensor_t *A1845, taco_tensor_t *matmul_5_5_5, taco_tensor_t *A1475, taco_tensor_t *A1416) { + int A18451_dimension = (int)(A1845->dimensions[0]); + int A18452_dimension = (int)(A1845->dimensions[1]); + double* restrict A1845_vals = (double*)(A1845->vals); + int* restrict matmul_5_5_51_pos = (int*)(matmul_5_5_5->indices[0][0]); + int* restrict matmul_5_5_51_crd = (int*)(matmul_5_5_5->indices[0][1]); + int* restrict matmul_5_5_52_pos = (int*)(matmul_5_5_5->indices[1][0]); + int* restrict matmul_5_5_52_crd = (int*)(matmul_5_5_5->indices[1][1]); + int* restrict 
matmul_5_5_53_pos = (int*)(matmul_5_5_5->indices[2][0]); + int* restrict matmul_5_5_53_crd = (int*)(matmul_5_5_5->indices[2][1]); + double* restrict matmul_5_5_5_vals = (double*)(matmul_5_5_5->vals); + int A14751_dimension = (int)(A1475->dimensions[0]); + int A14752_dimension = (int)(A1475->dimensions[1]); + double* restrict A1475_vals = (double*)(A1475->vals); + int A14161_dimension = (int)(A1416->dimensions[0]); + int A14162_dimension = (int)(A1416->dimensions[1]); + double* restrict A1416_vals = (double*)(A1416->vals); + + #pragma omp parallel for schedule(static) + for (int32_t pA1845 = 0; pA1845 < (A18451_dimension * A18452_dimension); pA1845++) { + A1845_vals[pA1845] = 0.0; + } + + #pragma omp parallel for schedule(runtime) + for (int32_t i1542matmul_5_5_5 = matmul_5_5_51_pos[0]; i1542matmul_5_5_5 < matmul_5_5_51_pos[1]; i1542matmul_5_5_5++) { + int32_t i1542 = matmul_5_5_51_crd[i1542matmul_5_5_5]; + for (int32_t i1545 = 0; i1545 < A14162_dimension; i1545++) { + int32_t i1545A1845 = i1542 * A18452_dimension + i1545; + double ti1543A1845_val = 0.0; + for (int32_t i1543matmul_5_5_5 = matmul_5_5_52_pos[i1542matmul_5_5_5]; i1543matmul_5_5_5 < matmul_5_5_52_pos[(i1542matmul_5_5_5 + 1)]; i1543matmul_5_5_5++) { + int32_t i1543 = matmul_5_5_52_crd[i1543matmul_5_5_5]; + int32_t i1545A1416 = i1543 * A14162_dimension + i1545; + for (int32_t i1544matmul_5_5_5 = matmul_5_5_53_pos[i1543matmul_5_5_5]; i1544matmul_5_5_5 < matmul_5_5_53_pos[(i1543matmul_5_5_5 + 1)]; i1544matmul_5_5_5++) { + int32_t i1544 = matmul_5_5_53_crd[i1544matmul_5_5_5]; + int32_t i1545A1475 = i1544 * A14752_dimension + i1545; + ti1543A1845_val += (matmul_5_5_5_vals[i1544matmul_5_5_5] * A1475_vals[i1545A1475]) * A1416_vals[i1545A1416]; + } + } + A1845_vals[i1545A1845] = ti1543A1845_val; + } + } + return 0; +} +#include "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/mttkrp_gemm/taco_default.h" +int _shim_assemble(void** parameterPack) { + return assemble((taco_tensor_t*)(parameterPack[0]), 
(taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3])); +} +int _shim_compute(void** parameterPack) { + return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3])); +} diff --git a/test/kernels/mttkrp_gemm/mttkrp_ryan.h b/test/kernels/mttkrp_gemm/mttkrp_ryan.h new file mode 100644 index 000000000..3d0c06f50 --- /dev/null +++ b/test/kernels/mttkrp_gemm/mttkrp_ryan.h @@ -0,0 +1,125 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + 
else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +#ifndef TACO_GENERATED_assemble +#define TACO_GENERATED_assemble +int assemble(taco_tensor_t *A1845, taco_tensor_t *matmul_5_5_5, taco_tensor_t *A1475, taco_tensor_t *A1416); +#endif + +#ifndef TACO_GENERATED_compute +#define 
TACO_GENERATED_compute +int compute(taco_tensor_t *A1845, taco_tensor_t *matmul_5_5_5, taco_tensor_t *A1475, taco_tensor_t *A1416); +#endif diff --git a/test/kernels/mttkrp_gemm/taco_default.c b/test/kernels/mttkrp_gemm/taco_default.c new file mode 100644 index 000000000..edf8cdb16 --- /dev/null +++ b/test/kernels/mttkrp_gemm/taco_default.c @@ -0,0 +1,183 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, 
int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +int assemble(taco_tensor_t *A1538, taco_tensor_t *matmul_5_5_5, taco_tensor_t *A1475, taco_tensor_t *A1416, taco_tensor_t *A1479) { + int A15381_dimension = (int)(A1538->dimensions[0]); + int A15382_dimension = (int)(A1538->dimensions[1]); + double* restrict A1538_vals = (double*)(A1538->vals); + + A1538_vals = (double*)malloc(sizeof(double) * (A15381_dimension * A15382_dimension)); + + A1538->vals = 
(uint8_t*)A1538_vals; + return 0; +} + +int compute(taco_tensor_t *A1538, taco_tensor_t *matmul_5_5_5, taco_tensor_t *A1475, taco_tensor_t *A1416, taco_tensor_t *A1479) { + int A15381_dimension = (int)(A1538->dimensions[0]); + int A15382_dimension = (int)(A1538->dimensions[1]); + double* restrict A1538_vals = (double*)(A1538->vals); + int* restrict matmul_5_5_51_pos = (int*)(matmul_5_5_5->indices[0][0]); + int* restrict matmul_5_5_51_crd = (int*)(matmul_5_5_5->indices[0][1]); + int* restrict matmul_5_5_52_pos = (int*)(matmul_5_5_5->indices[1][0]); + int* restrict matmul_5_5_52_crd = (int*)(matmul_5_5_5->indices[1][1]); + int* restrict matmul_5_5_53_pos = (int*)(matmul_5_5_5->indices[2][0]); + int* restrict matmul_5_5_53_crd = (int*)(matmul_5_5_5->indices[2][1]); + double* restrict matmul_5_5_5_vals = (double*)(matmul_5_5_5->vals); + int A14751_dimension = (int)(A1475->dimensions[0]); + int A14752_dimension = (int)(A1475->dimensions[1]); + double* restrict A1475_vals = (double*)(A1475->vals); + int A14161_dimension = (int)(A1416->dimensions[0]); + int A14162_dimension = (int)(A1416->dimensions[1]); + double* restrict A1416_vals = (double*)(A1416->vals); + int A14791_dimension = (int)(A1479->dimensions[0]); + int A14792_dimension = (int)(A1479->dimensions[1]); + double* restrict A1479_vals = (double*)(A1479->vals); + + #pragma omp parallel for schedule(static) + for (int32_t pA1538 = 0; pA1538 < (A15381_dimension * A15382_dimension); pA1538++) { + A1538_vals[pA1538] = 0.0; + } + + #pragma omp parallel for schedule(runtime) + for (int32_t i1542matmul_5_5_5 = matmul_5_5_51_pos[0]; i1542matmul_5_5_5 < matmul_5_5_51_pos[1]; i1542matmul_5_5_5++) { + int32_t i1542 = matmul_5_5_51_crd[i1542matmul_5_5_5]; + for (int32_t i1546 = 0; i1546 < A14792_dimension; i1546++) { + int32_t i1546A1538 = i1542 * A15382_dimension + i1546; + double ti1543A1538_val = 0.0; + for (int32_t i1543matmul_5_5_5 = matmul_5_5_52_pos[i1542matmul_5_5_5]; i1543matmul_5_5_5 < 
matmul_5_5_52_pos[(i1542matmul_5_5_5 + 1)]; i1543matmul_5_5_5++) { + int32_t i1543 = matmul_5_5_52_crd[i1543matmul_5_5_5]; + for (int32_t i1544matmul_5_5_5 = matmul_5_5_53_pos[i1543matmul_5_5_5]; i1544matmul_5_5_5 < matmul_5_5_53_pos[(i1543matmul_5_5_5 + 1)]; i1544matmul_5_5_5++) { + int32_t i1544 = matmul_5_5_53_crd[i1544matmul_5_5_5]; + for (int32_t i1545 = 0; i1545 < A14791_dimension; i1545++) { + int32_t i1545A1475 = i1544 * A14752_dimension + i1545; + int32_t i1545A1416 = i1543 * A14162_dimension + i1545; + int32_t i1546A1479 = i1545 * A14792_dimension + i1546; + ti1543A1538_val += ((matmul_5_5_5_vals[i1544matmul_5_5_5] * A1475_vals[i1545A1475]) * A1416_vals[i1545A1416]) * A1479_vals[i1546A1479]; + } + } + } + A1538_vals[i1546A1538] = ti1543A1538_val; + } + } + return 0; +} +#include "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/mttkrp_gemm/taco_default.h" +int _shim_assemble(void** parameterPack) { + return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3]), (taco_tensor_t*)(parameterPack[4])); +} +int _shim_compute(void** parameterPack) { + return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3]), (taco_tensor_t*)(parameterPack[4])); +} diff --git a/test/kernels/mttkrp_gemm/taco_default.h b/test/kernels/mttkrp_gemm/taco_default.h new file mode 100644 index 000000000..54274569e --- /dev/null +++ b/test/kernels/mttkrp_gemm/taco_default.h @@ -0,0 +1,125 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? 
(_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = 
(taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +#ifndef TACO_GENERATED_assemble +#define TACO_GENERATED_assemble +int assemble(taco_tensor_t *A1538, taco_tensor_t *matmul_5_5_5, taco_tensor_t *A1475, taco_tensor_t *A1416, taco_tensor_t *A1479); +#endif + +#ifndef TACO_GENERATED_compute +#define TACO_GENERATED_compute +int compute(taco_tensor_t *A1538, taco_tensor_t *matmul_5_5_5, taco_tensor_t *A1475, taco_tensor_t *A1416, taco_tensor_t *A1479); +#endif diff --git a/test/kernels/sddmm_spmm/csr_dense_dense_sddmm.c b/test/kernels/sddmm_spmm/csr_dense_dense_sddmm.c new file mode 100644 index 000000000..a5e031e7a --- /dev/null +++ b/test/kernels/sddmm_spmm/csr_dense_dense_sddmm.c @@ -0,0 +1,199 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? 
(_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = 
(taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +int assemble(taco_tensor_t *A2531, taco_tensor_t *cage3, taco_tensor_t *A1392, taco_tensor_t *A1451) { + int* restrict A25312_pos = (int*)(A2531->indices[1][0]); + int* restrict A25312_crd = (int*)(A2531->indices[1][1]); + double* restrict A2531_vals = (double*)(A2531->vals); + int* restrict cage32_pos = (int*)(cage3->indices[1][0]); + int* restrict cage32_crd = (int*)(cage3->indices[1][1]); + int A13921_dimension = (int)(A1392->dimensions[0]); + + A25312_pos = (int32_t*)malloc(sizeof(int32_t) * 6); + A25312_pos[0] = 0; + for (int32_t pA25312 = 1; pA25312 < 6; pA25312++) { + A25312_pos[pA25312] = 0; + } + int32_t A25312_crd_size = 1048576; + A25312_crd = (int32_t*)malloc(sizeof(int32_t) * A25312_crd_size); + int32_t i1468A2531 = 0; + + for (int32_t i1467 = 0; i1467 < A13921_dimension; i1467++) { + int32_t pA25312_begin = i1468A2531; + + for (int32_t i1468cage3 = cage32_pos[i1467]; i1468cage3 < cage32_pos[(i1467 + 1)]; i1468cage3++) { + int32_t i1468 = cage32_crd[i1468cage3]; + if (A25312_crd_size <= 
i1468A2531) { + A25312_crd = (int32_t*)realloc(A25312_crd, sizeof(int32_t) * (A25312_crd_size * 2)); + A25312_crd_size *= 2; + } + A25312_crd[i1468A2531] = i1468; + i1468A2531++; + } + + A25312_pos[i1467 + 1] = i1468A2531 - pA25312_begin; + } + + int32_t csA25312 = 0; + for (int32_t pA253120 = 1; pA253120 < 6; pA253120++) { + csA25312 += A25312_pos[pA253120]; + A25312_pos[pA253120] = csA25312; + } + + A2531_vals = (double*)malloc(sizeof(double) * i1468A2531); + + A2531->indices[1][0] = (uint8_t*)(A25312_pos); + A2531->indices[1][1] = (uint8_t*)(A25312_crd); + A2531->vals = (uint8_t*)A2531_vals; + return 0; +} + +int compute(taco_tensor_t *A2531, taco_tensor_t *cage3, taco_tensor_t *A1392, taco_tensor_t *A1451) { + double* restrict A2531_vals = (double*)(A2531->vals); + int* restrict cage32_pos = (int*)(cage3->indices[1][0]); + int* restrict cage32_crd = (int*)(cage3->indices[1][1]); + double* restrict cage3_vals = (double*)(cage3->vals); + int A13921_dimension = (int)(A1392->dimensions[0]); + int A13922_dimension = (int)(A1392->dimensions[1]); + double* restrict A1392_vals = (double*)(A1392->vals); + int A14512_dimension = (int)(A1451->dimensions[1]); + double* restrict A1451_vals = (double*)(A1451->vals); + +// int32_t i1468A2531 = 0; + + #pragma omp parallel for schedule(runtime) + for (int32_t i1467 = 0; i1467 < A13921_dimension; i1467++) { + for (int32_t i1468cage3 = cage32_pos[i1467]; i1468cage3 < cage32_pos[(i1467 + 1)]; i1468cage3++) { + int32_t i1468 = cage32_crd[i1468cage3]; + double ti1469A2531_val = 0.0; + for (int32_t i1469 = 0; i1469 < A14512_dimension; i1469++) { + int32_t i1469A1392 = i1467 * A13922_dimension + i1469; + int32_t i1469A1451 = i1468 * A14512_dimension + i1469; + ti1469A2531_val += (cage3_vals[i1468cage3] * A1392_vals[i1469A1392]) * A1451_vals[i1469A1451]; + } + A2531_vals[i1468cage3] = ti1469A2531_val; + // i1468A2531++; + } + } + return 0; +} +#include 
"/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/csr_dense_dense_sddmm.h" +int _shim_assemble(void** parameterPack) { + return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3])); +} +int _shim_compute(void** parameterPack) { + return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3])); +} diff --git a/test/kernels/sddmm_spmm/csr_dense_dense_sddmm.h b/test/kernels/sddmm_spmm/csr_dense_dense_sddmm.h new file mode 100644 index 000000000..a9d6b760d --- /dev/null +++ b/test/kernels/sddmm_spmm/csr_dense_dense_sddmm.h @@ -0,0 +1,125 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? 
// Binary search over array[arrayStart..arrayEnd] for a position whose value
// is >= target.
//
// Returns arrayStart when array[arrayStart] >= target. Otherwise the range is
// bisected: a position holding exactly `target` is returned as soon as one is
// hit (not necessarily the first of several equal entries); if none is hit,
// the smallest examined upper bound is returned. array[arrayEnd] itself is
// never read, so arrayEnd is returned unverified when every earlier entry is
// smaller than target.
//
// The array is expected to be sorted ascending and the indices non-negative.
int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) {
  if (array[arrayStart] >= target) {
    return arrayStart;
  }
  int lowerBound = arrayStart; // always < target
  int upperBound = arrayEnd; // always >= target
  while (upperBound - lowerBound > 1) {
    // Midpoint written as an offset from lowerBound: `upperBound + lowerBound`
    // can overflow int for very large index ranges.
    int mid = lowerBound + (upperBound - lowerBound) / 2;
    int midValue = array[mid];
    if (midValue < target) {
      lowerBound = mid;
    }
    else if (midValue > target) {
      upperBound = mid;
    }
    else {
      return mid;
    }
  }
  return upperBound;
}
(taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +#ifndef TACO_GENERATED_assemble +#define TACO_GENERATED_assemble +int assemble(taco_tensor_t *A2531, taco_tensor_t *cage3, taco_tensor_t *A1392, taco_tensor_t *A1451); +#endif + +#ifndef TACO_GENERATED_compute +#define TACO_GENERATED_compute +int compute(taco_tensor_t *A2531, taco_tensor_t *cage3, taco_tensor_t *A1392, taco_tensor_t *A1451); +#endif diff --git a/test/kernels/sddmm_spmm/csr_dense_dense_sddmm.so b/test/kernels/sddmm_spmm/csr_dense_dense_sddmm.so new file mode 100755 index 000000000..c2c5ca30e Binary files /dev/null and b/test/kernels/sddmm_spmm/csr_dense_dense_sddmm.so differ diff --git a/test/kernels/sddmm_spmm/csr_dense_spmm.c b/test/kernels/sddmm_spmm/csr_dense_spmm.c new file mode 100644 index 000000000..7f710f6c1 --- /dev/null +++ b/test/kernels/sddmm_spmm/csr_dense_spmm.c @@ -0,0 +1,190 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < 
// Binary search over array[arrayStart..arrayEnd] for a position whose value
// is <= target.
//
// Returns arrayEnd when array[arrayEnd] <= target. Otherwise the range is
// bisected: a position holding exactly `target` is returned as soon as one is
// hit (not necessarily the last of several equal entries); if none is hit,
// the largest examined lower bound is returned. array[arrayStart] itself is
// never read, so arrayStart is returned unverified when every later entry is
// larger than target.
//
// The array is expected to be sorted ascending and the indices non-negative.
int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) {
  if (array[arrayEnd] <= target) {
    return arrayEnd;
  }
  int lowerBound = arrayStart; // always <= target
  int upperBound = arrayEnd; // always > target
  while (upperBound - lowerBound > 1) {
    // Midpoint written as an offset from lowerBound: `upperBound + lowerBound`
    // can overflow int for very large index ranges.
    int mid = lowerBound + (upperBound - lowerBound) / 2;
    int midValue = array[mid];
    if (midValue < target) {
      lowerBound = mid;
    }
    else if (midValue > target) {
      upperBound = mid;
    }
    else {
      return mid;
    }
  }
  return lowerBound;
}
mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +int assemble(taco_tensor_t *A2535, taco_tensor_t *A2531, taco_tensor_t *A1455) { + int A25352_dimension = (int)(A2535->dimensions[1]); + double* restrict A2535_vals = (double*)(A2535->vals); + + A2535_vals = (double*)malloc(sizeof(double) * (5 * A25352_dimension)); + + A2535->vals = (uint8_t*)A2535_vals; + return 0; +} + +int compute(taco_tensor_t *C, taco_tensor_t *A, taco_tensor_t *B) { + int C1_dimension = (int)(C->dimensions[0]); + int C2_dimension = (int)(C->dimensions[1]); + double* restrict C_vals = (double*)(C->vals); + int A1_dimension = (int)(A->dimensions[0]); + int* restrict A2_pos = (int*)(A->indices[1][0]); + int* restrict A2_crd = (int*)(A->indices[1][1]); + double* restrict A_vals = (double*)(A->vals); + int B1_dimension = (int)(B->dimensions[0]); + int B2_dimension = (int)(B->dimensions[1]); + double* restrict B_vals = (double*)(B->vals); + + #pragma omp parallel for schedule(static) + for (int32_t pC = 0; pC < 
(C1_dimension * C2_dimension); pC++) { + C_vals[pC] = 0.0; + } + + #pragma omp parallel for schedule(dynamic, 1) + for (int32_t i0 = 0; i0 < ((A1_dimension + 15) / 16); i0++) { + for (int32_t i1 = 0; i1 < 16; i1++) { + int32_t i = i0 * 16 + i1; + if (i >= A1_dimension) + continue; + + for (int32_t jpos0 = A2_pos[i] / 4; jpos0 < ((A2_pos[(i + 1)] + 3) / 4); jpos0++) { + int32_t jposA = jpos0 * 4; + if (jpos0 * 4 < A2_pos[i] || (jpos0 * 4 + 4) + ((jpos0 * 4 + 4) - jpos0 * 4) >= A2_pos[(i + 1)]) { + for (int32_t k = 0; k < B2_dimension; k++) { + int32_t kC = i * C2_dimension + k; + for (int32_t jpos1 = 0; jpos1 < 4; jpos1++) { + int32_t jposA = jpos0 * 4 + jpos1; + if (jposA < A2_pos[i] || jposA >= A2_pos[(i + 1)]) + continue; + + int32_t j = A2_crd[jposA]; + int32_t kB = j * B2_dimension + k; + C_vals[kC] = C_vals[kC] + A_vals[jposA] * B_vals[kB]; + } + } + } + else { + #pragma clang loop interleave(enable) vectorize(enable) + for (int32_t k = 0; k < B2_dimension; k++) { + int32_t kC = i * C2_dimension + k; + for (int32_t jpos1 = 0; jpos1 < 4; jpos1++) { + int32_t jposA = jpos0 * 4 + jpos1; + int32_t j = A2_crd[jposA]; + int32_t kB = j * B2_dimension + k; + C_vals[kC] = C_vals[kC] + A_vals[jposA] * B_vals[kB]; + } + } + } + } + } + } + return 0; +} +#include "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/csr_dense_spmm.h" +int _shim_assemble(void** parameterPack) { + return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2])); +} +int _shim_compute(void** parameterPack) { + return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2])); +} diff --git a/test/kernels/sddmm_spmm/csr_dense_spmm.h b/test/kernels/sddmm_spmm/csr_dense_spmm.h new file mode 100644 index 000000000..cf0cf205c --- /dev/null +++ b/test/kernels/sddmm_spmm/csr_dense_spmm.h @@ -0,0 +1,125 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include 
+#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + 
} + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +#ifndef TACO_GENERATED_assemble +#define TACO_GENERATED_assemble +int assemble(taco_tensor_t *A2535, taco_tensor_t *A2531, taco_tensor_t *A1455); +#endif + +#ifndef TACO_GENERATED_compute +#define TACO_GENERATED_compute +int compute(taco_tensor_t *A2535, taco_tensor_t *A2531, taco_tensor_t *A1455); +#endif diff --git a/test/kernels/sddmm_spmm/csr_dense_spmm.so b/test/kernels/sddmm_spmm/csr_dense_spmm.so new file mode 100755 index 000000000..398362532 Binary files /dev/null and b/test/kernels/sddmm_spmm/csr_dense_spmm.so differ diff --git a/test/kernels/sddmm_spmm/fused_kernel.c b/test/kernels/sddmm_spmm/fused_kernel.c new file mode 100644 index 000000000..1572bce5a --- /dev/null +++ b/test/kernels/sddmm_spmm/fused_kernel.c @@ -0,0 +1,183 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include 
+#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return 
lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +int assemble(taco_tensor_t *A1459, taco_tensor_t *cage3, taco_tensor_t *A1392, taco_tensor_t *A1451, taco_tensor_t *A1455) { + int A14592_dimension = (int)(A1459->dimensions[1]); + double* restrict A1459_vals = (double*)(A1459->vals); + + A1459_vals = (double*)malloc(sizeof(double) * (5 * A14592_dimension)); + + A1459->vals = (uint8_t*)A1459_vals; + return 0; +} + +int compute(taco_tensor_t *A1459, taco_tensor_t *B, taco_tensor_t *A1392, taco_tensor_t *A1451, taco_tensor_t *A1455) { + int A14591_dimension = (int)(A1459->dimensions[0]); + int A14592_dimension = (int)(A1459->dimensions[1]); + double* restrict A1459_vals = (double*)(A1459->vals); + int B1_dimension = (int)(B->dimensions[0]); + int* restrict B2_pos = (int*)(B->indices[1][0]); + int* restrict B2_crd = (int*)(B->indices[1][1]); + double* restrict B_vals = 
(double*)(B->vals); + int A13921_dimension = (int)(A1392->dimensions[0]); + int A13922_dimension = (int)(A1392->dimensions[1]); + double* restrict A1392_vals = (double*)(A1392->vals); + int A14511_dimension = (int)(A1451->dimensions[0]); + int A14512_dimension = (int)(A1451->dimensions[1]); + double* restrict A1451_vals = (double*)(A1451->vals); + int A14551_dimension = (int)(A1455->dimensions[0]); + int A14552_dimension = (int)(A1455->dimensions[1]); + double* restrict A1455_vals = (double*)(A1455->vals); + + #pragma omp parallel for schedule(static) + for (int32_t pA1459 = 0; pA1459 < (A14591_dimension * A14592_dimension); pA1459++) { + A1459_vals[pA1459] = 0.0; + } + + + #pragma omp parallel for schedule(runtime) + for (int32_t i0 = 0; i0 < ((A13921_dimension + 15) / 16); i0++) { + + for (int32_t i1 = 0; i1 < 16; i1++) { + int32_t i1467 = i0 * 16 + i1; + if (i1467 >= A13921_dimension) + continue; + + for (int32_t i1468B = B2_pos[i1467]; i1468B < B2_pos[(i1467 + 1)]; i1468B++) { + int32_t i1468 = B2_crd[i1468B]; + double tA1459_val = 0.0; + for (int32_t i1469 = 0; i1469 < A14512_dimension; i1469++) { + int32_t i1469A1392 = i1467 * A13922_dimension + i1469; + int32_t i1469A1451 = i1468 * A14512_dimension + i1469; + tA1459_val += (B_vals[i1468B] * A1392_vals[i1469A1392]) * A1451_vals[i1469A1451]; + } + for (int32_t i1470 = 0; i1470 < A14552_dimension; i1470++) { + int32_t i1470A1459 = i1467 * A14592_dimension + i1470; + int32_t i1470A1455 = i1468 * A14552_dimension + i1470; + A1459_vals[i1470A1459] = A1459_vals[i1470A1459] + tA1459_val * A1455_vals[i1470A1455]; + } + } + } + } + return 0; +} +#include "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/fused_kernel.h" +int _shim_assemble(void** parameterPack) { + return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3]), (taco_tensor_t*)(parameterPack[4])); +} +int _shim_compute(void** 
parameterPack) { + return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3]), (taco_tensor_t*)(parameterPack[4])); +} diff --git a/test/kernels/sddmm_spmm/fused_kernel.h b/test/kernels/sddmm_spmm/fused_kernel.h new file mode 100644 index 000000000..e67e5a761 --- /dev/null +++ b/test/kernels/sddmm_spmm/fused_kernel.h @@ -0,0 +1,125 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; 
+} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +#ifndef TACO_GENERATED_assemble +#define TACO_GENERATED_assemble +int assemble(taco_tensor_t *A1459, taco_tensor_t *cage3, taco_tensor_t *A1392, taco_tensor_t *A1451, taco_tensor_t *A1455); +#endif + +#ifndef TACO_GENERATED_compute +#define TACO_GENERATED_compute +int compute(taco_tensor_t *A1459, taco_tensor_t *cage3, taco_tensor_t 
*A1392, taco_tensor_t *A1451, taco_tensor_t *A1455); +#endif diff --git a/test/kernels/sddmm_spmm/fused_kernel.so b/test/kernels/sddmm_spmm/fused_kernel.so new file mode 100755 index 000000000..10619e0ca Binary files /dev/null and b/test/kernels/sddmm_spmm/fused_kernel.so differ diff --git a/test/kernels/sddmm_spmm/sddmm_ryan.c b/test/kernels/sddmm_spmm/sddmm_ryan.c new file mode 100644 index 000000000..760fb5361 --- /dev/null +++ b/test/kernels/sddmm_spmm/sddmm_ryan.c @@ -0,0 +1,210 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; 
+ } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +int assemble(taco_tensor_t *A2531, taco_tensor_t *cage3, taco_tensor_t *A1392, taco_tensor_t *A1451) { + int* restrict A25312_pos = (int*)(A2531->indices[1][0]); + int* restrict A25312_crd = (int*)(A2531->indices[1][1]); + double* restrict A2531_vals = (double*)(A2531->vals); + 
int* restrict cage32_pos = (int*)(cage3->indices[1][0]); + int* restrict cage32_crd = (int*)(cage3->indices[1][1]); + int A13921_dimension = (int)(A1392->dimensions[0]); + + A25312_pos = (int32_t*)malloc(sizeof(int32_t) * 6); + A25312_pos[0] = 0; + for (int32_t pA25312 = 1; pA25312 < 6; pA25312++) { + A25312_pos[pA25312] = 0; + } + int32_t A25312_crd_size = 1048576; + A25312_crd = (int32_t*)malloc(sizeof(int32_t) * A25312_crd_size); + int32_t i1468A2531 = 0; + + for (int32_t i1467 = 0; i1467 < A13921_dimension; i1467++) { + int32_t pA25312_begin = i1468A2531; + + for (int32_t i1468cage3 = cage32_pos[i1467]; i1468cage3 < cage32_pos[(i1467 + 1)]; i1468cage3++) { + int32_t i1468 = cage32_crd[i1468cage3]; + if (A25312_crd_size <= i1468A2531) { + A25312_crd = (int32_t*)realloc(A25312_crd, sizeof(int32_t) * (A25312_crd_size * 2)); + A25312_crd_size *= 2; + } + A25312_crd[i1468A2531] = i1468; + i1468A2531++; + } + + A25312_pos[i1467 + 1] = i1468A2531 - pA25312_begin; + } + + int32_t csA25312 = 0; + for (int32_t pA253120 = 1; pA253120 < 6; pA253120++) { + csA25312 += A25312_pos[pA253120]; + A25312_pos[pA253120] = csA25312; + } + + A2531_vals = (double*)malloc(sizeof(double) * i1468A2531); + + A2531->indices[1][0] = (uint8_t*)(A25312_pos); + A2531->indices[1][1] = (uint8_t*)(A25312_crd); + A2531->vals = (uint8_t*)A2531_vals; + return 0; +} + +int compute(taco_tensor_t *A, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D) { + + int A1_dimension = (int)(A->dimensions[0]); + double* restrict A_vals = (double*)(A->vals); + int B1_dimension = (int)(B->dimensions[0]); + int* restrict B2_pos = (int*)(B->indices[1][0]); + int* restrict B2_crd = (int*)(B->indices[1][1]); + double* restrict B_vals = (double*)(B->vals); + int C1_dimension = (int)(C->dimensions[0]); + int C2_dimension = (int)(C->dimensions[1]); + double* restrict C_vals = (double*)(C->vals); + int D1_dimension = (int)(D->dimensions[0]); + int D2_dimension = (int)(D->dimensions[1]); + double* restrict D_vals = 
(double*)(D->vals); + + int32_t jA = 0; + + #pragma omp parallel for schedule(runtime) + for (int32_t i0 = 0; i0 < ((C1_dimension + 15) / 16); i0++) { + for (int32_t i1 = 0; i1 < 16; i1++) { + int32_t i = i0 * 16 + i1; + if (i >= C1_dimension) + continue; + + for (int32_t jB = B2_pos[i]; jB < B2_pos[(i + 1)]; jB++) { + int32_t j = B2_crd[jB]; + double tkA_val = 0.0; + for (int32_t k = 0; k < D2_dimension; k++) { + int32_t kC = i * C2_dimension + k; + int32_t kD = j * D2_dimension + k; + tkA_val += (B_vals[jB] * C_vals[kC]) * D_vals[kD]; + } + A_vals[jB] = tkA_val; + // jA++; + } + } + } + return 0; + +} +#include "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/sddmm_ryan.h" +int _shim_assemble(void** parameterPack) { + return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3])); +} +int _shim_compute(void** parameterPack) { + return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3])); +} diff --git a/test/kernels/sddmm_spmm/sddmm_ryan.h b/test/kernels/sddmm_spmm/sddmm_ryan.h new file mode 100644 index 000000000..f0f9e372a --- /dev/null +++ b/test/kernels/sddmm_spmm/sddmm_ryan.h @@ -0,0 +1,125 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? 
/*
 * Three-way comparator for two ints passed by pointer.
 *
 * Returns a negative value if *a < *b, zero if they are equal and a positive
 * value if *a > *b.
 *
 * The result is built from two comparisons instead of a subtraction: *a - *b
 * overflows (undefined behaviour, and in practice the wrong sign) when the
 * operands are far apart, e.g. INT_MIN and 1.
 */
int cmp(const void *a, const void *b) {
  const int lhs = *((const int*)a);
  const int rhs = *((const int*)b);
  return (lhs > rhs) - (lhs < rhs);
}
(taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +#ifndef TACO_GENERATED_assemble +#define TACO_GENERATED_assemble +int assemble(taco_tensor_t *A2531, taco_tensor_t *cage3, taco_tensor_t *A1392, taco_tensor_t *A1451); +#endif + +#ifndef TACO_GENERATED_compute +#define TACO_GENERATED_compute +int compute(taco_tensor_t *A, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D); +#endif diff --git a/test/kernels/sddmm_spmm/sddmm_ryan.so b/test/kernels/sddmm_spmm/sddmm_ryan.so new file mode 100755 index 000000000..c3deae084 Binary files /dev/null and b/test/kernels/sddmm_spmm/sddmm_ryan.so differ diff --git a/test/kernels/sddmm_spmm/taco_original.c b/test/kernels/sddmm_spmm/taco_original.c new file mode 100644 index 000000000..4f084ff5e --- /dev/null +++ b/test/kernels/sddmm_spmm/taco_original.c @@ -0,0 +1,166 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? 
/*
 * Binary search over array[arrayStart..arrayEnd] (both ends inclusive).
 * Assumes the range is sorted in ascending order -- TODO confirm with callers.
 *
 * Returns:
 *   - arrayStart if array[arrayStart] >= target;
 *   - the probed index whose value equals target, if a probe hits one;
 *   - otherwise the final upper bound: the lowest probed index whose value is
 *     greater than target, or arrayEnd when no probe found such a value.
 *
 * The midpoint is computed as lower + (upper - lower) / 2 so that it cannot
 * overflow for large indices, unlike (upper + lower) / 2.
 */
int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) {
  if (array[arrayStart] >= target) {
    return arrayStart;
  }
  int lowerBound = arrayStart; // always < target
  int upperBound = arrayEnd; // always >= target
  while (upperBound - lowerBound > 1) {
    const int mid = lowerBound + (upperBound - lowerBound) / 2;
    const int midValue = array[mid];
    if (midValue < target) {
      lowerBound = mid;
    }
    else if (midValue > target) {
      upperBound = mid;
    }
    else {
      return mid;
    }
  }
  return upperBound;
}
taco_mode_t* mode_types) { + taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +int assemble(taco_tensor_t *A1463, taco_tensor_t *cage3, taco_tensor_t *A1392, taco_tensor_t *A1451, taco_tensor_t *A1455) { + int A14632_dimension = (int)(A1463->dimensions[1]); + double* restrict A1463_vals = (double*)(A1463->vals); + + A1463_vals = (double*)malloc(sizeof(double) * (5 * A14632_dimension)); + + A1463->vals = (uint8_t*)A1463_vals; + return 0; +} + +int compute(taco_tensor_t *A1463, taco_tensor_t *cage3, taco_tensor_t *A1392, taco_tensor_t *A1451, taco_tensor_t *A1455) { + int A14632_dimension = (int)(A1463->dimensions[1]); + double* restrict A1463_vals = (double*)(A1463->vals); + int* restrict cage32_pos = (int*)(cage3->indices[1][0]); + int* restrict cage32_crd = (int*)(cage3->indices[1][1]); + double* restrict cage3_vals = (double*)(cage3->vals); + int A13921_dimension = (int)(A1392->dimensions[0]); + int A13922_dimension = (int)(A1392->dimensions[1]); + double* restrict A1392_vals = (double*)(A1392->vals); + int A14512_dimension = 
(int)(A1451->dimensions[1]); + double* restrict A1451_vals = (double*)(A1451->vals); + int A14552_dimension = (int)(A1455->dimensions[1]); + double* restrict A1455_vals = (double*)(A1455->vals); + + #pragma omp parallel for schedule(runtime) + for (int32_t i1467 = 0; i1467 < A13921_dimension; i1467++) { + for (int32_t i1470 = 0; i1470 < A14552_dimension; i1470++) { + int32_t i1470A1463 = i1467 * A14632_dimension + i1470; + double ti1468A1463_val = 0.0; + for (int32_t i1468cage3 = cage32_pos[i1467]; i1468cage3 < cage32_pos[(i1467 + 1)]; i1468cage3++) { + int32_t i1468 = cage32_crd[i1468cage3]; + int32_t i1470A1455 = i1468 * A14552_dimension + i1470; + for (int32_t i1469 = 0; i1469 < A14512_dimension; i1469++) { + int32_t i1469A1392 = i1467 * A13922_dimension + i1469; + int32_t i1469A1451 = i1468 * A14512_dimension + i1469; + ti1468A1463_val += ((cage3_vals[i1468cage3] * A1392_vals[i1469A1392]) * A1451_vals[i1469A1451]) * A1455_vals[i1470A1455]; + } + } + A1463_vals[i1470A1463] = ti1468A1463_val; + } + } + return 0; +} +#include "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/taco_original.h" +int _shim_assemble(void** parameterPack) { + return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3]), (taco_tensor_t*)(parameterPack[4])); +} +int _shim_compute(void** parameterPack) { + return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3]), (taco_tensor_t*)(parameterPack[4])); +} diff --git a/test/kernels/sddmm_spmm/taco_original.h b/test/kernels/sddmm_spmm/taco_original.h new file mode 100644 index 000000000..71ce53402 --- /dev/null +++ b/test/kernels/sddmm_spmm/taco_original.h @@ -0,0 +1,125 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif 
+#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* 
dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +#ifndef TACO_GENERATED_assemble +#define TACO_GENERATED_assemble +int assemble(taco_tensor_t *A1463, taco_tensor_t *cage3, taco_tensor_t *A1392, taco_tensor_t *A1451, taco_tensor_t *A1455); +#endif + +#ifndef TACO_GENERATED_compute +#define TACO_GENERATED_compute +int compute(taco_tensor_t *A1463, taco_tensor_t *cage3, taco_tensor_t *A1392, taco_tensor_t *A1451, taco_tensor_t *A1455); +#endif diff --git a/test/kernels/sddmm_spmm/taco_original.so b/test/kernels/sddmm_spmm/taco_original.so new file mode 100755 index 000000000..f50931baa Binary files /dev/null and b/test/kernels/sddmm_spmm/taco_original.so differ diff --git a/test/kernels/spmm_gemm/gemm_default.c b/test/kernels/spmm_gemm/gemm_default.c new file mode 100644 index 000000000..605cc491f --- /dev/null +++ b/test/kernels/spmm_gemm/gemm_default.c @@ -0,0 +1,160 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include 
/*
 * Binary search over array[arrayStart..arrayEnd] (both ends inclusive).
 * Assumes the range is sorted in ascending order -- TODO confirm with callers.
 *
 * Returns:
 *   - arrayEnd if array[arrayEnd] <= target;
 *   - the probed index whose value equals target, if a probe hits one;
 *   - otherwise the final lower bound: the highest probed index whose value is
 *     less than target, or arrayStart when no probe found such a value.
 *
 * The midpoint is computed as lower + (upper - lower) / 2 so that it cannot
 * overflow for large indices, unlike (upper + lower) / 2.
 */
int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) {
  if (array[arrayEnd] <= target) {
    return arrayEnd;
  }
  int lowerBound = arrayStart; // always <= target
  int upperBound = arrayEnd; // always > target
  while (upperBound - lowerBound > 1) {
    const int mid = lowerBound + (upperBound - lowerBound) / 2;
    const int midValue = array[mid];
    if (midValue < target) {
      lowerBound = mid;
    }
    else if (midValue > target) {
      upperBound = mid;
    }
    else {
      return mid;
    }
  }
  return lowerBound;
}
+taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +int assemble(taco_tensor_t *A2039, taco_tensor_t *A2035, taco_tensor_t *A1450) { + int A20391_dimension = (int)(A2039->dimensions[0]); + int A20392_dimension = (int)(A2039->dimensions[1]); + double* restrict A2039_vals = (double*)(A2039->vals); + + A2039_vals = (double*)malloc(sizeof(double) * (A20391_dimension * A20392_dimension)); + + A2039->vals = (uint8_t*)A2039_vals; + return 0; +} + +int compute(taco_tensor_t *A2039, taco_tensor_t *A2035, taco_tensor_t *A1450) { + int A20391_dimension = (int)(A2039->dimensions[0]); + int A20392_dimension = (int)(A2039->dimensions[1]); + double* restrict A2039_vals = (double*)(A2039->vals); + int A20351_dimension = (int)(A2035->dimensions[0]); + int A20352_dimension = (int)(A2035->dimensions[1]); + double* restrict A2035_vals = (double*)(A2035->vals); + int A14501_dimension = 
(int)(A1450->dimensions[0]); + int A14502_dimension = (int)(A1450->dimensions[1]); + double* restrict A1450_vals = (double*)(A1450->vals); + + #pragma omp parallel for schedule(runtime) + for (int32_t i1517 = 0; i1517 < A20351_dimension; i1517++) { + for (int32_t i1520 = 0; i1520 < A14502_dimension; i1520++) { + int32_t i1520A2039 = i1517 * A20392_dimension + i1520; + double ti1519A2039_val = 0.0; + for (int32_t i1519 = 0; i1519 < A14501_dimension; i1519++) { + int32_t i1519A2035 = i1517 * A20352_dimension + i1519; + int32_t i1520A1450 = i1519 * A14502_dimension + i1520; + ti1519A2039_val += A2035_vals[i1519A2035] * A1450_vals[i1520A1450]; + } + A2039_vals[i1520A2039] = ti1519A2039_val; + } + } + return 0; +} +#include "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/spmm_gemm/gemm_default.h" +int _shim_assemble(void** parameterPack) { + return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2])); +} +int _shim_compute(void** parameterPack) { + return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2])); +} diff --git a/test/kernels/spmm_gemm/gemm_default.h b/test/kernels/spmm_gemm/gemm_default.h new file mode 100644 index 000000000..769514531 --- /dev/null +++ b/test/kernels/spmm_gemm/gemm_default.h @@ -0,0 +1,125 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? 
#if !_OPENMP
/*
 * Serial stand-ins for the two OpenMP runtime queries, compiled only when
 * _OPENMP is not set: a single thread with id 0.
 *
 * Declared with (void) so they are real prototypes; an empty parameter list
 * in C leaves the parameters unspecified and lets mistaken calls with
 * arguments compile silently.
 */
int omp_get_thread_num(void) { return 0; }
int omp_get_max_threads(void) { return 1; }
#endif
(taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +#ifndef TACO_GENERATED_assemble +#define TACO_GENERATED_assemble +int assemble(taco_tensor_t *A2039, taco_tensor_t *A2035, taco_tensor_t *A1450); +#endif + +#ifndef TACO_GENERATED_compute +#define TACO_GENERATED_compute +int compute(taco_tensor_t *A2039, taco_tensor_t *A2035, taco_tensor_t *A1450); +#endif diff --git a/test/kernels/spmm_gemm/gemm_default.so b/test/kernels/spmm_gemm/gemm_default.so new file mode 100755 index 000000000..9de7a7933 Binary files /dev/null and b/test/kernels/spmm_gemm/gemm_default.so differ diff --git a/test/kernels/spmm_gemm/gemm_template.c b/test/kernels/spmm_gemm/gemm_template.c new file mode 100644 index 000000000..4a4e5faeb --- /dev/null +++ b/test/kernels/spmm_gemm/gemm_template.c @@ -0,0 +1,183 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? 
/*
 * Three-way comparator for two ints passed by pointer.
 *
 * Returns a negative value if *a < *b, zero if they are equal and a positive
 * value if *a > *b.
 *
 * The result is built from two comparisons instead of a subtraction: *a - *b
 * overflows (undefined behaviour, and in practice the wrong sign) when the
 * operands are far apart, e.g. INT_MIN and 1.
 */
int cmp(const void *a, const void *b) {
  const int lhs = *((const int*)a);
  const int rhs = *((const int*)b);
  return (lhs > rhs) - (lhs < rhs);
}
(taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +int assemble(taco_tensor_t *A2039, taco_tensor_t *A2035, taco_tensor_t *A1450) { + int A20391_dimension = (int)(A2039->dimensions[0]); + int A20392_dimension = (int)(A2039->dimensions[1]); + double* restrict A2039_vals = (double*)(A2039->vals); + + A2039_vals = (double*)malloc(sizeof(double) * (A20391_dimension * A20392_dimension)); + + A2039->vals = (uint8_t*)A2039_vals; + return 0; +} + +int compute(taco_tensor_t *A, taco_tensor_t *B, taco_tensor_t *C) { + int A1_dimension = (int)(A->dimensions[0]); + int A2_dimension = (int)(A->dimensions[1]); + double* restrict A_vals = (double*)(A->vals); + int B1_dimension = (int)(B->dimensions[0]); + int B2_dimension = (int)(B->dimensions[1]); + double* restrict B_vals = (double*)(B->vals); + int C1_dimension = (int)(C->dimensions[0]); + int C2_dimension = (int)(C->dimensions[1]); + double* restrict C_vals = (double*)(C->vals); + + #pragma omp parallel for schedule(static) + for (int32_t pA = 0; pA < (A1_dimension * A2_dimension); pA++) { + A_vals[pA] = 0.0; + 
} + + #pragma omp parallel for schedule(runtime) + for (int32_t i0 = 0; i0 < ((B1_dimension + 15) / 16); i0++) { + for (int32_t j0 = 0; j0 < ((C1_dimension + 15) / 16); j0++) { + for (int32_t k0 = 0; k0 < ((C2_dimension + 15) / 16); k0++) { + for (int32_t i1 = 0; i1 < 16; i1++) { + int32_t i = i0 * 16 + i1; + if (i >= B1_dimension) + continue; + + for (int32_t j1 = 0; j1 < 16; j1++) { + int32_t j = j0 * 16 + j1; + int32_t jB = i * B2_dimension + j; + int32_t jA = i * A2_dimension + j; + if (j >= C1_dimension) + continue; + + double tk1A_val = 0.0; + for (int32_t k1 = 0; k1 < 16; k1++) { + int32_t k = k0 * 16 + k1; + int32_t kC = j * C2_dimension + k; + if (k >= C2_dimension) + continue; + + tk1A_val += B_vals[jB] * C_vals[kC]; + } + A_vals[jA] = A_vals[jA] + tk1A_val; + } + } + } + } + } + return 0; +} +#include "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/spmm_gemm/gemm_template.h" +int _shim_assemble(void** parameterPack) { + return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2])); +} +int _shim_compute(void** parameterPack) { + return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2])); +} diff --git a/test/kernels/spmm_gemm/gemm_template.h b/test/kernels/spmm_gemm/gemm_template.h new file mode 100644 index 000000000..769514531 --- /dev/null +++ b/test/kernels/spmm_gemm/gemm_template.h @@ -0,0 +1,125 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? 
(_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = 
(taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +#ifndef TACO_GENERATED_assemble +#define TACO_GENERATED_assemble +int assemble(taco_tensor_t *A2039, taco_tensor_t *A2035, taco_tensor_t *A1450); +#endif + +#ifndef TACO_GENERATED_compute +#define TACO_GENERATED_compute +int compute(taco_tensor_t *A2039, taco_tensor_t *A2035, taco_tensor_t *A1450); +#endif diff --git a/test/kernels/spmm_gemm/gemm_template.so b/test/kernels/spmm_gemm/gemm_template.so new file mode 100755 index 000000000..2cfcd7ad3 Binary files /dev/null and b/test/kernels/spmm_gemm/gemm_template.so differ diff --git a/test/kernels/spmv_spmv/spmv_fused.c b/test/kernels/spmv_spmv/spmv_fused.c new file mode 100644 index 000000000..0964fb8e1 --- /dev/null +++ b/test/kernels/spmv_spmv/spmv_fused.c @@ -0,0 +1,178 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? 
(_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = 
(taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +int assemble(taco_tensor_t *A, taco_tensor_t *C, taco_tensor_t *v, taco_tensor_t *B) { + double* restrict A_vals = (double*)(A->vals); + + A_vals = (double*)malloc(sizeof(double) * 5); + + A->vals = (uint8_t*)A_vals; + return 0; +} + +int compute(taco_tensor_t *A, taco_tensor_t *C, taco_tensor_t *v, taco_tensor_t *B) { + printf("Adhitha1\n"); + + double* restrict A_vals = (double*)(A->vals); + int* restrict C2_pos = (int*)(C->indices[1][0]); + int* restrict C2_crd = (int*)(C->indices[1][1]); + double* restrict C_vals = (double*)(C->vals); + double* restrict v_vals = (double*)(v->vals); + printf("Adhitha2\n"); + int B1_dimension = (int)(B->dimensions[0]); + int C1_dimension = (int)(B->dimensions[0]); + printf("Adhitha3 %d, %d\n", B1_dimension, C1_dimension); + int* restrict B2_pos = (int*)(B->indices[1][0]); + printf("Adhitha4\n"); + int* restrict B2_crd = (int*)(B->indices[1][1]); + printf("Adhitha2\n"); + double* restrict B_vals = (double*)(B->vals); + + printf("Adhitha3\n"); + + double* restrict tA 
= 0; + tA = (double*)malloc(sizeof(double) * C1_dimension); + for (int32_t ptA = 0; ptA < C1_dimension; ptA++) { + tA[ptA] = 0.0; + } + for (int32_t i1439 = 0; i1439 < C1_dimension; i1439++) { + double ti1440tA_val = 0.0; + for (int32_t i1440C = C2_pos[i1439]; i1440C < C2_pos[(i1439 + 1)]; i1440C++) { + int32_t i1440 = C2_crd[i1440C]; + ti1440tA_val += C_vals[i1440C] * v_vals[i1440]; + } + tA[i1439] = ti1440tA_val; + } + for (int32_t i1438 = 0; i1438 < B1_dimension; i1438++) { + double ti1439A_val = 0.0; + for (int32_t i1439B = B2_pos[i1438]; i1439B < B2_pos[(i1438 + 1)]; i1439B++) { + int32_t i1439 = B2_crd[i1439B]; + ti1439A_val += B_vals[i1439B] * tA[i1439]; + } + A_vals[i1438] = ti1439A_val; + } + free(tA); + + A->vals = (uint8_t*)A_vals; + return 0; +} +#include "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/spmv_spmv/spmv_fused.h" +int _shim_assemble(void** parameterPack) { + return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3])); +} +int _shim_compute(void** parameterPack) { + return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3])); +} diff --git a/test/kernels/spmv_spmv/spmv_fused.h b/test/kernels/spmv_spmv/spmv_fused.h new file mode 100644 index 000000000..bc78275ac --- /dev/null +++ b/test/kernels/spmv_spmv/spmv_fused.h @@ -0,0 +1,125 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? 
(_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = 
(taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +#ifndef TACO_GENERATED_assemble +#define TACO_GENERATED_assemble +int assemble(taco_tensor_t *A, taco_tensor_t *C, taco_tensor_t *v, taco_tensor_t *B); +#endif + +#ifndef TACO_GENERATED_compute +#define TACO_GENERATED_compute +int compute(taco_tensor_t *A, taco_tensor_t *C, taco_tensor_t *v, taco_tensor_t *B); +#endif diff --git a/test/kernels/spmv_spmv/spmv_fused.so b/test/kernels/spmv_spmv/spmv_fused.so new file mode 100755 index 000000000..5efd6a4d8 Binary files /dev/null and b/test/kernels/spmv_spmv/spmv_fused.so differ diff --git a/test/kernels/spmv_spmv/spmv_spmv_default.c b/test/kernels/spmv_spmv/spmv_spmv_default.c new file mode 100644 index 000000000..dfaa1c4b0 --- /dev/null +++ b/test/kernels/spmv_spmv/spmv_spmv_default.c @@ -0,0 +1,157 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? 
(_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = 
(taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +int assemble(taco_tensor_t *ref, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *v) { + double* restrict ref_vals = (double*)(ref->vals); + + ref_vals = (double*)malloc(sizeof(double) * 5); + + ref->vals = (uint8_t*)ref_vals; + return 0; +} + +int compute(taco_tensor_t *ref, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *v) { + double* restrict ref_vals = (double*)(ref->vals); + int B1_dimension = (int)(B->dimensions[0]); + int* restrict B2_pos = (int*)(B->indices[1][0]); + int* restrict B2_crd = (int*)(B->indices[1][1]); + double* restrict B_vals = (double*)(B->vals); + int* restrict C2_pos = (int*)(C->indices[1][0]); + int* restrict C2_crd = (int*)(C->indices[1][1]); + double* restrict C_vals = (double*)(C->vals); + double* restrict v_vals = (double*)(v->vals); + + #pragma omp parallel for schedule(runtime) + for (int32_t i1438 = 0; i1438 < B1_dimension; i1438++) { + double ti1439ref_val = 0.0; + for (int32_t i1439B = B2_pos[i1438]; i1439B < B2_pos[(i1438 + 1)]; i1439B++) { + int32_t i1439 = 
B2_crd[i1439B]; + for (int32_t i1440C = C2_pos[i1439]; i1440C < C2_pos[(i1439 + 1)]; i1440C++) { + int32_t i1440 = C2_crd[i1440C]; + ti1439ref_val += (B_vals[i1439B] * C_vals[i1440C]) * v_vals[i1440]; + } + } + ref_vals[i1438] = ti1439ref_val; + } + return 0; +} +#include "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/spmv_spmv/spmv_spmv_default.h" +int _shim_assemble(void** parameterPack) { + return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3])); +} +int _shim_compute(void** parameterPack) { + return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3])); +} diff --git a/test/kernels/spmv_spmv/spmv_spmv_default.h b/test/kernels/spmv_spmv/spmv_spmv_default.h new file mode 100644 index 000000000..b53193484 --- /dev/null +++ b/test/kernels/spmv_spmv/spmv_spmv_default.h @@ -0,0 +1,125 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? 
(_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = 
(taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +#ifndef TACO_GENERATED_assemble +#define TACO_GENERATED_assemble +int assemble(taco_tensor_t *ref, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *v); +#endif + +#ifndef TACO_GENERATED_compute +#define TACO_GENERATED_compute +int compute(taco_tensor_t *ref, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *v); +#endif diff --git a/test/kernels/ttm_ttm/fused copy.c b/test/kernels/ttm_ttm/fused copy.c new file mode 100644 index 000000000..5d40c8aa9 --- /dev/null +++ b/test/kernels/ttm_ttm/fused copy.c @@ -0,0 +1,248 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? 
(_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = 
(taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +int assemble(taco_tensor_t *A1532, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D) { + int A15321_dimension = (int)(A1532->dimensions[0]); + int A15323_dimension = (int)(A1532->dimensions[2]); + int* restrict A15322_pos = (int*)(A1532->indices[1][0]); + int* restrict A15322_crd = (int*)(A1532->indices[1][1]); + double* restrict A1532_vals = (double*)(A1532->vals); + int B1_dimension = (int)(B->dimensions[0]); + int* restrict B2_pos = (int*)(B->indices[1][0]); + int* restrict B2_crd = (int*)(B->indices[1][1]); + + A15322_pos = (int32_t*)malloc(sizeof(int32_t) * (A15321_dimension + 1)); + A15322_pos[0] = 0; + for (int32_t pA15322 = 1; pA15322 < (A15321_dimension + 1); pA15322++) { + A15322_pos[pA15322] = 0; + } + int32_t A15322_crd_size = 1048576; + A15322_crd = (int32_t*)malloc(sizeof(int32_t) * A15322_crd_size); + int32_t i1543A1532 = 0; + + for (int32_t i1547 = 0; i1547 < ((B1_dimension + 15) / 16); i1547++) { + for (int32_t i1548 = 0; i1548 < 16; i1548++) { + int32_t i1542 = i1547 * 16 + 
i1548; + if (i1542 >= B1_dimension) + continue; + + int32_t pA15322_begin = i1543A1532; + + for (int32_t i1543B = B2_pos[i1542]; i1543B < B2_pos[(i1542 + 1)]; i1543B++) { + int32_t i1543 = B2_crd[i1543B]; + if (A15322_crd_size <= i1543A1532) { + A15322_crd = (int32_t*)realloc(A15322_crd, sizeof(int32_t) * (A15322_crd_size * 2)); + A15322_crd_size *= 2; + } + A15322_crd[i1543A1532] = i1543; + i1543A1532++; + } + + A15322_pos[i1542 + 1] = i1543A1532 - pA15322_begin; + } + } + + int32_t csA15322 = 0; + for (int32_t pA153220 = 1; pA153220 < (A15321_dimension + 1); pA153220++) { + csA15322 += A15322_pos[pA153220]; + A15322_pos[pA153220] = csA15322; + } + + A1532_vals = (double*)malloc(sizeof(double) * (i1543A1532 * A15323_dimension)); + + A1532->indices[1][0] = (uint8_t*)(A15322_pos); + A1532->indices[1][1] = (uint8_t*)(A15322_crd); + A1532->vals = (uint8_t*)A1532_vals; + return 0; +} + +int compute(taco_tensor_t *A1532, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D) { + int A15321_dimension = (int)(A1532->dimensions[0]); + int A15323_dimension = (int)(A1532->dimensions[2]); + int* restrict A15322_pos = (int*)(A1532->indices[1][0]); + double* restrict A1532_vals = (double*)(A1532->vals); + int B1_dimension = (int)(B->dimensions[0]); + int* restrict B2_pos = (int*)(B->indices[1][0]); + int* restrict B2_crd = (int*)(B->indices[1][1]); + int* restrict B3_pos = (int*)(B->indices[2][0]); + int* restrict B3_crd = (int*)(B->indices[2][1]); + double* restrict B_vals = (double*)(B->vals); + int C1_dimension = (int)(C->dimensions[0]); + int C2_dimension = (int)(C->dimensions[1]); + double* restrict C_vals = (double*)(C->vals); + int D1_dimension = (int)(D->dimensions[0]); + int D2_dimension = (int)(D->dimensions[1]); + double* restrict D_vals = (double*)(D->vals); + +// int32_t i1543A1532 = 0; + + #pragma omp parallel for schedule(static) + for (int32_t pA1532 = 0; pA1532 < (A15322_pos[A15321_dimension] * A15323_dimension); pA1532++) { + A1532_vals[pA1532] = 0.0; + } + + 
double* restrict rA1532_all = 0; + tA1532_all = (double*)malloc(sizeof(double) * D1_dimension * omp_get_max_threads()); + + #pragma omp parallel for schedule(runtime) + for (int32_t i1547 = 0; i1547 < ((B1_dimension + 15) / 16); i1547++) { + for (int32_t i1548 = 0; i1548 < 16; i1548++) { + int32_t i1542 = i1547 * 16 + i1548; + if (i1542 >= B1_dimension) + continue; + + double* restrict tA1532 = 0; + tA1532 = &tA1532_all[D1_dimension*omp_get_thread_num()]; + // tA1532 = (double*)malloc(sizeof(double) * D1_dimension); + + for (int32_t i1543B = B2_pos[i1542]; i1543B < B2_pos[(i1542 + 1)]; i1543B++) { + for (int32_t ptA1532 = 0; ptA1532 < D1_dimension; ptA1532++) { + tA1532[ptA1532] = 0.0; + } + for (int32_t i1544B = B3_pos[i1543B]; i1544B < B3_pos[(i1543B + 1)]; i1544B++) { + int32_t i1544 = B3_crd[i1544B]; + for (int32_t i1545 = 0; i1545 < D1_dimension; i1545++) { + int32_t i1545C = i1544 * C2_dimension + i1545; + tA1532[i1545] = tA1532[i1545] + B_vals[i1544B] * C_vals[i1545C]; + } + } + for (int32_t i1545 = 0; i1545 < D1_dimension; i1545++) { + for (int32_t i1546 = 0; i1546 < D2_dimension; i1546++) { + int32_t i1546A1532 = i1543B * A15323_dimension + i1546; + int32_t i1546D = i1545 * D2_dimension + i1546; + A1532_vals[i1546A1532] = A1532_vals[i1546A1532] + tA1532[i1545] * D_vals[i1546D]; + } + } + // i1543A1532++; + } + + + } + + } + free(tA1532_all); + + A1532->indices[1][0] = (uint8_t*)(A15322_pos); + A1532->vals = (uint8_t*)A1532_vals; + return 0; +} +#include "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/ttm_ttm/fused.h" +int _shim_assemble(void** parameterPack) { + return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3])); +} +int _shim_compute(void** parameterPack) { + return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3])); +} diff --git 
a/test/kernels/ttm_ttm/fused.c b/test/kernels/ttm_ttm/fused.c new file mode 100644 index 000000000..f490913cb --- /dev/null +++ b/test/kernels/ttm_ttm/fused.c @@ -0,0 +1,242 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while 
(upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +int assemble(taco_tensor_t *A1532, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D) { + int A15321_dimension = (int)(A1532->dimensions[0]); + int A15323_dimension = (int)(A1532->dimensions[2]); + int* restrict A15322_pos = (int*)(A1532->indices[1][0]); + int* restrict A15322_crd = (int*)(A1532->indices[1][1]); + double* restrict A1532_vals = (double*)(A1532->vals); + int B1_dimension = (int)(B->dimensions[0]); + int* restrict B2_pos = (int*)(B->indices[1][0]); + int* restrict B2_crd = (int*)(B->indices[1][1]); + + A15322_pos = (int32_t*)malloc(sizeof(int32_t) * 
(A15321_dimension + 1)); + A15322_pos[0] = 0; + for (int32_t pA15322 = 1; pA15322 < (A15321_dimension + 1); pA15322++) { + A15322_pos[pA15322] = 0; + } + int32_t A15322_crd_size = 1048576; + A15322_crd = (int32_t*)malloc(sizeof(int32_t) * A15322_crd_size); + int32_t i1543A1532 = 0; + + for (int32_t i1547 = 0; i1547 < ((B1_dimension + 15) / 16); i1547++) { + for (int32_t i1548 = 0; i1548 < 16; i1548++) { + int32_t i1542 = i1547 * 16 + i1548; + if (i1542 >= B1_dimension) + continue; + + int32_t pA15322_begin = i1543A1532; + + for (int32_t i1543B = B2_pos[i1542]; i1543B < B2_pos[(i1542 + 1)]; i1543B++) { + int32_t i1543 = B2_crd[i1543B]; + if (A15322_crd_size <= i1543A1532) { + A15322_crd = (int32_t*)realloc(A15322_crd, sizeof(int32_t) * (A15322_crd_size * 2)); + A15322_crd_size *= 2; + } + A15322_crd[i1543A1532] = i1543; + i1543A1532++; + } + + A15322_pos[i1542 + 1] = i1543A1532 - pA15322_begin; + } + } + + int32_t csA15322 = 0; + for (int32_t pA153220 = 1; pA153220 < (A15321_dimension + 1); pA153220++) { + csA15322 += A15322_pos[pA153220]; + A15322_pos[pA153220] = csA15322; + } + + A1532_vals = (double*)malloc(sizeof(double) * (i1543A1532 * A15323_dimension)); + + A1532->indices[1][0] = (uint8_t*)(A15322_pos); + A1532->indices[1][1] = (uint8_t*)(A15322_crd); + A1532->vals = (uint8_t*)A1532_vals; + return 0; +} + +int compute(taco_tensor_t *A1532, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D) { + int A15321_dimension = (int)(A1532->dimensions[0]); + int A15323_dimension = (int)(A1532->dimensions[2]); + int* restrict A15322_pos = (int*)(A1532->indices[1][0]); + double* restrict A1532_vals = (double*)(A1532->vals); + int B1_dimension = (int)(B->dimensions[0]); + int* restrict B2_pos = (int*)(B->indices[1][0]); + int* restrict B2_crd = (int*)(B->indices[1][1]); + int* restrict B3_pos = (int*)(B->indices[2][0]); + int* restrict B3_crd = (int*)(B->indices[2][1]); + double* restrict B_vals = (double*)(B->vals); + int C1_dimension = (int)(C->dimensions[0]); + int 
C2_dimension = (int)(C->dimensions[1]); + double* restrict C_vals = (double*)(C->vals); + int D1_dimension = (int)(D->dimensions[0]); + int D2_dimension = (int)(D->dimensions[1]); + double* restrict D_vals = (double*)(D->vals); + +// int32_t i1543A1532 = 0; + + #pragma omp parallel for schedule(static) + for (int32_t pA1532 = 0; pA1532 < (A15322_pos[A15321_dimension] * A15323_dimension); pA1532++) { + A1532_vals[pA1532] = 0.0; + } + + #pragma omp parallel for schedule(runtime) + for (int32_t i1547 = 0; i1547 < ((B1_dimension + 15) / 16); i1547++) { + for (int32_t i1548 = 0; i1548 < 16; i1548++) { + int32_t i1542 = i1547 * 16 + i1548; + if (i1542 >= B1_dimension) + continue; + + double* restrict tA1532 = 0; + tA1532 = (double*)malloc(sizeof(double) * D1_dimension); + + for (int32_t i1543B = B2_pos[i1542]; i1543B < B2_pos[(i1542 + 1)]; i1543B++) { + for (int32_t ptA1532 = 0; ptA1532 < D1_dimension; ptA1532++) { + tA1532[ptA1532] = 0.0; + } + for (int32_t i1544B = B3_pos[i1543B]; i1544B < B3_pos[(i1543B + 1)]; i1544B++) { + int32_t i1544 = B3_crd[i1544B]; + for (int32_t i1545 = 0; i1545 < D1_dimension; i1545++) { + int32_t i1545C = i1544 * C2_dimension + i1545; + tA1532[i1545] = tA1532[i1545] + B_vals[i1544B] * C_vals[i1545C]; + } + } + for (int32_t i1545 = 0; i1545 < D1_dimension; i1545++) { + for (int32_t i1546 = 0; i1546 < D2_dimension; i1546++) { + int32_t i1546A1532 = i1543B * A15323_dimension + i1546; + int32_t i1546D = i1545 * D2_dimension + i1546; + A1532_vals[i1546A1532] = A1532_vals[i1546A1532] + tA1532[i1545] * D_vals[i1546D]; + } + } + // i1543A1532++; + } + + free(tA1532); + } + } + + A1532->indices[1][0] = (uint8_t*)(A15322_pos); + A1532->vals = (uint8_t*)A1532_vals; + return 0; +} +#include "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/ttm_ttm/fused.h" +int _shim_assemble(void** parameterPack) { + return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), 
(taco_tensor_t*)(parameterPack[3])); +} +int _shim_compute(void** parameterPack) { + return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3])); +} diff --git a/test/kernels/ttm_ttm/fused.h b/test/kernels/ttm_ttm/fused.h new file mode 100644 index 000000000..d613c8f07 --- /dev/null +++ b/test/kernels/ttm_ttm/fused.h @@ -0,0 +1,125 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; 
+} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +#ifndef TACO_GENERATED_assemble +#define TACO_GENERATED_assemble +int assemble(taco_tensor_t *A1532, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D); +#endif + +#ifndef TACO_GENERATED_compute +#define TACO_GENERATED_compute +int compute(taco_tensor_t *A1532, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D); +#endif diff 
--git a/test/kernels/ttm_ttm/fused.so b/test/kernels/ttm_ttm/fused.so new file mode 100755 index 000000000..69c65a1dc Binary files /dev/null and b/test/kernels/ttm_ttm/fused.so differ diff --git a/test/kernels/ttm_ttm/gemm.c b/test/kernels/ttm_ttm/gemm.c new file mode 100644 index 000000000..ee2b24e99 --- /dev/null +++ b/test/kernels/ttm_ttm/gemm.c @@ -0,0 +1,181 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int 
arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +int assemble(taco_tensor_t *A2886, taco_tensor_t *C, taco_tensor_t *D) { + int A28861_dimension = (int)(A2886->dimensions[0]); + int A28862_dimension = (int)(A2886->dimensions[1]); + double* restrict A2886_vals = (double*)(A2886->vals); + + A2886_vals = (double*)malloc(sizeof(double) * (A28861_dimension * A28862_dimension)); + + A2886->vals = (uint8_t*)A2886_vals; + return 0; +} + +int 
compute(taco_tensor_t *A2886, taco_tensor_t *C, taco_tensor_t *D) { + int A28861_dimension = (int)(A2886->dimensions[0]); + int A28862_dimension = (int)(A2886->dimensions[1]); + double* restrict A2886_vals = (double*)(A2886->vals); + int C1_dimension = (int)(C->dimensions[0]); + int C2_dimension = (int)(C->dimensions[1]); + double* restrict C_vals = (double*)(C->vals); + int D1_dimension = (int)(D->dimensions[0]); + int D2_dimension = (int)(D->dimensions[1]); + double* restrict D_vals = (double*)(D->vals); + + #pragma omp parallel for schedule(static) + for (int32_t pA2886 = 0; pA2886 < (A28861_dimension * A28862_dimension); pA2886++) { + A2886_vals[pA2886] = 0.0; + } + + #pragma omp parallel for schedule(runtime) + for (int32_t i1551 = 0; i1551 < ((C1_dimension + 31) / 32); i1551++) { + for (int32_t i1553 = 0; i1553 < ((D1_dimension + 31) / 32); i1553++) { + for (int32_t i1555 = 0; i1555 < ((D2_dimension + 31) / 32); i1555++) { + for (int32_t i1552 = 0; i1552 < 32; i1552++) { + int32_t i1544 = i1551 * 32 + i1552; + if (i1544 >= C1_dimension) + continue; + + for (int32_t i1554 = 0; i1554 < 32; i1554++) { + int32_t i1545 = i1553 * 32 + i1554; + int32_t i1545C = i1544 * C2_dimension + i1545; + if (i1545 >= D1_dimension) + continue; + + for (int32_t i1556 = 0; i1556 < 32; i1556++) { + int32_t i1546 = i1555 * 32 + i1556; + int32_t i1546D = i1545 * D2_dimension + i1546; + int32_t i1546A2886 = i1544 * A28862_dimension + i1546; + if (i1546 >= D2_dimension) + continue; + + A2886_vals[i1546A2886] = A2886_vals[i1546A2886] + C_vals[i1545C] * D_vals[i1546D]; + } + } + } + } + } + } + return 0; +} +#include "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/ttm_ttm/gemm.h" +int _shim_assemble(void** parameterPack) { + return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2])); +} +int _shim_compute(void** parameterPack) { + return compute((taco_tensor_t*)(parameterPack[0]), 
(taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2])); +} diff --git a/test/kernels/ttm_ttm/gemm.h b/test/kernels/ttm_ttm/gemm.h new file mode 100644 index 000000000..20cd2db53 --- /dev/null +++ b/test/kernels/ttm_ttm/gemm.h @@ -0,0 +1,125 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = 
arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +#ifndef TACO_GENERATED_assemble +#define TACO_GENERATED_assemble +int assemble(taco_tensor_t *A2886, taco_tensor_t *C, taco_tensor_t *D); +#endif + +#ifndef TACO_GENERATED_compute +#define TACO_GENERATED_compute +int compute(taco_tensor_t *A2886, taco_tensor_t *C, taco_tensor_t *D); +#endif diff --git a/test/kernels/ttm_ttm/ttm1_1.c b/test/kernels/ttm_ttm/ttm1_1.c new file mode 100644 index 000000000..e016491a2 --- /dev/null +++ b/test/kernels/ttm_ttm/ttm1_1.c @@ -0,0 +1,219 @@ +#ifndef 
TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + 
upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +int assemble(taco_tensor_t *A2398, taco_tensor_t *B, taco_tensor_t *C) { + int A23981_dimension = (int)(A2398->dimensions[0]); + int A23983_dimension = (int)(A2398->dimensions[2]); + int* restrict A23982_pos = (int*)(A2398->indices[1][0]); + int* restrict A23982_crd = (int*)(A2398->indices[1][1]); + double* restrict A2398_vals = (double*)(A2398->vals); + int B1_dimension = (int)(B->dimensions[0]); + int* restrict B2_pos = (int*)(B->indices[1][0]); + int* restrict B2_crd = (int*)(B->indices[1][1]); + + A23982_pos = (int32_t*)malloc(sizeof(int32_t) * (A23981_dimension + 1)); + A23982_pos[0] = 0; + for (int32_t pA23982 = 1; pA23982 < (A23981_dimension + 1); pA23982++) { + A23982_pos[pA23982] = 0; + } + int32_t A23982_crd_size = 1048576; + A23982_crd = 
(int32_t*)malloc(sizeof(int32_t) * A23982_crd_size); + int32_t i1543A2398 = 0; + + for (int32_t i1547 = 0; i1547 < ((B1_dimension + 15) / 16); i1547++) { + for (int32_t i1548 = 0; i1548 < 16; i1548++) { + int32_t i1542 = i1547 * 16 + i1548; + if (i1542 >= B1_dimension) + continue; + + int32_t pA23982_begin = i1543A2398; + + for (int32_t i1543B = B2_pos[i1542]; i1543B < B2_pos[(i1542 + 1)]; i1543B++) { + int32_t i1543 = B2_crd[i1543B]; + if (A23982_crd_size <= i1543A2398) { + A23982_crd = (int32_t*)realloc(A23982_crd, sizeof(int32_t) * (A23982_crd_size * 2)); + A23982_crd_size *= 2; + } + A23982_crd[i1543A2398] = i1543; + i1543A2398++; + } + + A23982_pos[i1542 + 1] = i1543A2398 - pA23982_begin; + } + } + + int32_t csA23982 = 0; + for (int32_t pA239820 = 1; pA239820 < (A23981_dimension + 1); pA239820++) { + csA23982 += A23982_pos[pA239820]; + A23982_pos[pA239820] = csA23982; + } + + A2398_vals = (double*)malloc(sizeof(double) * (i1543A2398 * A23983_dimension)); + + A2398->indices[1][0] = (uint8_t*)(A23982_pos); + A2398->indices[1][1] = (uint8_t*)(A23982_crd); + A2398->vals = (uint8_t*)A2398_vals; + return 0; +} + +int compute(taco_tensor_t *A2398, taco_tensor_t *B, taco_tensor_t *C) { + int A23981_dimension = (int)(A2398->dimensions[0]); + int A23983_dimension = (int)(A2398->dimensions[2]); + double* restrict A2398_vals = (double*)(A2398->vals); + int B1_dimension = (int)(B->dimensions[0]); + int* restrict B2_pos = (int*)(B->indices[1][0]); + int* restrict B2_crd = (int*)(B->indices[1][1]); + int* restrict B3_pos = (int*)(B->indices[2][0]); + int* restrict B3_crd = (int*)(B->indices[2][1]); + double* restrict B_vals = (double*)(B->vals); + int C1_dimension = (int)(C->dimensions[0]); + int C2_dimension = (int)(C->dimensions[1]); + double* restrict C_vals = (double*)(C->vals); + + // int32_t i1543A2398 = 0; + + #pragma omp parallel for schedule(runtime) + for (int32_t i1547 = 0; i1547 < ((B1_dimension + 15) / 16); i1547++) { + for (int32_t i1548 = 0; i1548 < 16; 
i1548++) { + int32_t i1542 = i1547 * 16 + i1548; + if (i1542 >= B1_dimension) + continue; + + for (int32_t i1543B = B2_pos[i1542]; i1543B < B2_pos[(i1542 + 1)]; i1543B++) { + for (int32_t i1545 = 0; i1545 < C2_dimension; i1545++) { + // int32_t i1545A2398 = i1543A2398 * A23983_dimension + i1545; + int32_t i1545A2398 = i1543B * A23983_dimension + i1545; + double ti1544A2398_val = 0.0; + for (int32_t i1544B = B3_pos[i1543B]; i1544B < B3_pos[(i1543B + 1)]; i1544B++) { + int32_t i1544 = B3_crd[i1544B]; + int32_t i1545C = i1544 * C2_dimension + i1545; + ti1544A2398_val += B_vals[i1544B] * C_vals[i1545C]; + } + A2398_vals[i1545A2398] = ti1544A2398_val; + } + // i1543A2398++; + } + } + } + return 0; +} +#include "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/ttm_ttm/ttm1_1.h" +int _shim_assemble(void** parameterPack) { + return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2])); +} +int _shim_compute(void** parameterPack) { + return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2])); +} diff --git a/test/kernels/ttm_ttm/ttm1_1.h b/test/kernels/ttm_ttm/ttm1_1.h new file mode 100644 index 000000000..4c631f227 --- /dev/null +++ b/test/kernels/ttm_ttm/ttm1_1.h @@ -0,0 +1,125 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? 
(_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = 
(taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +#ifndef TACO_GENERATED_assemble +#define TACO_GENERATED_assemble +int assemble(taco_tensor_t *A2398, taco_tensor_t *B, taco_tensor_t *C); +#endif + +#ifndef TACO_GENERATED_compute +#define TACO_GENERATED_compute +int compute(taco_tensor_t *A2398, taco_tensor_t *B, taco_tensor_t *C); +#endif diff --git a/test/kernels/ttm_ttm/ttm1_1.so b/test/kernels/ttm_ttm/ttm1_1.so new file mode 100755 index 000000000..911c44fa1 Binary files /dev/null and b/test/kernels/ttm_ttm/ttm1_1.so differ diff --git a/test/kernels/ttm_ttm/ttm1_2.c b/test/kernels/ttm_ttm/ttm1_2.c new file mode 100644 index 000000000..b04e23a54 --- /dev/null +++ b/test/kernels/ttm_ttm/ttm1_2.c @@ -0,0 +1,219 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? 
(_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = 
(taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +int assemble(taco_tensor_t *A3056, taco_tensor_t *B, taco_tensor_t *A2886) { + int A30561_dimension = (int)(A3056->dimensions[0]); + int A30563_dimension = (int)(A3056->dimensions[2]); + int* restrict A30562_pos = (int*)(A3056->indices[1][0]); + int* restrict A30562_crd = (int*)(A3056->indices[1][1]); + double* restrict A3056_vals = (double*)(A3056->vals); + int B1_dimension = (int)(B->dimensions[0]); + int* restrict B2_pos = (int*)(B->indices[1][0]); + int* restrict B2_crd = (int*)(B->indices[1][1]); + + A30562_pos = (int32_t*)malloc(sizeof(int32_t) * (A30561_dimension + 1)); + A30562_pos[0] = 0; + for (int32_t pA30562 = 1; pA30562 < (A30561_dimension + 1); pA30562++) { + A30562_pos[pA30562] = 0; + } + int32_t A30562_crd_size = 1048576; + A30562_crd = (int32_t*)malloc(sizeof(int32_t) * A30562_crd_size); + int32_t i1543A3056 = 0; + + for (int32_t i1547 = 0; i1547 < ((B1_dimension + 15) / 16); i1547++) { + for (int32_t i1548 = 0; i1548 < 16; i1548++) { + int32_t i1542 = i1547 * 16 + i1548; + if (i1542 
>= B1_dimension) + continue; + + int32_t pA30562_begin = i1543A3056; + + for (int32_t i1543B = B2_pos[i1542]; i1543B < B2_pos[(i1542 + 1)]; i1543B++) { + int32_t i1543 = B2_crd[i1543B]; + if (A30562_crd_size <= i1543A3056) { + A30562_crd = (int32_t*)realloc(A30562_crd, sizeof(int32_t) * (A30562_crd_size * 2)); + A30562_crd_size *= 2; + } + A30562_crd[i1543A3056] = i1543; + i1543A3056++; + } + + A30562_pos[i1542 + 1] = i1543A3056 - pA30562_begin; + } + } + + int32_t csA30562 = 0; + for (int32_t pA305620 = 1; pA305620 < (A30561_dimension + 1); pA305620++) { + csA30562 += A30562_pos[pA305620]; + A30562_pos[pA305620] = csA30562; + } + + A3056_vals = (double*)malloc(sizeof(double) * (i1543A3056 * A30563_dimension)); + + A3056->indices[1][0] = (uint8_t*)(A30562_pos); + A3056->indices[1][1] = (uint8_t*)(A30562_crd); + A3056->vals = (uint8_t*)A3056_vals; + return 0; +} + +int compute(taco_tensor_t *A3056, taco_tensor_t *B, taco_tensor_t *A2886) { + int A30561_dimension = (int)(A3056->dimensions[0]); + int A30563_dimension = (int)(A3056->dimensions[2]); + double* restrict A3056_vals = (double*)(A3056->vals); + int B1_dimension = (int)(B->dimensions[0]); + int* restrict B2_pos = (int*)(B->indices[1][0]); + int* restrict B2_crd = (int*)(B->indices[1][1]); + int* restrict B3_pos = (int*)(B->indices[2][0]); + int* restrict B3_crd = (int*)(B->indices[2][1]); + double* restrict B_vals = (double*)(B->vals); + int A28861_dimension = (int)(A2886->dimensions[0]); + int A28862_dimension = (int)(A2886->dimensions[1]); + double* restrict A2886_vals = (double*)(A2886->vals); + + // int32_t i1543A3056 = 0; + + #pragma omp parallel for schedule(runtime) + for (int32_t i1547 = 0; i1547 < ((B1_dimension + 15) / 16); i1547++) { + for (int32_t i1548 = 0; i1548 < 16; i1548++) { + int32_t i1542 = i1547 * 16 + i1548; + if (i1542 >= B1_dimension) + continue; + + for (int32_t i1543B = B2_pos[i1542]; i1543B < B2_pos[(i1542 + 1)]; i1543B++) { + for (int32_t i1546 = 0; i1546 < A28862_dimension; 
i1546++) { + // int32_t i1546A3056 = i1543A3056 * A30563_dimension + i1546; + int32_t i1546A3056 = i1543B * A30563_dimension + i1546; + double ti1544A3056_val = 0.0; + for (int32_t i1544B = B3_pos[i1543B]; i1544B < B3_pos[(i1543B + 1)]; i1544B++) { + int32_t i1544 = B3_crd[i1544B]; + int32_t i1546A2886 = i1544 * A28862_dimension + i1546; + ti1544A3056_val += B_vals[i1544B] * A2886_vals[i1546A2886]; + } + A3056_vals[i1546A3056] = ti1544A3056_val; + } + // i1543A3056++; + } + } + } + return 0; +} +#include "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/ttm_ttm/ttm1_2.h" +int _shim_assemble(void** parameterPack) { + return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2])); +} +int _shim_compute(void** parameterPack) { + return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2])); +} diff --git a/test/kernels/ttm_ttm/ttm1_2.h b/test/kernels/ttm_ttm/ttm1_2.h new file mode 100644 index 000000000..86ebdb633 --- /dev/null +++ b/test/kernels/ttm_ttm/ttm1_2.h @@ -0,0 +1,125 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? 
(_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = 
(taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +#ifndef TACO_GENERATED_assemble +#define TACO_GENERATED_assemble +int assemble(taco_tensor_t *A3056, taco_tensor_t *B, taco_tensor_t *A2886); +#endif + +#ifndef TACO_GENERATED_compute +#define TACO_GENERATED_compute +int compute(taco_tensor_t *A3056, taco_tensor_t *B, taco_tensor_t *A2886); +#endif diff --git a/test/kernels/ttm_ttm/ttm1_2.so b/test/kernels/ttm_ttm/ttm1_2.so new file mode 100755 index 000000000..c698ec991 Binary files /dev/null and b/test/kernels/ttm_ttm/ttm1_2.so differ diff --git a/test/kernels/ttm_ttm/ttm2.c b/test/kernels/ttm_ttm/ttm2.c new file mode 100644 index 000000000..e98f44e35 --- /dev/null +++ b/test/kernels/ttm_ttm/ttm2.c @@ -0,0 +1,218 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? 
(_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = 
(taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +int assemble(taco_tensor_t *A2593, taco_tensor_t *A2398, taco_tensor_t *D) { + int A25931_dimension = (int)(A2593->dimensions[0]); + int A25933_dimension = (int)(A2593->dimensions[2]); + int* restrict A25932_pos = (int*)(A2593->indices[1][0]); + int* restrict A25932_crd = (int*)(A2593->indices[1][1]); + double* restrict A2593_vals = (double*)(A2593->vals); + int A23981_dimension = (int)(A2398->dimensions[0]); + int* restrict A23982_pos = (int*)(A2398->indices[1][0]); + int* restrict A23982_crd = (int*)(A2398->indices[1][1]); + + A25932_pos = (int32_t*)malloc(sizeof(int32_t) * (A25931_dimension + 1)); + A25932_pos[0] = 0; + for (int32_t pA25932 = 1; pA25932 < (A25931_dimension + 1); pA25932++) { + A25932_pos[pA25932] = 0; + } + int32_t A25932_crd_size = 1048576; + A25932_crd = (int32_t*)malloc(sizeof(int32_t) * A25932_crd_size); + int32_t i1543A2593 = 0; + + for (int32_t i1547 = 0; i1547 < ((A23981_dimension + 15) / 16); i1547++) { + for (int32_t i1548 = 0; i1548 < 16; i1548++) { + int32_t i1542 = 
i1547 * 16 + i1548; + if (i1542 >= A23981_dimension) + continue; + + int32_t pA25932_begin = i1543A2593; + + for (int32_t i1543A2398 = A23982_pos[i1542]; i1543A2398 < A23982_pos[(i1542 + 1)]; i1543A2398++) { + int32_t i1543 = A23982_crd[i1543A2398]; + if (A25932_crd_size <= i1543A2593) { + A25932_crd = (int32_t*)realloc(A25932_crd, sizeof(int32_t) * (A25932_crd_size * 2)); + A25932_crd_size *= 2; + } + A25932_crd[i1543A2593] = i1543; + i1543A2593++; + } + + A25932_pos[i1542 + 1] = i1543A2593 - pA25932_begin; + } + } + + int32_t csA25932 = 0; + for (int32_t pA259320 = 1; pA259320 < (A25931_dimension + 1); pA259320++) { + csA25932 += A25932_pos[pA259320]; + A25932_pos[pA259320] = csA25932; + } + + A2593_vals = (double*)malloc(sizeof(double) * (i1543A2593 * A25933_dimension)); + + A2593->indices[1][0] = (uint8_t*)(A25932_pos); + A2593->indices[1][1] = (uint8_t*)(A25932_crd); + A2593->vals = (uint8_t*)A2593_vals; + return 0; +} + +int compute(taco_tensor_t *A2593, taco_tensor_t *A2398, taco_tensor_t *D) { + int A25931_dimension = (int)(A2593->dimensions[0]); + int A25933_dimension = (int)(A2593->dimensions[2]); + double* restrict A2593_vals = (double*)(A2593->vals); + int A23981_dimension = (int)(A2398->dimensions[0]); + int A23983_dimension = (int)(A2398->dimensions[2]); + int* restrict A23982_pos = (int*)(A2398->indices[1][0]); + int* restrict A23982_crd = (int*)(A2398->indices[1][1]); + double* restrict A2398_vals = (double*)(A2398->vals); + int D1_dimension = (int)(D->dimensions[0]); + int D2_dimension = (int)(D->dimensions[1]); + double* restrict D_vals = (double*)(D->vals); + +// int32_t i1543A2593 = 0; + + #pragma omp parallel for schedule(runtime) + for (int32_t i1547 = 0; i1547 < ((A23981_dimension + 15) / 16); i1547++) { + for (int32_t i1548 = 0; i1548 < 16; i1548++) { + int32_t i1542 = i1547 * 16 + i1548; + if (i1542 >= A23981_dimension) + continue; + + for (int32_t i1543A2398 = A23982_pos[i1542]; i1543A2398 < A23982_pos[(i1542 + 1)]; i1543A2398++) { + for 
(int32_t i1546 = 0; i1546 < D2_dimension; i1546++) { + // int32_t i1546A2593 = i1543A2593 * A25933_dimension + i1546; + int32_t i1546A2593 = i1543A2398 * A25933_dimension + i1546; + double ti1545A2593_val = 0.0; + for (int32_t i1545 = 0; i1545 < D1_dimension; i1545++) { + int32_t i1545A2398 = i1543A2398 * A23983_dimension + i1545; + int32_t i1546D = i1545 * D2_dimension + i1546; + ti1545A2593_val += A2398_vals[i1545A2398] * D_vals[i1546D]; + } + A2593_vals[i1546A2593] = ti1545A2593_val; + } + // i1543A2593++; + } + } + } + return 0; +} +#include "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/ttm_ttm/ttm2.h" +int _shim_assemble(void** parameterPack) { + return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2])); +} +int _shim_compute(void** parameterPack) { + return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2])); +} diff --git a/test/kernels/ttm_ttm/ttm2.h b/test/kernels/ttm_ttm/ttm2.h new file mode 100644 index 000000000..40f1400d1 --- /dev/null +++ b/test/kernels/ttm_ttm/ttm2.h @@ -0,0 +1,125 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? 
(_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = 
(taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +#ifndef TACO_GENERATED_assemble +#define TACO_GENERATED_assemble +int assemble(taco_tensor_t *A2593, taco_tensor_t *A2398, taco_tensor_t *D); +#endif + +#ifndef TACO_GENERATED_compute +#define TACO_GENERATED_compute +int compute(taco_tensor_t *A2593, taco_tensor_t *A2398, taco_tensor_t *D); +#endif diff --git a/test/kernels/ttm_ttm/ttm2.so b/test/kernels/ttm_ttm/ttm2.so new file mode 100755 index 000000000..16a3d2542 Binary files /dev/null and b/test/kernels/ttm_ttm/ttm2.so differ diff --git a/test/kernels/ttm_ttm/ttm_original copy 2.c b/test/kernels/ttm_ttm/ttm_original copy 2.c new file mode 100644 index 000000000..cb21b209f --- /dev/null +++ b/test/kernels/ttm_ttm/ttm_original copy 2.c @@ -0,0 +1,242 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? 
(_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = 
(taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +int assemble(taco_tensor_t *A1537, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D) { + int A15371_dimension = (int)(A1537->dimensions[0]); + int A15373_dimension = (int)(A1537->dimensions[2]); + int* restrict A15372_pos = (int*)(A1537->indices[1][0]); + int* restrict A15372_crd = (int*)(A1537->indices[1][1]); + double* restrict A1537_vals = (double*)(A1537->vals); + int B1_dimension = (int)(B->dimensions[0]); + int* restrict B2_pos = (int*)(B->indices[1][0]); + int* restrict B2_crd = (int*)(B->indices[1][1]); + + A15372_pos = (int32_t*)malloc(sizeof(int32_t) * (A15371_dimension + 1)); + A15372_pos[0] = 0; + for (int32_t pA15372 = 1; pA15372 < (A15371_dimension + 1); pA15372++) { + A15372_pos[pA15372] = 0; + } + int32_t A15372_crd_size = 1048576; + A15372_crd = (int32_t*)malloc(sizeof(int32_t) * A15372_crd_size); + int32_t i1543A1537 = 0; + + for (int32_t i1547 = 0; i1547 < ((B1_dimension + 15) / 16); i1547++) { + for (int32_t i1548 = 0; i1548 < 16; i1548++) { + int32_t i1542 = i1547 * 16 + 
i1548; + if (i1542 >= B1_dimension) + continue; + + int32_t pA15372_begin = i1543A1537; + + for (int32_t i1543B = B2_pos[i1542]; i1543B < B2_pos[(i1542 + 1)]; i1543B++) { + int32_t i1543 = B2_crd[i1543B]; + if (A15372_crd_size <= i1543A1537) { + A15372_crd = (int32_t*)realloc(A15372_crd, sizeof(int32_t) * (A15372_crd_size * 2)); + A15372_crd_size *= 2; + } + A15372_crd[i1543A1537] = i1543; + i1543A1537++; + } + + A15372_pos[i1542 + 1] = i1543A1537 - pA15372_begin; + } + } + + int32_t csA15372 = 0; + for (int32_t pA153720 = 1; pA153720 < (A15371_dimension + 1); pA153720++) { + csA15372 += A15372_pos[pA153720]; + A15372_pos[pA153720] = csA15372; + } + + A1537_vals = (double*)malloc(sizeof(double) * (i1543A1537 * A15373_dimension)); + + A1537->indices[1][0] = (uint8_t*)(A15372_pos); + A1537->indices[1][1] = (uint8_t*)(A15372_crd); + A1537->vals = (uint8_t*)A1537_vals; + return 0; +} + +int compute(taco_tensor_t *A1537, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D) { + int A15371_dimension = (int)(A1537->dimensions[0]); + int A15373_dimension = (int)(A1537->dimensions[2]); + int* restrict A15372_pos = (int*)(A1537->indices[1][0]); + double* restrict A1537_vals = (double*)(A1537->vals); + int B1_dimension = (int)(B->dimensions[0]); + int* restrict B2_pos = (int*)(B->indices[1][0]); + int* restrict B2_crd = (int*)(B->indices[1][1]); + int* restrict B3_pos = (int*)(B->indices[2][0]); + int* restrict B3_crd = (int*)(B->indices[2][1]); + double* restrict B_vals = (double*)(B->vals); + int C1_dimension = (int)(C->dimensions[0]); + int C2_dimension = (int)(C->dimensions[1]); + double* restrict C_vals = (double*)(C->vals); + int D1_dimension = (int)(D->dimensions[0]); + int D2_dimension = (int)(D->dimensions[1]); + double* restrict D_vals = (double*)(D->vals); + + // int32_t i1543A1537 = 0; + + #pragma omp parallel for schedule(static) + for (int32_t pA1537 = 0; pA1537 < (A15372_pos[A15371_dimension] * A15373_dimension); pA1537++) { + A1537_vals[pA1537] = 0.0; + } + + 
#pragma omp parallel for schedule(runtime) + for (int32_t i1547 = 0; i1547 < ((B1_dimension + 15) / 16); i1547++) { + for (int32_t i1548 = 0; i1548 < 16; i1548++) { + int32_t i1542 = i1547 * 16 + i1548; + if (i1542 >= B1_dimension) + continue; + + for (int32_t i1543B = B2_pos[i1542]; i1543B < B2_pos[(i1542 + 1)]; i1543B++) { + for (int32_t i1544B = B3_pos[i1543B]; i1544B < B3_pos[(i1543B + 1)]; i1544B++) { + int32_t i1544 = B3_crd[i1544B]; + for (int32_t i1553 = 0; i1553 < ((D1_dimension + 31) / 32); i1553++) { + for (int32_t i1555 = 0; i1555 < ((D2_dimension + 31) / 32); i1555++) { + for (int32_t i1554 = 0; i1554 < 32; i1554++) { + int32_t i1545 = i1553 * 32 + i1554; + int32_t i1545C = i1544 * C2_dimension + i1545; + if (i1545 >= D1_dimension) + continue; + + for (int32_t i1556 = 0; i1556 < 32; i1556++) { + int32_t i1546 = i1555 * 32 + i1556; + // int32_t i1546A1537 = i1543A1537 * A15373_dimension + i1546; + int32_t i1546A1537 = i1544B * A15373_dimension + i1546; + int32_t i1546D = i1545 * D2_dimension + i1546; + if (i1546 >= D2_dimension) + continue; + + A1537_vals[i1546A1537] = A1537_vals[i1546A1537] + (B_vals[i1544B] * C_vals[i1545C]) * D_vals[i1546D]; + } + } + } + } + } + + // i1543A1537++; + } + } + } + return 0; +} +#include "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/ttm_ttm/ttm_original.h" +int _shim_assemble(void** parameterPack) { + return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3])); +} +int _shim_compute(void** parameterPack) { + return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3])); +} diff --git a/test/kernels/ttm_ttm/ttm_original copy.c b/test/kernels/ttm_ttm/ttm_original copy.c new file mode 100644 index 000000000..2db396c0a --- /dev/null +++ b/test/kernels/ttm_ttm/ttm_original copy.c @@ -0,0 +1,225 @@ +#ifndef 
TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + 
upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +int assemble(taco_tensor_t *A1537, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D) { + int A15371_dimension = (int)(A1537->dimensions[0]); + int A15373_dimension = (int)(A1537->dimensions[2]); + int* restrict A15372_pos = (int*)(A1537->indices[1][0]); + int* restrict A15372_crd = (int*)(A1537->indices[1][1]); + double* restrict A1537_vals = (double*)(A1537->vals); + int B1_dimension = (int)(B->dimensions[0]); + int* restrict B2_pos = (int*)(B->indices[1][0]); + int* restrict B2_crd = (int*)(B->indices[1][1]); + + A15372_pos = (int32_t*)malloc(sizeof(int32_t) * (A15371_dimension + 1)); + A15372_pos[0] = 0; + for (int32_t pA15372 = 1; pA15372 < (A15371_dimension + 1); pA15372++) { + A15372_pos[pA15372] = 0; + } + int32_t A15372_crd_size = 1048576; + A15372_crd = 
(int32_t*)malloc(sizeof(int32_t) * A15372_crd_size); + int32_t i1543A1537 = 0; + + for (int32_t i1547 = 0; i1547 < ((B1_dimension + 15) / 16); i1547++) { + for (int32_t i1548 = 0; i1548 < 16; i1548++) { + int32_t i1542 = i1547 * 16 + i1548; + if (i1542 >= B1_dimension) + continue; + + int32_t pA15372_begin = i1543A1537; + + for (int32_t i1543B = B2_pos[i1542]; i1543B < B2_pos[(i1542 + 1)]; i1543B++) { + int32_t i1543 = B2_crd[i1543B]; + if (A15372_crd_size <= i1543A1537) { + A15372_crd = (int32_t*)realloc(A15372_crd, sizeof(int32_t) * (A15372_crd_size * 2)); + A15372_crd_size *= 2; + } + A15372_crd[i1543A1537] = i1543; + i1543A1537++; + } + + A15372_pos[i1542 + 1] = i1543A1537 - pA15372_begin; + } + } + + int32_t csA15372 = 0; + for (int32_t pA153720 = 1; pA153720 < (A15371_dimension + 1); pA153720++) { + csA15372 += A15372_pos[pA153720]; + A15372_pos[pA153720] = csA15372; + } + + A1537_vals = (double*)malloc(sizeof(double) * (i1543A1537 * A15373_dimension)); + + A1537->indices[1][0] = (uint8_t*)(A15372_pos); + A1537->indices[1][1] = (uint8_t*)(A15372_crd); + A1537->vals = (uint8_t*)A1537_vals; + return 0; +} + +int compute(taco_tensor_t *A1537, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D) { + int A15371_dimension = (int)(A1537->dimensions[0]); + int A15373_dimension = (int)(A1537->dimensions[2]); + double* restrict A1537_vals = (double*)(A1537->vals); + int B1_dimension = (int)(B->dimensions[0]); + int* restrict B2_pos = (int*)(B->indices[1][0]); + int* restrict B2_crd = (int*)(B->indices[1][1]); + int* restrict B3_pos = (int*)(B->indices[2][0]); + int* restrict B3_crd = (int*)(B->indices[2][1]); + double* restrict B_vals = (double*)(B->vals); + int C1_dimension = (int)(C->dimensions[0]); + int C2_dimension = (int)(C->dimensions[1]); + double* restrict C_vals = (double*)(C->vals); + int D1_dimension = (int)(D->dimensions[0]); + int D2_dimension = (int)(D->dimensions[1]); + double* restrict D_vals = (double*)(D->vals); + + // int32_t i1543A1537 = 0; + + 
#pragma omp parallel for schedule(runtime) + for (int32_t i1547 = 0; i1547 < ((B1_dimension + 15) / 16); i1547++) { + for (int32_t i1548 = 0; i1548 < 16; i1548++) { + int32_t i1542 = i1547 * 16 + i1548; + if (i1542 >= B1_dimension) + continue; + + for (int32_t i1543B = B2_pos[i1542]; i1543B < B2_pos[(i1542 + 1)]; i1543B++) { + for (int32_t i1546 = 0; i1546 < D2_dimension; i1546++) { + // int32_t i1546A1537 = i1543A1537 * A15373_dimension + i1546; + int32_t i1546A1537 = i1543B * A15373_dimension + i1546; + double ti1544A1537_val = 0.0; + for (int32_t i1544B = B3_pos[i1543B]; i1544B < B3_pos[(i1543B + 1)]; i1544B++) { + int32_t i1544 = B3_crd[i1544B]; + for (int32_t i1545 = 0; i1545 < D1_dimension; i1545++) { + int32_t i1545C = i1544 * C2_dimension + i1545; + int32_t i1546D = i1545 * D2_dimension + i1546; + ti1544A1537_val += (B_vals[i1544B] * C_vals[i1545C]) * D_vals[i1546D]; + } + } + A1537_vals[i1546A1537] = ti1544A1537_val; + } + // i1543A1537++; + } + } + } + return 0; +} +#include "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/ttm_ttm/ttm_original.h" +int _shim_assemble(void** parameterPack) { + return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3])); +} +int _shim_compute(void** parameterPack) { + return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3])); +} diff --git a/test/kernels/ttm_ttm/ttm_original.c b/test/kernels/ttm_ttm/ttm_original.c new file mode 100644 index 000000000..ac2674239 --- /dev/null +++ b/test/kernels/ttm_ttm/ttm_original.c @@ -0,0 +1,226 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? 
(_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = 
(taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +int assemble(taco_tensor_t *A1537, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D) { + int A15371_dimension = (int)(A1537->dimensions[0]); + int A15373_dimension = (int)(A1537->dimensions[2]); + int* restrict A15372_pos = (int*)(A1537->indices[1][0]); + int* restrict A15372_crd = (int*)(A1537->indices[1][1]); + double* restrict A1537_vals = (double*)(A1537->vals); + int B1_dimension = (int)(B->dimensions[0]); + int* restrict B2_pos = (int*)(B->indices[1][0]); + int* restrict B2_crd = (int*)(B->indices[1][1]); + + A15372_pos = (int32_t*)malloc(sizeof(int32_t) * (A15371_dimension + 1)); + A15372_pos[0] = 0; + for (int32_t pA15372 = 1; pA15372 < (A15371_dimension + 1); pA15372++) { + A15372_pos[pA15372] = 0; + } + int32_t A15372_crd_size = 1048576; + A15372_crd = (int32_t*)malloc(sizeof(int32_t) * A15372_crd_size); + int32_t i1543A1537 = 0; + + for (int32_t i1547 = 0; i1547 < ((B1_dimension + 15) / 16); i1547++) { + for (int32_t i1548 = 0; i1548 < 16; i1548++) { + int32_t i1542 = i1547 * 16 + 
i1548; + if (i1542 >= B1_dimension) + continue; + + int32_t pA15372_begin = i1543A1537; + + for (int32_t i1543B = B2_pos[i1542]; i1543B < B2_pos[(i1542 + 1)]; i1543B++) { + int32_t i1543 = B2_crd[i1543B]; + if (A15372_crd_size <= i1543A1537) { + A15372_crd = (int32_t*)realloc(A15372_crd, sizeof(int32_t) * (A15372_crd_size * 2)); + A15372_crd_size *= 2; + } + A15372_crd[i1543A1537] = i1543; + i1543A1537++; + } + + A15372_pos[i1542 + 1] = i1543A1537 - pA15372_begin; + } + } + + int32_t csA15372 = 0; + for (int32_t pA153720 = 1; pA153720 < (A15371_dimension + 1); pA153720++) { + csA15372 += A15372_pos[pA153720]; + A15372_pos[pA153720] = csA15372; + } + + A1537_vals = (double*)malloc(sizeof(double) * (i1543A1537 * A15373_dimension)); + + A1537->indices[1][0] = (uint8_t*)(A15372_pos); + A1537->indices[1][1] = (uint8_t*)(A15372_crd); + A1537->vals = (uint8_t*)A1537_vals; + return 0; +} + +int compute(taco_tensor_t *A1537, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D) { + int A15371_dimension = (int)(A1537->dimensions[0]); + int A15373_dimension = (int)(A1537->dimensions[2]); + double* restrict A1537_vals = (double*)(A1537->vals); + int B1_dimension = (int)(B->dimensions[0]); + int* restrict B2_pos = (int*)(B->indices[1][0]); + int* restrict B2_crd = (int*)(B->indices[1][1]); + int* restrict B3_pos = (int*)(B->indices[2][0]); + int* restrict B3_crd = (int*)(B->indices[2][1]); + double* restrict B_vals = (double*)(B->vals); + int C1_dimension = (int)(C->dimensions[0]); + int C2_dimension = (int)(C->dimensions[1]); + double* restrict C_vals = (double*)(C->vals); + int D1_dimension = (int)(D->dimensions[0]); + int D2_dimension = (int)(D->dimensions[1]); + double* restrict D_vals = (double*)(D->vals); + + // int32_t i1543A1537 = 0; + + #pragma omp parallel for schedule(runtime) + for (int32_t i1547 = 0; i1547 < ((B1_dimension + 15) / 16); i1547++) { + for (int32_t i1548 = 0; i1548 < 16; i1548++) { + int32_t i1542 = i1547 * 16 + i1548; + if (i1542 >= B1_dimension) + 
continue; + + for (int32_t i1543B = B2_pos[i1542]; i1543B < B2_pos[(i1542 + 1)]; i1543B++) { + for (int32_t i1546 = 0; i1546 < D2_dimension; i1546++) { + // int32_t i1546A1537 = i1543A1537 * A15373_dimension + i1546; + int32_t i1546A1537 = i1543B * A15373_dimension + i1546; + double ti1544A1537_val = 0.0; + for (int32_t i1544B = B3_pos[i1543B]; i1544B < B3_pos[(i1543B + 1)]; i1544B++) { + int32_t i1544 = B3_crd[i1544B]; + for (int32_t i1545 = 0; i1545 < D1_dimension; i1545++) { + int32_t i1545C = i1544 * C2_dimension + i1545; + int32_t i1546D = i1545 * D2_dimension + i1546; + ti1544A1537_val += (B_vals[i1544B] * C_vals[i1545C]) * D_vals[i1546D]; + } + } + A1537_vals[i1546A1537] = ti1544A1537_val; + } + // i1543A1537++; + } + } + } + return 0; +} + +#include "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/ttm_ttm/ttm_original.h" +int _shim_assemble(void** parameterPack) { + return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3])); +} +int _shim_compute(void** parameterPack) { + return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3])); +} diff --git a/test/kernels/ttm_ttm/ttm_original.h b/test/kernels/ttm_ttm/ttm_original.h new file mode 100644 index 000000000..a27841047 --- /dev/null +++ b/test/kernels/ttm_ttm/ttm_original.h @@ -0,0 +1,125 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? 
(_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = 
(taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +#ifndef TACO_GENERATED_assemble +#define TACO_GENERATED_assemble +int assemble(taco_tensor_t *A1537, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D); +#endif + +#ifndef TACO_GENERATED_compute +#define TACO_GENERATED_compute +int compute(taco_tensor_t *A1537, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D); +#endif diff --git a/test/kernels/ttm_ttm/ttm_original.so b/test/kernels/ttm_ttm/ttm_original.so new file mode 100755 index 000000000..fa04aed35 Binary files /dev/null and b/test/kernels/ttm_ttm/ttm_original.so differ diff --git a/test/kernels/ttm_ttm/ttm_original2.c b/test/kernels/ttm_ttm/ttm_original2.c new file mode 100644 index 000000000..8dd62d6dd --- /dev/null +++ b/test/kernels/ttm_ttm/ttm_original2.c @@ -0,0 +1,229 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? 
(_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = 
(taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +int assemble(taco_tensor_t *A1542, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D) { + int A15421_dimension = (int)(A1542->dimensions[0]); + int A15423_dimension = (int)(A1542->dimensions[2]); + int* restrict A15422_pos = (int*)(A1542->indices[1][0]); + int* restrict A15422_crd = (int*)(A1542->indices[1][1]); + double* restrict A1542_vals = (double*)(A1542->vals); + int B1_dimension = (int)(B->dimensions[0]); + int* restrict B2_pos = (int*)(B->indices[1][0]); + int* restrict B2_crd = (int*)(B->indices[1][1]); + + A15422_pos = (int32_t*)malloc(sizeof(int32_t) * (A15421_dimension + 1)); + A15422_pos[0] = 0; + for (int32_t pA15422 = 1; pA15422 < (A15421_dimension + 1); pA15422++) { + A15422_pos[pA15422] = 0; + } + int32_t A15422_crd_size = 1048576; + A15422_crd = (int32_t*)malloc(sizeof(int32_t) * A15422_crd_size); + int32_t i1548A1542 = 0; + + for (int32_t i1552 = 0; i1552 < ((B1_dimension + 15) / 16); i1552++) { + for (int32_t i1553 = 0; i1553 < 16; i1553++) { + int32_t i1547 = i1552 * 16 + 
i1553; + if (i1547 >= B1_dimension) + continue; + + int32_t pA15422_begin = i1548A1542; + + for (int32_t i1548B = B2_pos[i1547]; i1548B < B2_pos[(i1547 + 1)]; i1548B++) { + int32_t i1548 = B2_crd[i1548B]; + if (A15422_crd_size <= i1548A1542) { + A15422_crd = (int32_t*)realloc(A15422_crd, sizeof(int32_t) * (A15422_crd_size * 2)); + A15422_crd_size *= 2; + } + A15422_crd[i1548A1542] = i1548; + i1548A1542++; + } + + A15422_pos[i1547 + 1] = i1548A1542 - pA15422_begin; + } + } + + int32_t csA15422 = 0; + for (int32_t pA154220 = 1; pA154220 < (A15421_dimension + 1); pA154220++) { + csA15422 += A15422_pos[pA154220]; + A15422_pos[pA154220] = csA15422; + } + + A1542_vals = (double*)malloc(sizeof(double) * (i1548A1542 * A15423_dimension)); + + A1542->indices[1][0] = (uint8_t*)(A15422_pos); + A1542->indices[1][1] = (uint8_t*)(A15422_crd); + A1542->vals = (uint8_t*)A1542_vals; + return 0; +} + +int compute(taco_tensor_t *A1542, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D) { + int A15421_dimension = (int)(A1542->dimensions[0]); + int A15423_dimension = (int)(A1542->dimensions[2]); + int* restrict A15422_pos = (int*)(A1542->indices[1][0]); + double* restrict A1542_vals = (double*)(A1542->vals); + int B1_dimension = (int)(B->dimensions[0]); + int* restrict B2_pos = (int*)(B->indices[1][0]); + int* restrict B2_crd = (int*)(B->indices[1][1]); + int* restrict B3_pos = (int*)(B->indices[2][0]); + int* restrict B3_crd = (int*)(B->indices[2][1]); + double* restrict B_vals = (double*)(B->vals); + int C1_dimension = (int)(C->dimensions[0]); + int C2_dimension = (int)(C->dimensions[1]); + double* restrict C_vals = (double*)(C->vals); + int D1_dimension = (int)(D->dimensions[0]); + int D2_dimension = (int)(D->dimensions[1]); + double* restrict D_vals = (double*)(D->vals); + +// int32_t i1548A1542 = 0; + + #pragma omp parallel for schedule(static) + for (int32_t pA1542 = 0; pA1542 < (A15422_pos[A15421_dimension] * A15423_dimension); pA1542++) { + A1542_vals[pA1542] = 0.0; + } + + 
#pragma omp parallel for schedule(runtime) + for (int32_t i1552 = 0; i1552 < ((B1_dimension + 15) / 16); i1552++) { + for (int32_t i1553 = 0; i1553 < 16; i1553++) { + int32_t i1547 = i1552 * 16 + i1553; + if (i1547 >= B1_dimension) + continue; + + for (int32_t i1548B = B2_pos[i1547]; i1548B < B2_pos[(i1547 + 1)]; i1548B++) { + for (int32_t i1549B = B3_pos[i1548B]; i1549B < B3_pos[(i1548B + 1)]; i1549B++) { + int32_t i1549 = B3_crd[i1549B]; + for (int32_t i1550 = 0; i1550 < D1_dimension; i1550++) { + int32_t i1550C = i1549 * C2_dimension + i1550; + for (int32_t i1551 = 0; i1551 < D2_dimension; i1551++) { + // int32_t i1551A1542 = i1548A1542 * A15423_dimension + i1551; + int32_t i1551A1542 = i1548B * A15423_dimension + i1551; + int32_t i1551D = i1550 * D2_dimension + i1551; + A1542_vals[i1551A1542] = A1542_vals[i1551A1542] + (B_vals[i1549B] * C_vals[i1550C]) * D_vals[i1551D]; + } + } + } + // i1548A1542++; + } + } + } + return 0; +} +#include "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/ttm_ttm/ttm_original2.h" +int _shim_assemble(void** parameterPack) { + return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3])); +} +int _shim_compute(void** parameterPack) { + return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3])); +} diff --git a/test/kernels/ttm_ttm/ttm_original2.h b/test/kernels/ttm_ttm/ttm_original2.h new file mode 100644 index 000000000..8a08b4548 --- /dev/null +++ b/test/kernels/ttm_ttm/ttm_original2.h @@ -0,0 +1,125 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? 
(_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = 
(taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +#ifndef TACO_GENERATED_assemble +#define TACO_GENERATED_assemble +int assemble(taco_tensor_t *A1542, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D); +#endif + +#ifndef TACO_GENERATED_compute +#define TACO_GENERATED_compute +int compute(taco_tensor_t *A1542, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D); +#endif diff --git a/test/kernels/ttm_ttm/ttm_original2.so b/test/kernels/ttm_ttm/ttm_original2.so new file mode 100755 index 000000000..6466a2af2 Binary files /dev/null and b/test/kernels/ttm_ttm/ttm_original2.so differ diff --git a/test/test.cpp b/test/test.cpp index a49f10ff7..851493b7f 100644 --- a/test/test.cpp +++ b/test/test.cpp @@ -38,6 +38,20 @@ void ASSERT_TENSOR_EQ(TensorBase expected, TensorBase actual) { ASSERT_TRUE(equals(expected, actual)); } +// void ASSERT_TENSOR_VAL(TensorBase expected, TensorBase actual) { +// std::cout << "order: " << expected.getOrder(); +// std::vector modes{}; +// for (int mode = 0; mode < expected.getOrder(); mode++) { +// if 
(expected.getDimension(mode) != actual.getDimension(mode)) { +// ASSERT_TRUE(false); +// } + +// for (int i=0; i expected, void ASSERT_STORAGE_EQ(TensorStorage expected, TensorStorage actual); void ASSERT_TENSOR_EQ(TensorBase expected, TensorBase actual); +// void ASSERT_TENSOR_VAL(TensorBase expected, TensorBase actual); template void ASSERT_COMPONENTS_EQUALS(vector>> expectedIndices, diff --git a/test/tests-indexstmt.cpp b/test/tests-indexstmt.cpp index e2a972430..ae80e5493 100644 --- a/test/tests-indexstmt.cpp +++ b/test/tests-indexstmt.cpp @@ -1,10 +1,13 @@ +#include "taco/index_notation/kernel.h" +#include "taco/type.h" #include "test.h" #include "test_tensors.h" #include "taco/tensor.h" #include "taco/index_notation/index_notation.h" +#include "taco/index_notation/transformations.h" using namespace taco; -const IndexVar i("i"), j("j"), k("k"); +const IndexVar i("i"), j("j"), k("k"), l("l"), m("m"); TEST(indexstmt, assignment) { Type t(type(), {3}); @@ -84,4 +87,192 @@ TEST(indexstmt, spmm) { } +TEST(indexstmt, sddmm) { + Type t(type(), {3,3}); + TensorVar A("A", t, {Sparse, Dense}); + TensorVar B("B", t, {Sparse, Dense}); + TensorVar C("C", t, {Dense, Dense}); + TensorVar w("w", Type(type(),{3}), Dense); + + // the below expression is the concrete index notation + // where (consumer, producer) + IndexStmt spmm = forall(i, + forall(k, + where(forall(j, A(i,j) = w(j)), + forall(j, w(j) += B(i,k)*C(k,j)) + ) + ) + ); + + // after adding scheduling transformations to this concrete-topologically sorted index stmt + // + + std::cout << spmm << std::endl; + spmm = reorderLoopsTopologically(spmm); + std::cout << "topologically reordered loops statement: " << spmm << std::endl; + + Kernel kernel = compile(spmm); +} + +TEST(indexstmt, sddmmPlusSpmm) { + + // Y(i,l) = B(i,j)*C(i,k)*D(k,j) * F(j,l); + // indexstmt order i, j, k, l + //topologically reordered loops statement: forall(i, forall(k, forall(j, forall(l, Y(i,l) += B(i,j) * C(i,k) * D(k,j) * F(j,l), NotParallel, 
IgnoreRaces), NotParallel, IgnoreRaces), NotParallel, IgnoreRaces), NotParallel, IgnoreRaces) + + Type t(type(), {3,3}); + TensorVar Y("Y", t, {Dense, Dense}); + TensorVar B("B", t, {Dense, Sparse}); + TensorVar C("C", t, {Dense, Dense}); + TensorVar D("D", t, {Dense, Dense}); + TensorVar E("E", t, {Dense, Dense}); + + // TensorVar A("A", Type(type(),{3}), ); + TensorVar A("A", Type()); + + IndexStmt fused1 = + forall(i, + forall(j, + forall(k, + forall(l, Y(i,l) += B(i,j) * C(i,k) * D(j,k) * E(j,l)) + ) + ) + ); + + std::cout << "before topological sort" << fused1 << std::endl; + fused1 = reorderLoopsTopologically(fused1); + std::cout << "after topological sort" << fused1 << std::endl; + + Kernel kernel = compile(fused1); + + + IndexStmt fused2 = + forall(i, + forall(j, + where( + forall(l, Y(i,l) += A * E(j,l)), // consumer + forall(k, A += B(i,j)*C(i,k)*D(j,k)) // producer + ) + ) + ); + + Kernel kernel2 = compile(fused2); + +} + + + +TEST(indexstmt, mttkrpPlusSpmm) { + + // ./bin/taco "A(i,m)=B(i,k,l)*C(k,j)*D(l,j)*E(j,m)" -f=A:dd:0,1 -f=B:sss:0,1,2 -f=C:dd:0,1 -f=D:dd:0,1 -f=E:dd:0,1 + + // i = 11, k = 5, l = 7, j = 8; + long unsigned int idim = 11, kdim = 5, ldim = 7, jdim = 8, mdim = 6; + + Type atype(type(), {idim, mdim}); + Type btype(type(), {idim, kdim, ldim}); + Type ctype(type(), {kdim, jdim}); + Type dtype(type(), {ldim, jdim}); + Type etype(type(), {jdim, mdim}); + + TensorVar A("A", atype, {Dense, Dense}); + TensorVar B("B", btype, {Sparse, Sparse, Sparse}); + TensorVar C("C", ctype, {Dense, Dense}); + TensorVar D("D", dtype, {Dense, Dense}); + TensorVar E("E", etype, {Dense, Dense}); + + TensorVar ws("ws", Type(type(), {jdim}) ); + + IndexStmt fused1 = + forall(i, + forall(k, + forall(l, + forall(j, + forall(m, A(i,m) += B(i,k,l) * C(k,j) * D(l,j) * E(j,m)) + ) + ) + ) + ); + + std::cout << "before topological sort" << fused1 << std::endl; + fused1 = reorderLoopsTopologically(fused1); + std::cout << "after topological sort" << fused1 << std::endl; 
+ + Kernel kernel = compile(fused1); + + IndexStmt fused2 = + forall(i, + where( + forall(j, + forall(m, + A(i,m) += ws(j) * E(j,m) + ) + ) + , + forall(k, + forall(l, + forall(j, + ws(j) += B(i,k,l) * C(k,j) * D(l,j) + ) + ) + ) + ) + ); + + Kernel kernel2 = compile(fused2); + +} + +// ./bin/taco "y(i)=A(i,j)*B(j,k)*v(k)" -f=y:d:0 -f=A:dd:0,1 -f=B:dd:0,1 -f=v:d:0 +TEST(indexstmt, mmPlusSpmv) { + + // + + long unsigned int idim = 11, jdim = 8, kdim = 5; + + Type ytype(type(), {idim}); + Type atype(type(), {idim, jdim}); + Type btype(type(), {jdim, kdim}); + Type vtype(type(), {kdim}); + + TensorVar y("y", ytype, {Dense}); + TensorVar A("A", atype, {Dense, Dense}); + TensorVar B("B", btype, {Dense, Dense}); + TensorVar v("v", vtype, {Dense}); + + TensorVar ws("ws", Type(type(), {jdim}) ); + + IndexStmt fused1 = + forall(i, + forall(j, + forall(k, + forall(m, y(i) += A(i,j) * B(j,k) * v(k)) + ) + ) + ); + + std::cout << "before topological sort" << fused1 << std::endl; + fused1 = reorderLoopsTopologically(fused1); + std::cout << "after topological sort" << fused1 << std::endl; + + Kernel kernel = compile(fused1); + + IndexStmt fused2 = + where( + forall(i, + forall(j, + y(i) += A(i,j) * ws(j) + ) + ) + , + forall(j, + forall(k, + ws(j) += B(j,k) * v(k) + ) + ) + ); + + Kernel kernel2 = compile(fused2); +} + diff --git a/test/tests-scheduling-eval.cpp b/test/tests-scheduling-eval.cpp index 52bd74ab4..3c5362118 100644 --- a/test/tests-scheduling-eval.cpp +++ b/test/tests-scheduling-eval.cpp @@ -1,42 +1,8 @@ -#include -#include -#include -#include -#include "test.h" -#include "test_tensors.h" -#include "taco/tensor.h" -#include "taco/index_notation/index_notation.h" -#include "taco/index_notation/transformations.h" -#include "codegen/codegen.h" -#include "taco/lower/lower.h" - -using namespace taco; +#include "util.h" + const IndexVar i("i"), j("j"), k("k"), l("l"), m("m"), n("n"); int WARP_SIZE = 32; -void printToCout(IndexStmt stmt) { - std::shared_ptr codegen = 
ir::CodeGen::init_default(cout, ir::CodeGen::ImplementationGen); - ir::Stmt compute = lower(stmt, "compute", false, true); - codegen->compile(compute, true); -} - -void printToFile(string filename, IndexStmt stmt) { - stringstream source; - - string file_path = "eval_generated/"; - mkdir(file_path.c_str(), 0777); - - std::shared_ptr codegen = ir::CodeGen::init_default(source, ir::CodeGen::ImplementationGen); - ir::Stmt compute = lower(stmt, "compute", false, true); - codegen->compile(compute, true); - - ofstream source_file; - string file_ending = should_use_CUDA_codegen() ? ".cu" : ".c"; - source_file.open(file_path + filename + file_ending); - source_file << source.str(); - source_file.close(); -} - IndexStmt scheduleSpMVCPU(IndexStmt stmt, int CHUNK_SIZE=16) { IndexVar i0("i0"), i1("i1"), kpos("kpos"), kpos0("kpos0"), kpos1("kpos1"); return stmt.split(i, i0, i1, CHUNK_SIZE) @@ -107,6 +73,27 @@ IndexStmt scheduleSDDMMCPU(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, .parallelize(kpos1, ParallelUnit::CPUVector, OutputRaceStrategy::ParallelReduction); } +IndexStmt scheduleSDDMMCSRCPU(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { + IndexVar i0("i0"), i1("i1"), kpos("kpos"), kpos0("kpos0"), kpos1("kpos1"); + return stmt; + // return stmt.split(i, i0, i1, CHUNK_SIZE) + // .pos(k, kpos, B(i,k)) + // .split(kpos, kpos0, kpos1, UNROLL_FACTOR) + // .reorder({i0, i1, kpos0, j, kpos1}); + // .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces); + // .parallelize(k, ParallelUnit::CPUVector, OutputRaceStrategy::IgnoreRaces); +} + +IndexStmt scheduleSDDMM2CPU(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { + IndexVar i0("i0"), i1("i1"), jpos("jpos"), jpos0("jpos0"), jpos1("jpos1"); + return stmt.split(i, i0, i1, CHUNK_SIZE) + .pos(j, jpos, B(i,j)) + .split(jpos, jpos0, jpos1, UNROLL_FACTOR) + .reorder({i0, i1, jpos0, k, jpos1}) + .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces) + 
.parallelize(jpos1, ParallelUnit::CPUVector, OutputRaceStrategy::ParallelReduction); +} + IndexStmt scheduleTTVCPU(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16) { IndexVar f("f"), fpos("fpos"), chunk("chunk"), fpos2("fpos2"); return stmt.fuse(i, j, f) @@ -125,6 +112,13 @@ IndexStmt scheduleTTVCPUCSR(IndexStmt stmt) { OutputRaceStrategy::NoRaces); } +IndexStmt scheduleTTVCPUCSR_ST(IndexStmt stmt) { + TensorVar result = stmt.as().getStmt().as().getStmt() + .as().getStmt().as().getLhs() + .getTensorVar(); + return stmt.assemble(result, AssembleStrategy::Insert); +} + IndexStmt scheduleTTMCPU(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { IndexVar f("f"), fpos("fpos"), chunk("chunk"), fpos2("fpos2"), kpos("kpos"), kpos1("kpos1"), kpos2("kpos2"); return stmt.fuse(i, j, f) @@ -149,12 +143,30 @@ IndexStmt scheduleMTTKRPCPU(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, .parallelize(i1, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces); } +IndexStmt scheduleMTTKRPCPU_ST(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { + IndexVar i1("i1"), i2("i2"); + IndexExpr precomputeExpr = stmt.as().getStmt().as().getStmt() + .as().getStmt().as().getStmt() + .as().getRhs().as().getA(); + TensorVar w("w", Type(Float64, {Dimension(j)}), taco::dense); + return stmt.split(i, i1, i2, CHUNK_SIZE) + .reorder({i1, i2, k, l, j}) + .precompute(precomputeExpr, j, j, w); + // .parallelize(j, ParallelUnit::CPUVector, OutputRaceStrategy::Atomics); // gives error when lowering for IgnoreRaces, NoRaces and Atomics + // .parallelize(i1, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces); +} + IndexStmt scheduleMTTKRPPrecomputedCPU(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { IndexVar i1("i1"), i2("i2"), j_pre("j_pre"); return stmt.split(i, i1, i2, CHUNK_SIZE) .parallelize(i1, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces); } +IndexStmt scheduleMTTKRPPrecomputedCPU_ST(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, int 
UNROLL_FACTOR=8) { + IndexVar i1("i1"), i2("i2"), j_pre("j_pre"); + return stmt.split(i, i1, i2, CHUNK_SIZE); +} + IndexStmt scheduleMTTKRP4CPU(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { IndexVar i1("i1"), i2("i2"); return stmt.split(i, i1, i2, CHUNK_SIZE) @@ -162,6 +174,12 @@ IndexStmt scheduleMTTKRP4CPU(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16 .parallelize(i1, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces); } +IndexStmt scheduleMTTKRP4CPU_ST(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { + IndexVar i1("i1"), i2("i2"); + return stmt.split(i, i1, i2, CHUNK_SIZE) + .reorder({i1, i2, k, l, m, j}); +} + IndexStmt scheduleMTTKRP5CPU(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { IndexVar i1("i1"), i2("i2"); return stmt.split(i, i1, i2, CHUNK_SIZE) @@ -805,7 +823,68 @@ TEST(scheduling_eval, sddmmCPU) { IndexStmt stmt = A.getAssignment().concretize(); stmt = scheduleSDDMMCPU(stmt, B); - //printToFile("sddmm_cpu", stmt); + printToFile("sddmm_cpu_ryan2", stmt); + + A.compile(stmt); + A.assemble(); + A.compute(); + + Tensor expected("expected", {NUM_I, NUM_K}, {Dense, Dense}); + expected(i,k) = B(i,k) * C(i,j) * D(j,k); + expected.compile(); + expected.assemble(); + expected.compute(); + ASSERT_TENSOR_EQ(expected, A); +} + +TEST(scheduling_eval, sddmmSPMMFusedCPU) { + if (should_use_CUDA_codegen()) { + return; + } + + int NUM_I = 1021/10; + int NUM_J = 1039/10; + int NUM_K = 1057/10; + float SPARSITY = .3; + Tensor A("A", {NUM_I, NUM_K}, {Dense, Dense}); + Tensor B("B", {NUM_I, NUM_K}, CSR); + Tensor C("C", {NUM_I, NUM_J}, {Dense, Dense}); + Tensor D("D", {NUM_J, NUM_K}, {Dense, Dense}); + + srand(268238); + for (int i = 0; i < NUM_I; i++) { + for (int j = 0; j < NUM_J; j++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + C.insert({i, j}, (double) ((int) (rand_float*3/SPARSITY))); + } + } + + for (int i = 0; i < NUM_I; i++) { + for (int k = 0; k < NUM_K; k++) { + float rand_float = 
(float)rand()/(float)(RAND_MAX); + if (rand_float < SPARSITY) { + B.insert({i, k}, (double) ((int) (rand_float*3/SPARSITY))); + } + } + } + + for (int j = 0; j < NUM_J; j++) { + for (int k = 0; k < NUM_K; k++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + D.insert({j, k}, (double) ((int) (rand_float*3/SPARSITY))); + } + } + + B.pack(); + C.pack(); + D.pack(); + + A(i,k) = B(i,k) * C(i,j) * D(j,k); + + IndexStmt stmt = A.getAssignment().concretize(); + stmt = scheduleSDDMMCPU(stmt, B); + + printToFile("sddmm_cpu_ryan2", stmt); A.compile(stmt); A.assemble(); @@ -819,6 +898,125 @@ TEST(scheduling_eval, sddmmCPU) { ASSERT_TENSOR_EQ(expected, A); } + +TEST(scheduling_eval, sddmmcsrCPU) { + if (should_use_CUDA_codegen()) { + return; + } + int NUM_I = 1021/10; + int NUM_J = 1039/10; + int NUM_K = 1057/10; + float SPARSITY = .3; + Tensor A("A", {NUM_I, NUM_K}, CSR); + Tensor B("B", {NUM_I, NUM_K}, CSR); + Tensor C("C", {NUM_I, NUM_J}, {Dense, Dense}); + Tensor D("D", {NUM_J, NUM_K}, {Dense, Dense}); + + srand(268238); + for (int i = 0; i < NUM_I; i++) { + for (int j = 0; j < NUM_J; j++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + C.insert({i, j}, (double) ((int) (rand_float*3/SPARSITY))); + } + } + + for (int i = 0; i < NUM_I; i++) { + for (int k = 0; k < NUM_K; k++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + if (rand_float < SPARSITY) { + B.insert({i, k}, (double) ((int) (rand_float*3/SPARSITY))); + } + } + } + + for (int j = 0; j < NUM_J; j++) { + for (int k = 0; k < NUM_K; k++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + D.insert({j, k}, (double) ((int) (rand_float*3/SPARSITY))); + } + } + + B.pack(); + C.pack(); + D.pack(); + + A(i,k) = B(i,k) * C(i,j) * D(j,k); + + IndexStmt stmt = A.getAssignment().concretize(); + stmt = scheduleSDDMMCSRCPU(stmt, B); + + printToFile("sddmm_cpu", stmt); + + A.compile(stmt); + A.assemble(); + A.compute(); + + Tensor expected("expected", {NUM_I, NUM_K}, CSR); + expected(i,k) = 
B(i,k) * C(i,j) * D(j,k); + + IndexStmt stmt_ref = expected.getAssignment().concretize(); + printToFile("sddmm_cpu_ref", stmt_ref); + + expected.compile(stmt_ref); + expected.assemble(); + expected.compute(); + ASSERT_TENSOR_EQ(expected, A); +} + + +TEST(scheduling_eval, sddmm2CPU) { + if (should_use_CUDA_codegen()) { + return; + } + int NUM_I = 1021/10; + int NUM_J = 1021/10; + int NUM_K = 18; + float SPARSITY = .3; + Tensor Y("Y", {NUM_I, NUM_J}, {Dense, Compressed(ModeFormat::UNIQUE)}); + Tensor A("A", {NUM_I, NUM_J}, {Dense, Compressed(ModeFormat::UNIQUE)}); + Tensor X("X", {NUM_I, NUM_K}, {Dense, Dense}); + + srand(268238); + + for (int i = 0; i < NUM_I; i++) { + for (int j = 0; j < NUM_J; j++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + if (rand_float < SPARSITY) { + A.insert({i, j}, (double) ((int) (rand_float*3/SPARSITY))); + } + } + } + + for (int i = 0; i < NUM_J; i++) { + for (int k = 0; k < NUM_K; k++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + X.insert({i, k}, (double) ((int) (rand_float*3/SPARSITY))); + } + } + + A.pack(); + X.pack(); + + Y(i,j) = A(i,j) * X(i,k) * X(k,j); + + // IndexStmt stmt = A.getAssignment().concretize(); + // // stmt = scheduleSDDMMCPU(stmt, A); + + // printToFile("sddmm2_cpu", stmt); + + // A.compile(stmt); + // A.assemble(); + // A.compute(); + + // Tensor expected("expected", {NUM_I, NUM_J}, {Dense, Dense}); + // expected(i,j) = A(i,j) * X(i,k) * X(j,k); + // expected.compile(); + // expected.assemble(); + // expected.compute(); + // ASSERT_TENSOR_EQ(expected, A); +} + + TEST(scheduling_eval, spmvCPU) { if (should_use_CUDA_codegen()) { return; @@ -904,7 +1102,7 @@ TEST(scheduling_eval, ttvCPU) { IndexStmt stmt = A.getAssignment().concretize(); stmt = scheduleTTVCPU(stmt, B); - //printToFile("ttv_cpu", stmt); + printToFile("ttv_cpu", stmt); A.compile(stmt); A.assemble(); @@ -918,6 +1116,7 @@ TEST(scheduling_eval, ttvCPU) { ASSERT_TENSOR_EQ(expected, A); } + TEST(scheduling_eval, ttvCPU_CSR) { if 
(should_use_CUDA_codegen()) { return; @@ -928,7 +1127,7 @@ TEST(scheduling_eval, ttvCPU_CSR) { int NUM_K = 1057/10; float SPARSITY = .3; Tensor A("A", {NUM_I, NUM_J}, {Dense, Sparse}); - Tensor B("B", {NUM_I, NUM_J, NUM_K}, {Sparse, Sparse, Sparse}); + Tensor B("B", {NUM_I, NUM_J, NUM_K}, {Dense, Sparse, Sparse}); Tensor c("c", {NUM_K}, Format({Dense})); srand(9536); @@ -956,11 +1155,13 @@ TEST(scheduling_eval, ttvCPU_CSR) { IndexStmt stmt = A.getAssignment().concretize(); stmt = scheduleTTVCPUCSR(stmt); + printToFile("ttv_cpu_csr", stmt); + A.compile(stmt); A.assemble(); A.compute(); - Tensor expected("expected", {NUM_I, NUM_J}, {Dense, Dense}); + Tensor expected("expected", {NUM_I, NUM_J}, {Dense, Sparse}); expected(i,j) = B(i,j,k) * c(k); expected.compile(); expected.assemble(); @@ -968,6 +1169,7 @@ TEST(scheduling_eval, ttvCPU_CSR) { ASSERT_TENSOR_EQ(expected, A); } + TEST(scheduling_eval, ttmCPU) { if (should_use_CUDA_codegen()) { return; @@ -1463,7 +1665,8 @@ TEST(scheduling_eval, mttkrpGPU) { ASSERT_TENSOR_EQ(expected, A); } -TEST(generate_evaluation_files, DISABLED_cpu) { + +TEST(generate_evaluation_files, cpu) { if (should_use_CUDA_codegen()) { return; } @@ -1779,10 +1982,13 @@ TEST(generate_evaluation_files, DISABLED_cpu) { } } -TEST(generate_evaluation_files, DISABLED_gpu) { - if (!should_use_CUDA_codegen()) { - return; - } +TEST(generate_evaluation_files, gpu) { + // if (!should_use_CUDA_codegen()) { + // return; + // } + set_CUDA_codegen_enabled(true); + + std::cout << "executing generate_evaluation_file.gpu\n"; vector> spmv_parameters = {}; // {NNZ_PER_THREAD, BLOCK_SIZE} for (int i = 3; i <= 20; i++) { diff --git a/test/tests-scheduling-fuse.cpp b/test/tests-scheduling-fuse.cpp new file mode 100644 index 000000000..2fbececfe --- /dev/null +++ b/test/tests-scheduling-fuse.cpp @@ -0,0 +1,2780 @@ +#include "taco/cuda.h" +#include "taco/tensor.h" +#include "test.h" +#include "util.h" +#include +#include "gtest/gtest.h" +#include + +#define 
NUM_THREADS_TO_USE 1 +// #define NUM_THREADS_TO_USE 32 + +// TEST(scheduling_eval, spmvFusedWithSyntheticData) { +// if (should_use_CUDA_codegen()) { +// return; +// } +// taco_set_num_threads(NUM_THREADS_TO_USE); + +// std::default_random_engine gen(0); +// std::uniform_real_distribution unif(0.0, 1.0); + +// Format csr({dense, sparse}); +// Format rm({dense}); + +// // uncomment this for reading the csr matrix saved in mtx file +// std::cout << "reading B mat mtx\n"; + +// int NUM_I = 5; // 1021/10; +// int NUM_J = 5; // 1039/10; +// int NUM_K = 8; +// float SPARSITY = .3; +// Tensor B("B", {NUM_I, NUM_J}, csr); +// srand(75883); +// for (int i = 0; i < NUM_I; i++) { +// for (int j = 0; j < NUM_J; j++) { +// float rand_float = (float)rand()/(float)(RAND_MAX); +// if (rand_float < SPARSITY) { +// B.insert({i, j}, (double) ((int) (rand_float*3/SPARSITY))); +// } +// } +// } +// B.pack(); + + +// std::cout << "B dim0: " << B.getDimension(0) << ", dim1: " << B.getDimension(1) << std::endl; +// std::cout << "adding c mat\n"; +// Tensor C("C", {NUM_J, NUM_K}, csr); +// for (int i = 0; i < C.getDimension(0); ++i) { +// for (int j = 0; j < C.getDimension(1); ++j) { +// C.insert({i,j}, unif(gen)); +// } +// } +// std::cout << "packing C mat\n"; +// C.pack(); + +// Tensor v("v", {NUM_K}, rm); +// for (int i = 0; i < v.getDimension(0); ++i) { +// v.insert({i}, unif(gen)); +// } +// std::cout << "packing D mat\n"; +// v.pack(); + +// Tensor A("A", {NUM_I}, rm); +// Tensor ref("ref", {NUM_I}, rm); +// IndexVar i, j, k, l, m; +// A(i) = B(i,j) * C(j,k) * v(k); + +// // IndexStmt stmt = A.getAssignment().concretize(); +// IndexStmt stmt = makeReductionNotation(A.getAssignment()); +// stmt = makeConcreteNotation(stmt); +// printToFile("SpMVfused", stmt); +// stmt = reorderLoopsTopologically(stmt); +// stmt = loopFusionOverFission(stmt, A.getAssignment(), "f", 1); +// stmt = insertTemporaries(stmt); +// stmt = parallelizeOuterLoop(stmt); + +// A.compile(stmt); +// // We can now 
call the functions taco generated to assemble the indices of the +// // output matrix and then actually compute the MTTKRP. +// A.assemble(); + + +// // ref(i) = B(i,j) * C(j,k) * v(k); +// // IndexStmt refStmt = makeReductionNotation(ref.getAssignment()); +// // refStmt = makeConcreteNotation(refStmt); +// // refStmt = insertTemporaries(refStmt); +// // refStmt = parallelizeOuterLoop(refStmt); +// // ref.compile(refStmt); +// // ref.assemble(); + +// // Tensor ref1({NUM_J}, rm); +// // Tensor ref2({NUM_I}, rm); +// // ref1(j) = C(j,k) * v(k); +// // ref2(i) = B(i,j) * ref1(j); + +// // IndexStmt ref1Stmt = makeReductionNotation(ref1.getAssignment()); +// // ref1Stmt = makeConcreteNotation(ref1Stmt); +// // ref1Stmt = insertTemporaries(ref1Stmt); +// // ref1Stmt = parallelizeOuterLoop(ref1Stmt); +// // ref1.compile(ref1Stmt); +// // ref1.assemble(); + +// // IndexStmt ref2Stmt = makeReductionNotation(ref2.getAssignment()); +// // ref2Stmt = makeConcreteNotation(ref2Stmt); +// // ref2Stmt = insertTemporaries(ref2Stmt); +// // ref2Stmt = parallelizeOuterLoop(ref2Stmt); +// // ref2.compile(ref2Stmt); +// // ref2.assemble(); + +// std::cout << "compute start\n"; +// taco::util::TimeResults timevalue; +// bool time = true; +// // TOOL_BENCHMARK_TIMER(ref.compute(), "\n\nReference Kernel: ", timevalue); +// TOOL_BENCHMARK_TIMER(A.compute(), "\n\nFused Kernel: ", timevalue); +// // ASSERT_TENSOR_EQ(ref, A); + +// // // check results +// // for (int q = 0; q < A.getDimension(0); ++q) { +// // if ( abs(A(q) - ref(q))/abs(ref(q)) > ERROR_MARGIN) { +// // std::cout << "error: results don't match A("<< q << "): " +// // << A(q) << ", ref: " << ref(q) << std::endl; +// // ASSERT_TRUE(false); +// // } +// // } +// // // ASSERT_TENSOR_EQ(A, ref); +// // TOOL_BENCHMARK_TIMER(ref1.compute(), "\n\nSDDMM Kernel: ", timevalue); +// // TOOL_BENCHMARK_TIMER(ref2.compute(), "\n\nSpMM Kernel: ", timevalue); +// // ASSERT_TENSOR_EQ(ref, ref2); + +// // for (int q = 0; q < 
ref2.getDimension(0); ++q) { +// // for (int w = 0; w < ref2.getDimension(1); ++w) { +// // if ( abs(ref2(q,w) - ref(q,w))/abs(ref(q,w)) > ERROR_MARGIN) { +// // std::cout << "error: results don't match A("<< q << "," << w << "): " +// // << ref2(q,w) << ", ref: " << ref(q,w) << std::endl; +// // ASSERT_TRUE(false); +// // } +// // } +// // } + +// } + +// TEST(scheduling_eval, spmvFused) { +// if (should_use_CUDA_codegen()) { +// return; +// } + +// ofstream statfile; +// statfile.open( +// "/home/min/a/kadhitha/workspace/my_taco/taco/test/stats/spmv-spmv.txt", std::ios::app); +// if (statfile.is_open()) { +// statfile << "\nspmv-spmv execution\n"; +// statfile << "\n-----------------------------------------\n"; +// } +// taco_set_num_threads(NUM_THREADS_TO_USE); + +// std::default_random_engine gen(0); +// std::uniform_real_distribution unif(0.0, 1.0); + +// Format csr({dense, sparse}); +// Format rm({dense}); + + + +// int filenum = 1; + +// std::vector matfiles = { +// "/home/min/a/kadhitha/ispc-examples/data/suitesparse/synthetic/synthetic.mtx", +// "/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx", +// "/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx", +// "/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx", +// "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rma10/rma10.mtx", +// "/home/min/a/kadhitha/ispc-examples/data/suitesparse/cant/cant.mtx", // 5 +// "/home/min/a/kadhitha/ispc-examples/data/suitesparse/consph/consph.mtx", +// "/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx", +// "/home/min/a/kadhitha/ispc-examples/data/suitesparse/shipsec1/shipsec1.mtx", // 8 +// "/home/min/a/kadhitha/ispc-examples/data/suitesparse/scircuit/scircuit.mtx", +// "/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx", // 10 +// "/home/min/a/kadhitha/ispc-examples/data/suitesparse/wtk/pwtk.mtx", +// 
"/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx", // 12 +// "/home/min/a/kadhitha/ispc-examples/data/suitesparse/wiki-Talk/wiki-Talk.mtx", // 13 +// "/home/min/a/kadhitha/ispc-examples/data/suitesparse/com-Orkut/com-Orkut.mtx", +// "/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx", // 15 +// "/home/min/a/kadhitha/workspace/my_taco/FusedMM/dataset/harvard.mtx", +// "/home/min/a/kadhitha/ispc-examples/data/suitesparse/twitter7/twitter7.mtx" +// }; +// std::vector matfilesrw = { +// "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/synthetic.mtx", +// "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/cage3.mtx", +// "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/bcsstk17.mtx", +// "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/pdb1HYS.mtx", +// "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/rma10.mtx", +// "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/cant.mtx", +// "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/consph.mtx", +// "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/cop20k_A.mtx", +// "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/shipsec1.mtx", +// "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/scircuit.mtx", +// "/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx", +// "/home/min/a/kadhitha/ispc-examples/data/suitesparse/wtk/pwtk.mtx", +// "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/webbase-1M.mtx", +// "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/wiki-Talk.mtx", +// "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/com-Orkut.mtx", +// "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/circuit5M.mtx", +// "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/harvard.mtx", +// "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/twitter7.mtx" +// }; + +// // uncomment this for reading the csr matrix saved in mtx file +// std::cout << "reading B mat 
mtx\n"; + + +// int kDim = 8; +// float SPARSITY = .3; +// std::string matfile = matfiles[filenum]; +// std::cout << "reading B mat mtx\n"; +// Tensor B = read(matfile, csr, true); +// B.setName("B"); +// B.pack(); + +// std::cout << "B dim0: " << B.getDimension(0) << ", dim1: " << B.getDimension(1) << std::endl; +// std::cout << "adding c mat\n"; + +// std::cout << "reading B mat mtx\n"; +// Tensor C = read(matfile, csr, true); +// C.setName("C"); +// C.pack(); + + +// Tensor v("v", {C.getDimension(1)}, rm); +// for (int i = 0; i < v.getDimension(0); ++i) { +// v.insert({i}, unif(gen)); +// } +// std::cout << "packing D mat\n"; +// v.pack(); + +// if (statfile.is_open()) { +// statfile +// << "A(i) = B(i,j) * C(j,k) * v(k);" << std::endl +// << "B1_dimension: " << B.getDimension(0) << ", B2_dimension: " << B.getDimension(1) << ", vals: " << B.getStorage().getValues().getSize() << std::endl +// << "C1_dimension: " << C.getDimension(0) << ", C2_dimension: " << C.getDimension(1) << ", vals: " << C.getStorage().getValues().getSize() << std::endl +// << "D1_dimension: " << v.getDimension(0) << ", vals: " << v.getStorage().getValues().getSize() << std::endl +// << std::endl; +// } + +// Tensor A("A", {B.getDimension(0)}, rm); +// Tensor ref("ref", {B.getDimension(0)}, rm); +// IndexVar i, j, k, l, m; +// A(i) = B(i,j) * C(j,k) * v(k); + +// ref(i) = B(i,j) * C(j,k) * v(k); +// IndexStmt refStmt = makeReductionNotation(ref.getAssignment()); +// refStmt = makeConcreteNotation(refStmt); +// refStmt = insertTemporaries(refStmt); +// refStmt = parallelizeOuterLoop(refStmt); +// ref.compile(refStmt); +// ref.assemble(); + +// // IndexStmt stmt = A.getAssignment().concretize(); +// IndexStmt stmt = makeReductionNotation(A.getAssignment()); +// stmt = makeConcreteNotation(stmt); +// printToFile("SpMVfused", stmt); +// stmt = reorderLoopsTopologically(stmt); +// stmt = loopFusionOverFission(stmt, A.getAssignment(), "f", 1); +// stmt = insertTemporaries(stmt); +// stmt = 
parallelizeOuterLoop(stmt); +// A.compile(stmt); +// A.assemble(); + + +// // Tensor ref1({NUM_J}, rm); +// // Tensor ref2({NUM_I}, rm); +// // ref1(j) = C(j,k) * v(k); +// // ref2(i) = B(i,j) * ref1(j); + +// // IndexStmt ref1Stmt = makeReductionNotation(ref1.getAssignment()); +// // ref1Stmt = makeConcreteNotation(ref1Stmt); +// // ref1Stmt = insertTemporaries(ref1Stmt); +// // ref1Stmt = parallelizeOuterLoop(ref1Stmt); +// // ref1.compile(ref1Stmt); +// // ref1.assemble(); + +// // IndexStmt ref2Stmt = makeReductionNotation(ref2.getAssignment()); +// // ref2Stmt = makeConcreteNotation(ref2Stmt); +// // ref2Stmt = insertTemporaries(ref2Stmt); +// // ref2Stmt = parallelizeOuterLoop(ref2Stmt); +// // ref2.compile(ref2Stmt); +// // ref2.assemble(); + +// std::cout << "compute start\n"; +// taco::util::TimeResults timevalue; +// bool time = true; +// std::string sofused = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/spmv_spmv/spmv_fused.so"; + +// TOOL_BENCHMARK_TIMER(ref.compute(statfile, sofused), "\n\nReference Kernel: ", timevalue); + + +// std::cout << "b1 dim: " << B.getTacoTensorT()->dimensions[1] << std::endl; +// // TOOL_BENCHMARK_TIMER(ref.compute(statfile, sofused), "\n\nFused Kernel: ", timevalue); +// // ASSERT_TENSOR_EQ(ref, A); + +// // // check results +// // for (int q = 0; q < A.getDimension(0); ++q) { +// // if ( abs(A(q) - ref(q))/abs(ref(q)) > ERROR_MARGIN) { +// // std::cout << "error: results don't match A("<< q << "): " +// // << A(q) << ", ref: " << ref(q) << std::endl; +// // ASSERT_TRUE(false); +// // } +// // } +// // // ASSERT_TENSOR_EQ(A, ref); +// // TOOL_BENCHMARK_TIMER(ref1.compute(), "\n\nSDDMM Kernel: ", timevalue); +// // TOOL_BENCHMARK_TIMER(ref2.compute(), "\n\nSpMM Kernel: ", timevalue); +// // ASSERT_TENSOR_EQ(ref, ref2); + +// // for (int q = 0; q < ref2.getDimension(0); ++q) { +// // for (int w = 0; w < ref2.getDimension(1); ++w) { +// // if ( abs(ref2(q,w) - ref(q,w))/abs(ref(q,w)) > ERROR_MARGIN) { +// // 
std::cout << "error: results don't match A("<< q << "," << w << "): " +// // << ref2(q,w) << ", ref: " << ref(q,w) << std::endl; +// // ASSERT_TRUE(false); +// // } +// // } +// // } + +// if (statfile.is_open()) { +// statfile.close(); +// } + +// } + +TEST(scheduling_eval, sddmmFusedWithSyntheticData) { + if (should_use_CUDA_codegen()) { + return; + } + + taco_set_num_threads(NUM_THREADS_TO_USE); + + std::default_random_engine gen(0); + std::uniform_real_distribution unif(0.0, 1.0); + + Format csr({dense, sparse}); + Format rm({dense, dense}); + int ldim = 4; + int kdim = 8; + + // uncomment this for reading the csr matrix saved in mtx file + std::cout << "reading B mat mtx\n"; + + int NUM_I = 1021/10; + int NUM_J = 1039/10; + float SPARSITY = .3; + Tensor B("B", {NUM_I, NUM_J}, csr); + srand(75883); + for (int i = 0; i < NUM_I; i++) { + for (int j = 0; j < NUM_J; j++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + if (rand_float < SPARSITY) { + B.insert({i, j}, (double) ((int) (rand_float*3/SPARSITY))); + } + } + } + B.pack(); + write("/home/min/a/kadhitha/ispc-examples/data/suitesparse/synthetic/synthetic.mtx", B); + + std::cout << "B dim0: " << B.getDimension(0) << ", dim1: " << B.getDimension(1) << std::endl; + std::cout << "adding c mat\n"; + Tensor C({B.getDimension(0), kdim}, rm); + for (int i = 0; i < C.getDimension(0); ++i) { + for (int j = 0; j < C.getDimension(1); ++j) { + C.insert({i,j}, unif(gen)); + } + } + std::cout << "packing C mat\n"; + C.pack(); + + Tensor D({B.getDimension(1), kdim}, rm); + for (int i = 0; i < D.getDimension(0); ++i) { + for (int j = 0; j < D.getDimension(1); ++j) { + D.insert({i,j}, unif(gen)); + } + } + std::cout << "packing D mat\n"; + D.pack(); + + Tensor F({B.getDimension(1), ldim}, rm); + for (int i = 0; i < F.getDimension(0); ++i) { + for (int j = 0; j < F.getDimension(1); ++j) { + F.insert({i,j}, unif(gen)); + } + } + std::cout << "packing F mat\n"; + F.pack(); + + Tensor A({B.getDimension(0), ldim}, rm); + 
Tensor ref({B.getDimension(0), ldim}, rm); + IndexVar i, j, k, l; + A(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); + + // IndexStmt stmt = A.getAssignment().concretize(); + IndexStmt stmt = makeReductionNotation(A.getAssignment()); + stmt = makeConcreteNotation(stmt); + printToFile("fusedMMConcrete", stmt); + + stmt = reorderLoopsTopologically(stmt); + printToFile("fusedMMOrdered", stmt); + + stmt = loopFusionOverFission(stmt, A.getAssignment(), "b", 1); + printToFile("fusedMMFused", stmt); + + stmt = insertTemporaries(stmt); + printToFile("fusedMMWithTemps", stmt); + stmt = parallelizeOuterLoop(stmt); + printToFile("fusedMMFusedPar", stmt); + + A.compile(stmt); + // We can now call the functions taco generated to assemble the indices of the + // output matrix and then actually compute the MTTKRP. + A.assemble(); + + + ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); + IndexStmt refStmt = makeReductionNotation(ref.getAssignment()); + refStmt = makeConcreteNotation(refStmt); + refStmt = insertTemporaries(refStmt); + refStmt = parallelizeOuterLoop(refStmt); + ref.compile(refStmt); + ref.assemble(); + + Tensor ref1({B.getDimension(0), B.getDimension(1)}, csr); + Tensor ref2({B.getDimension(0), ldim}, rm); + ref1(i,j)=B(i,j)*C(i,k)*D(j,k); + ref2(i,l)=ref1(i,j)*F(j,l); + + IndexStmt ref1Stmt = makeReductionNotation(ref1.getAssignment()); + ref1Stmt = makeConcreteNotation(ref1Stmt); + ref1Stmt = insertTemporaries(ref1Stmt); + ref1Stmt = parallelizeOuterLoop(ref1Stmt); + ref1.compile(ref1Stmt); + ref1.assemble(); + + IndexStmt ref2Stmt = makeReductionNotation(ref2.getAssignment()); + ref2Stmt = makeConcreteNotation(ref2Stmt); + ref2Stmt = insertTemporaries(ref2Stmt); + ref2Stmt = parallelizeOuterLoop(ref2Stmt); + ref2.compile(ref2Stmt); + ref2.assemble(); + + std::cout << "compute start\n"; + taco::util::TimeResults timevalue; + bool time = true; + TOOL_BENCHMARK_TIMER(ref.compute(), "\n\nReference Kernel: ", timevalue); + TOOL_BENCHMARK_TIMER(A.compute(), "\n\nFused Kernel: ", timevalue); + 
+ // check results + for (int q = 0; q < A.getDimension(0); ++q) { + for (int w = 0; w < A.getDimension(1); ++w) { + if ( abs(A(q,w) - ref(q,w))/abs(ref(q,w)) > ERROR_MARGIN) { + std::cout << "error: results don't match A("<< q << "," << w << "): " + << A(q,w) << ", ref: " << ref(q,w) << std::endl; + ASSERT_TRUE(false); + } + } + } + // ASSERT_TENSOR_EQ(A, ref); + TOOL_BENCHMARK_TIMER(ref1.compute(), "\n\nSDDMM Kernel: ", timevalue); + TOOL_BENCHMARK_TIMER(ref2.compute(), "\n\nSpMM Kernel: ", timevalue); + + for (int q = 0; q < ref2.getDimension(0); ++q) { + for (int w = 0; w < ref2.getDimension(1); ++w) { + if ( abs(ref2(q,w) - ref(q,w))/abs(ref(q,w)) > ERROR_MARGIN) { + std::cout << "error: results don't match A("<< q << "," << w << "): " + << ref2(q,w) << ", ref: " << ref(q,w) << std::endl; + ASSERT_TRUE(false); + } + } + } + +} + + +IndexStmt scheduleSDDMMCPU_forfuse(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { + IndexVar i, j, k, l, m; + IndexVar i0("i0"), i1("i1"), kpos("kpos"), kpos0("kpos0"), kpos1("kpos1"); + return stmt.split(i, i0, i1, CHUNK_SIZE) + .pos(k, kpos, B(i,k)) + .split(kpos, kpos0, kpos1, UNROLL_FACTOR) + .reorder({i0, i1, kpos0, j, kpos1}) + .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces) + .parallelize(kpos1, ParallelUnit::CPUVector, OutputRaceStrategy::ParallelReduction); +} + +TEST(scheduling_eval, sddmmFused) { + if (should_use_CUDA_codegen()) { + return; + } + + taco_set_num_threads(NUM_THREADS_TO_USE); + + ofstream statfile; + statfile.open( + "/home/min/a/kadhitha/workspace/my_taco/taco/test/stats/sddmm-spmm.txt", std::ios::app); + if (statfile.is_open()) { + statfile << "\nsddmm-spmm execution\n"; + statfile << "\n-----------------------------------------\n"; + } + + std::default_random_engine gen(0); + std::uniform_real_distribution unif(0.0, 1.0); + + Format csr({dense, sparse}); + Format rm({dense, dense}); + int ldim = 128; + int kdim = 128; + + // vector filenums = 
{2,3,4,5,6,7,8,9,10,12,15}; + + vector filenums = {0}; + + for (auto filenum : filenums) { + + // int filenum = 5; + + std::vector matfiles = { + "/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx", + "/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/amazon.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/synthetic/synthetic.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rma10/rma10.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/cant/cant.mtx", // 5 + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/consph/consph.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/shipsec1/shipsec1.mtx", // 8 + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/scircuit/scircuit.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx", // 10 + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/wtk/pwtk.mtx", + "/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx", // 12 + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/wiki-Talk/wiki-Talk.mtx", // 13 + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/com-Orkut/com-Orkut.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx", // 15 + "/home/min/a/kadhitha/workspace/my_taco/FusedMM/dataset/harvard.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/twitter7/twitter7.mtx" + }; + std::vector matfilesrw = { + "/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/rw/cora.mtx", + "/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/rw/amazon.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/synthetic.mtx", + 
"/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/cage3.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/bcsstk17.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/pdb1HYS.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/rma10.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/cant.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/consph.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/cop20k_A.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/shipsec1.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/scircuit.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/wtk/pwtk.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/webbase-1M.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/wiki-Talk.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/com-Orkut.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/circuit5M.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/harvard.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/twitter7.mtx" + }; + + std::string matfile = matfiles[filenum]; + std::cout << "reading B mat mtx\n"; + Tensor B = read(matfile, csr, true); + B.setName("B"); + B.pack(); + // write(matfilesrw[filenum], B); + + if (statfile.is_open()) { + statfile << matfile << std::endl; + } + + std::cout << "B dim0: " << B.getDimension(0) << ", dim1: " << B.getDimension(1) << std::endl; + std::cout << "adding c mat\n"; + Tensor C({B.getDimension(0), kdim}, rm); + for (int i = 0; i < C.getDimension(0); ++i) { + for (int j = 0; j < C.getDimension(1); ++j) { + C.insert({i,j}, unif(gen)); + } + } + std::cout << "packing C mat\n"; + C.pack(); + + Tensor D({B.getDimension(1), kdim}, rm); + for (int i = 0; i < D.getDimension(0); ++i) { + for (int j = 0; j < D.getDimension(1); 
++j) { + D.insert({i,j}, unif(gen)); + } + } + std::cout << "packing D mat\n"; + D.pack(); + + Tensor F({B.getDimension(1), ldim}, rm); + for (int i = 0; i < F.getDimension(0); ++i) { + for (int j = 0; j < F.getDimension(1); ++j) { + F.insert({i,j}, unif(gen)); + } + } + std::cout << "packing F mat\n"; + F.pack(); + + Tensor A({B.getDimension(0), ldim}, rm); + Tensor ref({B.getDimension(0), ldim}, rm); + IndexVar i, j, k, l, m; + IndexVar i0("i0"), i1("i1"), jpos("jpos"), jpos0("jpos0"), jpos1("jpos1"), k0("k0"), k1("k1"); + A(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); + if (statfile.is_open()) { + statfile + << "ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);" << std::endl + << "B1_dimension: " << B.getDimension(0) << ", B2_dimension: " << B.getDimension(1) << ", vals: " << B.getStorage().getValues().getSize() << std::endl + << "C1_dimension: " << C.getDimension(0) << ", C2_dimension: " << C.getDimension(1) << ", vals: " << C.getStorage().getValues().getSize() << std::endl + << "D1_dimension: " << D.getDimension(0) << ", D2_dimension: " << D.getDimension(1) << ", vals: " << D.getStorage().getValues().getSize() << std::endl + << "E1_dimension: " << F.getDimension(0) << ", E2_dimension: " << F.getDimension(1) << ", vals: " << F.getStorage().getValues().getSize() << std::endl + << std::endl; + } + + // IndexStmt stmt = A.getAssignment().concretize(); + IndexStmt stmt = makeReductionNotation(A.getAssignment()); + stmt = makeConcreteNotation(stmt); + stmt = reorderLoopsTopologically(stmt); + stmt = loopFusionOverFission(stmt, A.getAssignment(), "b", 1); + stmt = stmt + .split(i, i0, i1, 16); + stmt = insertTemporaries(stmt); + stmt = parallelizeOuterLoop(stmt); + + A.compile(stmt); + A.assemble(); + + + ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); + IndexStmt refStmt = makeReductionNotation(ref.getAssignment()); + refStmt = makeConcreteNotation(refStmt); + refStmt = insertTemporaries(refStmt); + refStmt = refStmt + .split(i, i0, i1, 16) + .reorder({i0, i1, j, k, l}); + stmt = 
insertTemporaries(stmt); + refStmt = parallelizeOuterLoop(refStmt); + ref.compile(refStmt); + ref.assemble(); + + Tensor ref1({B.getDimension(0), B.getDimension(1)}, csr); + Tensor ref2({B.getDimension(0), ldim}, rm); + ref1(i,j)=B(i,j)*C(i,k)*D(j,k); + ref2(i,l)=ref1(i,j)*F(j,l); + + IndexStmt ref1Stmt = ref1.getAssignment().concretize(); // anyway Ryan's kernel is used here + + ref1Stmt = ref1Stmt.split(i, i0, i1, 16); + // .pos(j, jpos, B(i,j)); + // .split(k, k0, k1, 8); + // .reorder({i0, i1, jpos0, k, jpos1}); + // .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces) + // .parallelize(jpos1, ParallelUnit::CPUVector, OutputRaceStrategy::ParallelReduction); + // ref1Stmt.split(i, ); + // stmt = scheduleSDDMMCPU_forfuse(ref1Stmt, B); + // IndexStmt ref1Stmt = makeReductionNotation(ref1.getAssignment()); + // ref1Stmt = makeConcreteNotation(ref1Stmt); + ref1Stmt = insertTemporaries(ref1Stmt); + ref1Stmt = parallelizeOuterLoop(ref1Stmt); + ref1.compile(ref1Stmt); + ref1.assemble(); + + IndexStmt ref2Stmt = makeReductionNotation(ref2.getAssignment()); // Ryan's SpMM kernel is used here + ref2Stmt = makeConcreteNotation(ref2Stmt); + ref2Stmt = insertTemporaries(ref2Stmt); + ref2Stmt = parallelizeOuterLoop(ref2Stmt); + ref2.compile(ref2Stmt); + ref2.assemble(); + + std::cout << "compute start\n"; + taco::util::TimeResults timevalue; + bool time = true; + + std::string sofile_fused = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/fused_kernel.so"; + TOOL_BENCHMARK_TIMER(A.compute(statfile), "\n\nFused Kernel: ", timevalue); + if (statfile.is_open()) { + statfile << "fused time: "; + statfile << timevalue.mean << std::endl; + } else { std::cout << " stat file is not open\n"; } + + statfile << "\nseparate execution\n"; + + // std::string sofile_sddmm = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/csr_dense_spmm.so"; + std::string sofile_sddmm = 
"/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/csr_dense_dense_sddmm.so"; + TOOL_BENCHMARK_TIMER(ref1.compute(statfile, sofile_sddmm), "\n\nSDDMM Kernel: ", timevalue); + if (statfile.is_open()) { + statfile << "sddmm time: "; + statfile << timevalue.mean << std::endl; + } else { std::cout << " stat file is not open\n"; } + + std::string sofile_sddmm_ryan = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/sddmm_ryan.so"; + TOOL_BENCHMARK_TIMER(ref1.compute(statfile, sofile_sddmm_ryan), "\n\nSDDMM Kernel: ", timevalue); + if (statfile.is_open()) { + statfile << "sddmm time: "; + statfile << timevalue.mean << std::endl; + } else { std::cout << " stat file is not open\n"; } + + std::string sofile_spmm = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/csr_dense_spmm.so"; + TOOL_BENCHMARK_TIMER(ref2.compute(statfile, sofile_spmm), "\n\nSpMM Kernel: ", timevalue); + if (statfile.is_open()) { + statfile << "spmm time: "; + statfile << timevalue.mean << std::endl; + } else { std::cout << " stat file is not open\n"; } + + statfile << "\nreference execution \n"; + + std::string sofile_original = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/taco_original.so"; + TOOL_BENCHMARK_TIMER(ref.compute(statfile, sofile_original), "\n\nReference Kernel: ", timevalue); + if (statfile.is_open()) { + statfile << "taco reference time: "; + statfile << timevalue << std::endl; + } else { std::cout << " stat file is not open\n"; } + + double* A_vals = (double*) (A.getTacoTensorT()->vals); + double* ref_vals = (double*) (ref.getTacoTensorT()->vals); + double* ref2_vals = (double*) (ref2.getTacoTensorT()->vals); + + // int* A2_pos = (double*) (ref.getTacoTensorT()->vals); + + // for (size_t q=0; q < B.getStorage().getValues().getSize(); q++) { + // if ( abs(A_vals[q] - ref_vals[q])/abs(ref_vals[q]) > ERROR_MARGIN) { + // std::cout << "error: results don't match i: " << q << ", avals: " << A_vals[q] << " " + 
// << "refvals: " << ref_vals[q] << std::endl; + // ASSERT_TRUE(false); + // } + // } + + for (size_t q=0; q < A.getDimension(0)* A.getDimension(1); q++) { + if ( abs(A_vals[q] - ref_vals[q])/abs(ref_vals[q]) > ERROR_MARGIN) { + std::cout << "error: results don't match i: " << q << ", avals: " << A_vals[q] << " " + << "refvals: " << ref_vals[q] << std::endl; + ASSERT_TRUE(false); + } + } + for (size_t q=0; q < A.getDimension(0)* A.getDimension(1); q++) { + if ( abs(A_vals[q] - ref2_vals[q])/abs(ref2_vals[q]) > ERROR_MARGIN) { + std::cout << "error: results don't match i: " << q << ", avals: " << A_vals[q] << " " + << "refvals: " << ref2_vals[q] << std::endl; + ASSERT_TRUE(false); + } + } + // // for (int q= 0; q< A_vals + // for (int q = 0; q < A.getDimension(0); ++q) { + // for (int w = 0; w < A.getDimension(1); ++w) { + // if ( abs(A(q,w) - ref(q,w))/abs(ref(q,w)) > ERROR_MARGIN) { + // std::cout << "error: results don't match A("<< q << "," << w << "): " + // << A(q,w) << ", ref: " << ref(q,w) << std::endl; + // ASSERT_TRUE(false); + // } + // } + // } + // ASSERT_TENSOR_EQ(A, ref); + + } // end of for loop + + + if (statfile.is_open()) { + statfile.close(); + } +} + + + + +TEST(scheduling_eval, hadamardFused) { + if (should_use_CUDA_codegen()) { + return; + } + + taco_set_num_threads(NUM_THREADS_TO_USE); + + ofstream statfile; + statfile.open( + "/home/min/a/kadhitha/workspace/my_taco/taco/test/stats/hadamard-gemm.txt", std::ios::app); + if (statfile.is_open()) { + statfile << "\nsddmm-spmm execution\n"; + statfile << "\n-----------------------------------------\n"; + } + + std::default_random_engine gen(0); + std::uniform_real_distribution unif(0.0, 1.0); + + Format csr({dense, sparse}); + Format rm({dense, dense}); + int kdim = 128; + int ldim = 128; + + // vector filenums = {2,3,4,5,6,7,8,9,10,12,15}; + vector filenums = {0}; + + for (auto filenum : filenums) { + + // int filenum = 15; + + std::vector matfiles = { + 
"/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx", + "/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/amazon.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/synthetic/synthetic.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx", // 2 + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rma10/rma10.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/cant/cant.mtx", // 5 + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/consph/consph.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/shipsec1/shipsec1.mtx", // 8 + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/scircuit/scircuit.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx", // 10 + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/wtk/pwtk.mtx", + "/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx", // 12 + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/wiki-Talk/wiki-Talk.mtx", // 13 + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/com-Orkut/com-Orkut.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx", // 15 + "/home/min/a/kadhitha/workspace/my_taco/FusedMM/dataset/harvard.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/twitter7/twitter7.mtx" + }; + std::vector matfilesrw = { + "/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/rw/cora.mtx", + "/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/rw/amazon.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/synthetic.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/cage3.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/bcsstk17.mtx", + 
"/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/pdb1HYS.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/rma10.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/cant.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/consph.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/cop20k_A.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/shipsec1.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/scircuit.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/wtk/pwtk.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/webbase-1M.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/wiki-Talk.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/com-Orkut.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/circuit5M.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/harvard.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/twitter7.mtx" + }; + + std::string matfile = matfiles[filenum]; + std::cout << "reading B mat mtx\n"; + Tensor B = read(matfile, csr, true); + B.setName("B"); + B.pack(); + // write(matfilesrw[filenum], B); + + if (statfile.is_open()) { + statfile << matfile << std::endl; + } + + std::cout << "B dim0: " << B.getDimension(0) << ", dim1: " << B.getDimension(1) << std::endl; + std::cout << "adding c mat\n"; + Tensor C({B.getDimension(1), kdim}, rm); + for (int i = 0; i < C.getDimension(0); ++i) { + for (int j = 0; j < C.getDimension(1); ++j) { + C.insert({i,j}, unif(gen)); + } + } + std::cout << "packing C mat\n"; + C.pack(); + + Tensor D({B.getDimension(1), kdim}, rm); + for (int i = 0; i < D.getDimension(0); ++i) { + for (int j = 0; j < D.getDimension(1); ++j) { + D.insert({i,j}, unif(gen)); + } + } + std::cout << "packing D mat\n"; + D.pack(); + + Tensor F({kdim, ldim}, rm); + for (int i = 0; i < 
F.getDimension(0); ++i) { + for (int j = 0; j < F.getDimension(1); ++j) { + F.insert({i,j}, unif(gen)); + } + } + std::cout << "packing F mat\n"; + F.pack(); + + Tensor A({B.getDimension(0), ldim}, rm); + Tensor ref({B.getDimension(0), ldim}, rm); + IndexVar i, j, k, l, m; + IndexVar i0("i0"), i1("i1"), l0("l0"), l1("l1"), jpos("jpos"), jpos0("jpos0"), jpos1("jpos1"), k0("k0"), k1("k1"); + A(i,l)=B(i,j)*C(j,k)*D(j,k)*F(k,l); + if (statfile.is_open()) { + statfile + << "ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);" << std::endl + << "B1_dimension: " << B.getDimension(0) << ", B2_dimension: " << B.getDimension(1) << ", vals: " << B.getStorage().getValues().getSize() << std::endl + << "C1_dimension: " << C.getDimension(0) << ", C2_dimension: " << C.getDimension(1) << ", vals: " << C.getStorage().getValues().getSize() << std::endl + << "D1_dimension: " << D.getDimension(0) << ", D2_dimension: " << D.getDimension(1) << ", vals: " << D.getStorage().getValues().getSize() << std::endl + << "E1_dimension: " << F.getDimension(0) << ", E2_dimension: " << F.getDimension(1) << ", vals: " << F.getStorage().getValues().getSize() << std::endl + << std::endl; + } + + // IndexStmt stmt = A.getAssignment().concretize(); + IndexStmt stmt = makeReductionNotation(A.getAssignment()); + stmt = makeConcreteNotation(stmt); + stmt = reorderLoopsTopologically(stmt); + stmt = stmt.reorder({i, j, k, l}); + stmt = loopFusionOverFission(stmt, A.getAssignment(), "b", 1); + stmt = stmt + .split(i, i0, i1, 16); + stmt = insertTemporaries(stmt); + stmt = parallelizeOuterLoop(stmt); + printToFile("fusedMMFusedPar", stmt); + + A.compile(stmt); + A.assemble(); + + + ref(i,l)=B(i,j)*C(j,k)*D(j,k)*F(k,l); + IndexStmt refStmt = makeReductionNotation(ref.getAssignment()); + refStmt = makeConcreteNotation(refStmt); + refStmt = refStmt + .split(i, i0, i1, 16) + .reorder({i0, i1, j, k, l}); + refStmt = insertTemporaries(refStmt); + refStmt = parallelizeOuterLoop(refStmt); + ref.compile(refStmt); + ref.assemble(); + + 
Tensor ref1({B.getDimension(0), kdim}, rm); + Tensor ref2({B.getDimension(0), ldim}, rm); + ref1(i,k)=B(i,j)*C(j,k)*D(j,k); + ref2(i,l)=ref1(i,k)*F(k,l); + + // IndexStmt ref1Stmt = ref1.getAssignment().concretize(); + + // ref1Stmt = ref1Stmt.split(i, i0, i1, 16); + // // .pos(j, jpos, B(i,j)); + // // .split(k, k0, k1, 8); + // // .reorder({i0, i1, jpos0, k, jpos1}); + // // .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces) + // // .parallelize(jpos1, ParallelUnit::CPUVector, OutputRaceStrategy::ParallelReduction); + // // ref1Stmt.split(i, ); + // // stmt = scheduleSDDMMCPU_forfuse(ref1Stmt, B); + IndexStmt ref1Stmt = makeReductionNotation(ref1.getAssignment()); + ref1Stmt = makeConcreteNotation(ref1Stmt); + ref1Stmt = ref1Stmt + .split(i, i0, i1, 16) + .reorder({i0, i1, j, k}); + // .pos(j, jpos, B(i,j)) + // .split(jpos, jpos0, jpos1, 32) + // .split(k, k0, k1, 32) + // .reorder({i0, i1, jpos0, k0, jpos1, k1}); + ref1Stmt = insertTemporaries(ref1Stmt); + ref1Stmt = parallelizeOuterLoop(ref1Stmt); + ref1.compile(ref1Stmt); + ref1.assemble(); + + IndexStmt ref2Stmt = makeReductionNotation(ref2.getAssignment()); + ref2Stmt = makeConcreteNotation(ref2Stmt); + ref2Stmt = ref2Stmt + .split(i, i0, i1, 32) + .split(k, k0, k1, 32) + .split(l, l0, l1, 32) + .reorder({i0, k0, l0, i1, k1, l1}); + ref2Stmt = insertTemporaries(ref2Stmt); + ref2Stmt = parallelizeOuterLoop(ref2Stmt); + ref2.compile(ref2Stmt); + ref2.assemble(); + + std::cout << "compute start\n"; + taco::util::TimeResults timevalue; + bool time = true; + + TOOL_BENCHMARK_TIMER(A.compute(statfile), "\n\nFused Kernel: ", timevalue); + if (statfile.is_open()) { + statfile << "fused time: "; + statfile << timevalue.mean << std::endl; + } else { std::cout << " stat file is not open\n"; } + + // // std::string sofile_sddmm = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/csr_dense_spmm.so"; + // std::string sofile_sddmm = 
"/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/csr_dense_dense_sddmm.so"; + TOOL_BENCHMARK_TIMER(ref1.compute(statfile), "\n\nHadamard Kernel: ", timevalue); + if (statfile.is_open()) { + statfile << "hadamard time: "; + statfile << timevalue.mean << std::endl; + } else { std::cout << " stat file is not open\n"; } + + // std::string sofile_sddmm_ryan = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/sddmm_ryan.so"; + // TOOL_BENCHMARK_TIMER(ref1.compute(statfile, sofile_sddmm_ryan), "\n\nSDDMM Kernel: ", timevalue); + // if (statfile.is_open()) { + // statfile << "sddmm time: "; + // statfile << timevalue.mean << std::endl; + // } else { std::cout << " stat file is not open\n"; } + + // std::string sofile_spmm = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/csr_dense_spmm.so"; + TOOL_BENCHMARK_TIMER(ref2.compute(statfile), "\n\nGeMM Kernel: ", timevalue); + if (statfile.is_open()) { + statfile << "gemm time: "; + statfile << timevalue.mean << std::endl; + } else { std::cout << " stat file is not open\n"; } + + // std::string sofile_original = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/taco_original.so"; + TOOL_BENCHMARK_TIMER(ref.compute(statfile), "\n\nReference Kernel: ", timevalue); + if (statfile.is_open()) { + statfile << "taco reference time: "; + statfile << timevalue << std::endl; + } else { std::cout << " stat file is not open\n"; } + + double* A_vals = (double*) (A.getTacoTensorT()->vals); + double* ref_vals = (double*) (ref.getTacoTensorT()->vals); + double* ref2_vals = (double*) (ref2.getTacoTensorT()->vals); + + // // int* A2_pos = (double*) (ref.getTacoTensorT()->vals); + + for (int q=0; q < A.getDimension(0)* A.getDimension(1); q++) { + if ( abs(A_vals[q] - ref_vals[q])/abs(ref_vals[q]) > ERROR_MARGIN) { + std::cout << "error: results don't match i: " << q << ", avals: " << A_vals[q] << " " + << "refvals: " << ref_vals[q] << std::endl; + ASSERT_TRUE(false); + 
} + } + + for (int q=0; q < A.getDimension(0)* A.getDimension(1); q++) { + if ( abs(A_vals[q] - ref2_vals[q])/abs(ref2_vals[q]) > ERROR_MARGIN) { + std::cout << "error: results don't match i: " << q << ", avals: " << A_vals[q] << " " + << "refvals: " << ref2_vals[q] << std::endl; + ASSERT_TRUE(false); + } + } + + } // end of for loop + + if (statfile.is_open()) { + statfile.close(); + } + +} + + + + + + +TEST(scheduling_eval, mttkrpFusedWithSyntheticData) { + if (should_use_CUDA_codegen()) { + return; + } + taco_set_num_threads(NUM_THREADS_TO_USE); + + std::default_random_engine gen(0); + std::uniform_real_distribution unif(0.0, 1.0); + // Predeclare the storage formats that the inputs and output will be stored as. + // To define a format, you must specify whether each dimension is dense or + // sparse and (optionally) the order in which dimensions should be stored. The + // formats declared below correspond to compressed sparse fiber (csf) and + // row-major dense (rm). + Format csf({Sparse,Sparse,Sparse}); + Format rm({Dense,Dense}); + Format sd({Dense,Dense}); + + int NUM_I = 1021/20; + int NUM_J = 1039/20; + int NUM_K = 1057/20; + int NUM_L = 1232/20; + int NUM_M = 1231/20; + float SPARSITY = .1; + Tensor A("A", {NUM_I, NUM_M}, sd); + Tensor B("B", {NUM_I, NUM_K, NUM_L}, csf); + Tensor C("C", {NUM_K, NUM_J}, rm); + Tensor D("D", {NUM_L, NUM_J}, rm); + Tensor E("E", {NUM_J, NUM_M}, rm); + Tensor ref({NUM_I, NUM_M}, sd); + + srand(549694); + for (int i = 0; i < NUM_I; i++) { + for (int k = 0; k < NUM_K; k++) { + for (int l = 0; l < NUM_L; l++) { + float rand_float = (float) rand() / (float) (RAND_MAX); + if (rand_float < SPARSITY) { + B.insert({i, k, l}, (double) ((int) (rand_float * 3 / SPARSITY))); + } + } + } + } + B.pack(); + write("/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/synthetic.tns", B); + + // Generate a random dense matrix and store it in row-major (dense) format. + // Matrices correspond to order-2 tensors in taco. 
+ for (int k = 0; k < NUM_K; k++) { + for (int j = 0; j < NUM_J; j++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + C.insert({k, j}, (double) ((int) (rand_float*3))); + } + } + C.pack(); + + for (int l = 0; l < NUM_L; l++) { + for (int j = 0; j < NUM_J; j++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + D.insert({l, j}, (double) ((int) (rand_float*3))); + } + } + D.pack(); + + for (int i = 0; i < E.getDimension(0); ++i) { + for (int j = 0; j < E.getDimension(1); ++j) { + E.insert({i,j}, unif(gen)); + } + } + E.pack(); + + // Define the MTTKRP computation using index notation. + IndexVar i, k, l, j, m; + A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m); + + + IndexStmt stmt = makeReductionNotation(A.getAssignment()); + stmt = makeConcreteNotation(stmt); + printToFile("fusedMTTKRPConcrete", stmt); + + stmt = reorderLoopsTopologically(stmt); + printToFile("fusedMTTKRPOrdered", stmt); + + stmt = loopFusionOverFission(stmt, A.getAssignment(), "b", 1); + printToFile("fusedMTTKRPFused", stmt); + + stmt = insertTemporaries(stmt); + printToFile("fusedMTTKRPWithTemps", stmt); + stmt = parallelizeOuterLoop(stmt); + printToFile("fusedMTTKRPFusedPar", stmt); + + + // At this point, we have defined how entries in the output matrix should be + // computed from entries in the input tensor and matrices but have not actually + // performed the computation yet. To do so, we must first tell taco to generate + // code that can be executed to compute the MTTKRP operation. + A.compile(stmt); + // We can now call the functions taco generated to assemble the indices of the + // output matrix and then actually compute the MTTKRP. 
+ A.assemble(); + + + ref(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m); + IndexStmt refStmt = makeReductionNotation(ref.getAssignment()); + refStmt = makeConcreteNotation(refStmt); + refStmt = insertTemporaries(refStmt); + refStmt = parallelizeOuterLoop(refStmt); + ref.compile(refStmt); + ref.assemble(); + + // Tensor ref2({NUM_I, NUM_J}, sd); + // ref2(i,j) = B(i,k,l) * D(l,j) * C(k,j); + // IndexStmt ref2Stmt = makeReductionNotation(ref2.getAssignment()); + // ref2Stmt = makeConcreteNotation(ref2Stmt); + // ref2Stmt = insertTemporaries(ref2Stmt); + // ref2Stmt = parallelizeOuterLoop(ref2Stmt); + // ref2.compile(ref2Stmt); + // ref2.assemble(); + + // Tensor ref3({NUM_I, NUM_M}, sd); + // ref3(i,m) = ref2(i,j) * E(j,m); + // IndexStmt ref3Stmt = makeReductionNotation(ref3.getAssignment()); + // ref3Stmt = makeConcreteNotation(ref3Stmt); + // ref3Stmt = insertTemporaries(ref3Stmt); + // ref3Stmt = parallelizeOuterLoop(ref3Stmt); + // ref3.compile(ref3Stmt); + // ref3.assemble(); + + std::cout << "compute start\n"; + taco::util::TimeResults timevalue; + bool time = true; + // TOOL_BENCHMARK_TIMER(ref.compute(), "\n\nReference ISPC: ", timevalue); + TOOL_BENCHMARK_TIMER(A.compute(), "\n\nFused MTTKRP+SPMM: ", timevalue); + TOOL_BENCHMARK_TIMER(ref.compute(), "\n\nReference MTTKRP+SPMM: ", timevalue); + // TOOL_BENCHMARK_TIMER(ref2.compute(), "\n\nReference MTTKRP: ", timevalue); + // TOOL_BENCHMARK_TIMER(ref3.compute(), "\n\nReference SPMM: ", timevalue); + ASSERT_TENSOR_EQ(ref, A); + // ASSERT_TENSOR_EQ(ref, ref3); + +} + + +TEST(scheduling_eval, mttkrpFused) { + if (should_use_CUDA_codegen()) { + return; + } + + taco_set_num_threads(NUM_THREADS_TO_USE); + + ofstream statfile; + statfile.open( + "/home/min/a/kadhitha/workspace/my_taco/taco/test/stats/mttkrp-spmm.txt", std::ios::app); + if (statfile.is_open()) { + statfile << "\nmttkrp-spmm execution\n"; + statfile << "\n-----------------------------------------\n"; + } + + std::default_random_engine gen(0); + 
std::uniform_real_distribution unif(0.0, 1.0); + // Predeclare the storage formats that the inputs and output will be stored as. + // To define a format, you must specify whether each dimension is dense or + // sparse and (optionally) the order in which dimensions should be stored. The + // formats declared below correspond to compressed sparse fiber (csf) and + // row-major dense (rm). + Format csf({Dense,Sparse,Sparse}); + Format rm({Dense,Dense}); + Format sd({Dense,Dense}); + int jDim = 32; + int mDim = 64; + + int matfilenum = 3; + + // Load a sparse order-3 tensor from file (stored in the FROSTT format) and + // store it as a compressed sparse fiber tensor. The tensor in this example + // can be download from: http://frostt.io/tensors/nell-2/ + std::vector matfiles = { + "/home/min/a/kadhitha/ispc-examples/data/tns/matmul_5-5-5.tns", + "/home/min/a/kadhitha/ispc-examples/data/tns/delicious-3d.tns", + "/home/min/a/kadhitha/ispc-examples/data/tns/flickr-3d.tns", // 2 + "/home/min/a/kadhitha/ispc-examples/data/tns/nell-2.tns", // 3 + "/home/min/a/kadhitha/ispc-examples/data/tns/nell-1.tns", // 4 + "/home/min/a/kadhitha/ispc-examples/data/tns/vast-2015-mc1-3d.tns", // 5 + "/home/min/a/kadhitha/ispc-examples/data/tns/darpa1998.tns", // 6 + "/home/min/a/kadhitha/ispc-examples/data/tns/freebase_music.tns", + "/home/min/a/kadhitha/ispc-examples/data/tns/freebase_sampled.tns" // 8 + }; + std::vector matfilesrw = { + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/matmul_5-5-5.tns", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/delicious-3d.tns", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/flickr-3d.tns", // 2 + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/nell-2.tns", // 3 + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/nell-1.tns", // 4 + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/vast-2015-mc1-3d.tns", // 5 + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/darpa1998.tns", // 6 + 
"/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/freebase_music.tns", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/freebase_sampled.tns" + }; + std::string matfile = matfiles[matfilenum]; + Tensor B = read(matfile, csf, true); + // write(matfilesrw[matfilenum], B); + + // Generate a random dense matrix and store it in row-major (dense) format. + // Matrices correspond to order-2 tensors in taco. + Tensor C({B.getDimension(1), jDim}, rm); + for (int i = 0; i < C.getDimension(0); ++i) { + for (int j = 0; j < C.getDimension(1); ++j) { + C.insert({i,j}, unif(gen)); + } + } + C.pack(); + + // Generate another random dense matrix and store it in row-major format. + Tensor D({B.getDimension(2), jDim}, rm); + for (int i = 0; i < D.getDimension(0); ++i) { + for (int j = 0; j < D.getDimension(1); ++j) { + D.insert({i,j}, unif(gen)); + } + } + D.pack(); + + Tensor E({jDim, mDim}, rm); + for (int i = 0; i < E.getDimension(0); ++i) { + for (int j = 0; j < E.getDimension(1); ++j) { + E.insert({i,j}, unif(gen)); + } + } + E.pack(); + + if (statfile.is_open()) { + statfile + << matfile << std::endl + << "A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m)" << std::endl + << "B1_dimension: " << B.getDimension(0) << ", B2_dimension: " << B.getDimension(1) << ", B3_dimension: " << B.getDimension(0) << ", vals: " << B.getStorage().getValues().getSize() << std::endl + << "C1_dimension: " << C.getDimension(0) << ", C2_dimension: " << C.getDimension(1) << ", vals: " << C.getStorage().getValues().getSize() << std::endl + << "D1_dimension: " << D.getDimension(0) << ", D2_dimension: " << D.getDimension(1) << ", vals: " << D.getStorage().getValues().getSize() << std::endl + << "E1_dimension: " << E.getDimension(0) << ", E2_dimension: " << E.getDimension(1) << ", vals: " << E.getStorage().getValues().getSize() << std::endl + << std::endl; + } + + // Declare the output matrix to be a dense matrix with 25 columns and the same + // number of rows as the number of slices along the 
first dimension of input + // tensor B, to be also stored as a row-major dense matrix. + Tensor A({B.getDimension(0), mDim}, sd); + Tensor ref({B.getDimension(0), mDim}, sd); + + // Define the MTTKRP computation using index notation. + IndexVar i, k, l, j, m; + IndexVar i1("i1"), i2("i2"), j1("j1"), j2("j2"), m1("m1"), m2("m2"); + + A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m); + + IndexStmt stmt = makeReductionNotation(A.getAssignment()); + stmt = makeConcreteNotation(stmt); + stmt = reorderLoopsTopologically(stmt); + // stmt = stmt.reorder({i,j,k,l,m}); + stmt = loopFusionOverFission(stmt, A.getAssignment(), "b", 1); + stmt = stmt.split(i, i1, i2, 16); + stmt = insertTemporaries(stmt); + stmt = parallelizeOuterLoop(stmt); + printToFile("fusedMTTKRPFusedPar", stmt); + A.compile(stmt); + A.assemble(); + + + ref(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m); + IndexStmt refStmt = makeReductionNotation(ref.getAssignment()); + refStmt = makeConcreteNotation(refStmt); + refStmt = refStmt + .split(i, i1, i2, 16); + refStmt = insertTemporaries(refStmt); + refStmt = parallelizeOuterLoop(refStmt); + ref.compile(refStmt); + ref.assemble(); + + Tensor ref2({B.getDimension(0), jDim}, sd); + ref2(i,j) = B(i,k,l) * D(l,j) * C(k,j); + IndexStmt ref2Stmt = makeReductionNotation(ref2.getAssignment()); + ref2Stmt = makeConcreteNotation(ref2Stmt); + ref2Stmt = ref2Stmt + .split(i, i1, i2, 16); + ref2Stmt = insertTemporaries(ref2Stmt); + ref2Stmt = parallelizeOuterLoop(ref2Stmt); + ref2.compile(ref2Stmt); + ref2.assemble(); + + Tensor ref2_ryan({B.getDimension(0), jDim}, sd); + ref2_ryan(i,j) = B(i,k,l) * D(l,j) * C(k,j); + + IndexStmt ref2RyanStmt = makeReductionNotation(ref2_ryan.getAssignment()); + ref2RyanStmt = makeConcreteNotation(ref2RyanStmt); + + IndexExpr precomputeExpr = ref2RyanStmt.as().getStmt().as().getStmt() + .as().getStmt().as().getStmt() + .as().getRhs().as().getA(); + TensorVar w("w", Type(Float64, {Dimension(j)}), taco::dense); + ref2RyanStmt = 
ref2RyanStmt.split(i, i1, i2, 16) + .reorder({i1, i2, k, l, j}) + .precompute(precomputeExpr, j, j, w) + .parallelize(i1, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces); + ref2RyanStmt = insertTemporaries(ref2RyanStmt); + // ref2RyanStmt = parallelizeOuterLoop(ref2RyanStmt); + ref2_ryan.compile(ref2RyanStmt); + ref2_ryan.assemble(); + + Tensor ref3({B.getDimension(0), mDim}, sd); + ref3(i,m) = ref2(i,j) * E(j,m); + IndexStmt ref3Stmt = makeReductionNotation(ref3.getAssignment()); + ref3Stmt = makeConcreteNotation(ref3Stmt); + ref3Stmt = ref3Stmt + .split(i, i1, i2, 16) + .split(j, j1, j2, 16) + .split(m, m1, m2, 16) + .reorder({i1, j1, m1, i2, j2, m2}) + .parallelize(i1, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces); + ref3Stmt = insertTemporaries(ref3Stmt); + ref3Stmt = parallelizeOuterLoop(ref3Stmt); + ref3.compile(ref3Stmt); + ref3.assemble(); + + + std::cout << "compute start\n"; + taco::util::TimeResults timevalue; + bool time = true; + + TOOL_BENCHMARK_TIMER(ref2.compute(statfile), "\n\nDefault MTTKRP: ", timevalue); + if (statfile.is_open()) { + statfile << "default mttkrp time: "; + statfile << timevalue.mean << std::endl; + } else { std::cout << " stat file is not open\n"; } + + TOOL_BENCHMARK_TIMER(ref2_ryan.compute(statfile), "\n\nRyan MTTKRP workspace: ", timevalue); + if (statfile.is_open()) { + statfile << "ryan mttkrp workspace time: "; + statfile << timevalue.mean << std::endl; + } else { std::cout << " stat file is not open\n"; } + + double* ref2_vals = (double*) (ref2.getTacoTensorT()->vals); + double* ref2_ryan_vals = (double*) (ref2_ryan.getTacoTensorT()->vals); + for (int q=0; q < B.getDimension(0)* jDim; q++) { + if ( abs(ref2_vals[q] - ref2_ryan_vals[q])/abs(ref2_ryan_vals[q]) > ERROR_MARGIN) { + std::cout << "error: results don't match i: " << q << ", avals: " << ref2_vals[q] << " " + << "refvals: " << ref2_ryan_vals[q] << std::endl; + ASSERT_TRUE(false); + } + } + + TOOL_BENCHMARK_TIMER(ref3.compute(statfile), "\n\nGeMM time: 
", timevalue); + if (statfile.is_open()) { + statfile << "GeMM time: "; + statfile << timevalue.mean << std::endl; + } else { std::cout << " stat file is not open\n"; } + + + TOOL_BENCHMARK_TIMER(ref.compute(statfile), "\n\nReference MTTKRP+GEMM: ", timevalue); + if (statfile.is_open()) { + statfile << "reference asymptotic blowup time: "; + statfile << timevalue.mean << std::endl; + } else { std::cout << " stat file is not open\n"; } + + double* ref3_vals = (double*) (ref3.getTacoTensorT()->vals); + double* ref_vals = (double*) (ref.getTacoTensorT()->vals); + for (int q=0; q < B.getDimension(0)* mDim; q++) { + if ( abs(ref3_vals[q] - ref_vals[q])/abs(ref_vals[q]) > ERROR_MARGIN) { + std::cout << "error: results don't match i: " << q << ", avals: " << ref3_vals[q] << " " + << "refvals: " << ref_vals[q] << std::endl; + ASSERT_TRUE(false); + } + } + + TOOL_BENCHMARK_TIMER(A.compute(statfile), "\n\nFused MTTKRP+GEMM: ", timevalue); + if (statfile.is_open()) { + statfile << "fused mttkrp+gemm time: "; + statfile << timevalue.mean << std::endl; + } else { std::cout << " stat file is not open\n"; } + + if (statfile.is_open()) { + statfile.close(); + } + + double* A_vals = (double*) (A.getTacoTensorT()->vals); + for (int q=0; q < B.getDimension(0)* mDim; q++) { + if ( abs(A_vals[q] - ref_vals[q])/abs(ref_vals[q]) > ERROR_MARGIN) { + std::cout << "error: results don't match i: " << q << ", avals: " << A_vals[q] << " " + << "refvals: " << ref_vals[q] << std::endl; + ASSERT_TRUE(false); + } + } + + +} + +TEST(scheduling_eval, ttmFusedWithSyntheticData) { + if (should_use_CUDA_codegen()) { + return; + } + + taco_set_num_threads(NUM_THREADS_TO_USE); + + std::default_random_engine gen(0); + std::uniform_real_distribution unif(0.0, 1.0); + Format csf({Sparse,Sparse,Sparse}); + Format custom({Sparse,Sparse,Dense}); + Format rm({Dense,Dense}); + + int NUM_I = 5; + int NUM_J = 5; + int NUM_K = 5; + int NUM_L = 64; + int NUM_M = 1024; + float SPARSITY = .1; + + Tensor B("B", {NUM_I, 
NUM_J, NUM_K}, csf); + srand(549694); + for (int i = 0; i < NUM_I; i++) { + for (int j = 0; j < NUM_J; j++) { + for (int k = 0; k < NUM_K; k++) { + float rand_float = (float) rand() / (float) (RAND_MAX); + if (rand_float < SPARSITY) { + B.insert({i, j, k}, (double) ((int) (rand_float * 3 / SPARSITY))); + } + } + } + } + B.pack(); + write("/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/synthetic.tns", B); + + // Generate a random dense matrix and store it in row-major (dense) format. + // Matrices correspond to order-2 tensors in taco. + Tensor C({B.getDimension(2), NUM_L}, rm); + for (int i = 0; i < C.getDimension(0); ++i) { + for (int j = 0; j < C.getDimension(1); ++j) { + C.insert({i,j}, unif(gen)); + } + } + C.pack(); + + // Generate another random dense matrix and store it in row-major format. + Tensor D({NUM_L, NUM_M}, rm); + for (int i = 0; i < D.getDimension(0); ++i) { + for (int j = 0; j < D.getDimension(1); ++j) { + D.insert({i,j}, unif(gen)); + } + } + D.pack(); + + Tensor A({B.getDimension(0), B.getDimension(1), NUM_M}, custom); + Tensor ref({B.getDimension(0), B.getDimension(1), NUM_M}, custom); + + // Define the MTTKRP computation using index notation. + IndexVar i, j, k, l, m; + A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m); + + IndexStmt stmt = makeReductionNotation(A.getAssignment()); + stmt = makeConcreteNotation(stmt); + printToFile("fusedTTMTTKRPConcrete", stmt); + + stmt = reorderLoopsTopologically(stmt); + printToFile("fusedTTMOrdered", stmt); + + stmt = loopFusionOverFission(stmt, A.getAssignment(), "b", 1); + printToFile("fusedTTMFused", stmt); + + stmt = insertTemporaries(stmt); + printToFile("fusedTTMWithTemps", stmt); + stmt = parallelizeOuterLoop(stmt); + printToFile("fusedTTMFinal", stmt); + + + // At this point, we have defined how entries in the output matrix should be + // computed from entries in the input tensor and matrices but have not actually + // performed the computation yet. 
To do so, we must first tell taco to generate + // code that can be executed to compute the TTM->TTM operation. + A.compile(stmt); + // Assemble the index structures of the output tensor; the values themselves + // are produced later by A.compute(). + A.assemble(); + + + // Reference: the same expression scheduled without reorderLoopsTopologically / loopFusionOverFission. + ref(i,j,m) = B(i,j,k) * C(k,l) * D(l,m); + IndexStmt refStmt = makeReductionNotation(ref.getAssignment()); + refStmt = makeConcreteNotation(refStmt); + refStmt = insertTemporaries(refStmt); + refStmt = parallelizeOuterLoop(refStmt); + printToFile("tacoFusedTTM", refStmt); + ref.compile(refStmt); + ref.assemble(); + + // Unfused pipeline 1: ref1 = B * C, then ref2 = ref1 * D. + Tensor ref1({B.getDimension(0), B.getDimension(1), NUM_L}, custom); + ref1(i,j,l) = B(i,j,k) * C(k,l); + IndexStmt ref1Stmt = makeReductionNotation(ref1.getAssignment()); + ref1Stmt = makeConcreteNotation(ref1Stmt); + ref1Stmt = insertTemporaries(ref1Stmt); + ref1Stmt = parallelizeOuterLoop(ref1Stmt); + ref1.compile(ref1Stmt); + ref1.assemble(); + + Tensor ref2({B.getDimension(0), B.getDimension(1), NUM_M}, custom); + ref2(i,j,m) = ref1(i,j,l) * D(l,m); + IndexStmt ref2Stmt = makeReductionNotation(ref2.getAssignment()); + ref2Stmt = makeConcreteNotation(ref2Stmt); + ref2Stmt = insertTemporaries(ref2Stmt); + ref2Stmt = parallelizeOuterLoop(ref2Stmt); + ref2.compile(ref2Stmt); + ref2.assemble(); + + // Unfused pipeline 2: ref3 = C * D (dense), then ref4 = B * ref3. + Tensor ref3({B.getDimension(2), NUM_M}, rm); + ref3(k,m) = C(k,l) * D(l,m); + IndexStmt ref3Stmt = makeReductionNotation(ref3.getAssignment()); + ref3Stmt = makeConcreteNotation(ref3Stmt); + ref3Stmt = insertTemporaries(ref3Stmt); + ref3Stmt = parallelizeOuterLoop(ref3Stmt); + ref3.compile(ref3Stmt); + ref3.assemble(); + + Tensor ref4({B.getDimension(0), B.getDimension(1), NUM_M}, custom); + ref4(i,j,m) = B(i,j,k) * ref3(k,m); + IndexStmt ref4Stmt = makeReductionNotation(ref4.getAssignment()); + ref4Stmt = makeConcreteNotation(ref4Stmt); + ref4Stmt = insertTemporaries(ref4Stmt); + ref4Stmt = parallelizeOuterLoop(ref4Stmt); + ref4.compile(ref4Stmt); + ref4.assemble(); + + 
std::cout << "compute start\n"; + taco::util::TimeResults timevalue; + bool time = true; + // TOOL_BENCHMARK_TIMER(ref.compute(), "\n\nReference ISPC: ", timevalue); + TOOL_BENCHMARK_TIMER(A.compute(), "\n\nFused TTM->TTM: ", timevalue); + TOOL_BENCHMARK_TIMER(ref.compute(), "\n\nReference TTM->TTM: ", timevalue); + // ref1 then ref2: the two TTMs executed separately, so each gets its own label. + TOOL_BENCHMARK_TIMER(ref1.compute(), "\n\nTTM1: ", timevalue); + TOOL_BENCHMARK_TIMER(ref2.compute(), "\n\nTTM2: ", timevalue); + TOOL_BENCHMARK_TIMER(ref3.compute(), "\n\ndense: ", timevalue); + TOOL_BENCHMARK_TIMER(ref4.compute(), "\n\nTTM after dense: ", timevalue); + ASSERT_TENSOR_EQ(ref, A); + ASSERT_TENSOR_EQ(ref, ref2); + ASSERT_TENSOR_EQ(ref, ref4); + + // Element-wise relative-error check of the fused result against the reference. + for (int q = 0; q < A.getDimension(0); ++q) { + for (int w = 0; w < A.getDimension(1); ++w) { + for (int z = 0; z < A.getDimension(2); ++z) { + // std::cout << "(" << q << "," << w << "," << z << ")" + // << "a: " << A(q,w,z) << ", ref: " << ref(q,w,z) << std::endl; + if ( abs(A(q,w,z) - ref(q,w,z))/abs(ref(q,w,z)) > ERROR_MARGIN) { + std::cout << "error: results don't match A: " + << A(q,w,z) << ", ref: " << ref(q,w,z) << std::endl; + ASSERT_TRUE(false); + } + } + } + } + +} + +TEST(scheduling_eval, ttmFused) { + if (should_use_CUDA_codegen()) { + return; + } + + taco_set_num_threads(NUM_THREADS_TO_USE); + + ofstream statfile; + statfile.open( + "/home/min/a/kadhitha/workspace/my_taco/taco/test/stats/ttm-ttm.txt", std::ios::app); + if (statfile.is_open()) { + statfile << "\nttm-ttm execution\n"; + statfile << "\n-----------------------------------------\n"; + } + + std::default_random_engine gen(0); + std::uniform_real_distribution unif(0.0, 1.0); + Format csf({Dense,Sparse,Sparse}); + Format custom({Dense,Sparse,Dense}); + Format rm({Dense,Dense}); + int ldim = 32; + int mdim = 64; + + int64_t dummy_array_size = 2e6; + int64_t* dummy_array_to_flush_cache = (int64_t*) malloc(dummy_array_size*sizeof(int64_t)); + + vector matfilenums = {5}; + + for (auto matfilenum : matfilenums) { + + // int 
matfilenum = 0; + + + + // Load a sparse order-3 tensor from file (stored in the FROSTT format) and + // store it as a compressed sparse fiber tensor. The tensor in this example + // can be download from: http://frostt.io/tensors/nell-2/ + std::vector matfiles = { + "/home/min/a/kadhitha/ispc-examples/data/tns/matmul_5-5-5.tns", + "/home/min/a/kadhitha/ispc-examples/data/tns/delicious-3d.tns", + "/home/min/a/kadhitha/ispc-examples/data/tns/flickr-3d.tns", // 2 + "/home/min/a/kadhitha/ispc-examples/data/tns/nell-2.tns", // 3 + "/home/min/a/kadhitha/ispc-examples/data/tns/nell-1.tns", // 4 + "/home/min/a/kadhitha/workspace/my_taco/tns/vast-2015-mc1-3d.tns", // 5 + "/home/min/a/kadhitha/workspace/my_taco/tns/darpa1998.tns", // 6 + "/home/min/a/kadhitha/ispc-examples/data/tns/freebase_music.tns", + "/home/min/a/kadhitha/ispc-examples/data/tns/freebase_sampled.tns" + }; + std::vector matfilesrw = { + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/matmul_5-5-5.tns", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/delicious-3d.tns", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/flickr-3d.tns", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/nell-2.tns", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/nell-1.tns", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/vast-2015-mc1-3d.tns", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/darpa1998.tns", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/freebase_music.tns", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/freebase_sampled.tns" + }; + statfile << "\nfile: " << matfiles[matfilenum] << std::endl; + statfile << "----------------------------------------------------------------\n"; + + std::string matfile = matfiles[matfilenum]; + Tensor B = read(matfile, csf); + B.setName("B"); + B.pack(); + // write(matfilesrw[matfilenum], B); + + // Generate a random dense matrix and store it in row-major (dense) format. 
+ // Matrices correspond to order-2 tensors in taco. + Tensor C("C", {B.getDimension(2), ldim}, rm); + for (int i = 0; i < C.getDimension(0); ++i) { + for (int j = 0; j < C.getDimension(1); ++j) { + C.insert({i,j}, unif(gen)); + } + } + C.pack(); + + // Generate another random dense matrix and store it in row-major format. + Tensor D("D", {ldim, mdim}, rm); + for (int i = 0; i < D.getDimension(0); ++i) { + for (int j = 0; j < D.getDimension(1); ++j) { + D.insert({i,j}, unif(gen)); + } + } + D.pack(); + + // Record the benchmarked expression and the operand sizes in the stats file. + if (statfile.is_open()) { + statfile + << matfile << std::endl + << "A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m)" << std::endl + << "B1_dimension: " << B.getDimension(0) << ", B2_dimension: " << B.getDimension(1) << ", B3_dimension: " << B.getDimension(2) << ", vals: " << B.getStorage().getValues().getSize() << std::endl + << "C1_dimension: " << C.getDimension(0) << ", C2_dimension: " << C.getDimension(1) << ", vals: " << C.getStorage().getValues().getSize() << std::endl + << "D1_dimension: " << D.getDimension(0) << ", D2_dimension: " << D.getDimension(1) << ", vals: " << D.getStorage().getValues().getSize() << std::endl + << std::endl; + } + + // Three outputs of identical shape and format, one per schedule being compared. + Tensor A({B.getDimension(0), B.getDimension(1), mdim}, custom); + Tensor ref({B.getDimension(0), B.getDimension(1), mdim}, custom); + Tensor refn({B.getDimension(0), B.getDimension(1), mdim}, custom); + + // Define the TTM->TTM computation using index notation. 
+ IndexVar i, j, k, l, m; + IndexVar i0,i1, j0, j1, k0, k1, l0, l1, m0, m1; + A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m); + + + IndexStmt stmt = makeReductionNotation(A.getAssignment()); + stmt = makeConcreteNotation(stmt); + stmt = reorderLoopsTopologically(stmt); + stmt = loopFusionOverFission(stmt, A.getAssignment(), "b", 1); + stmt = stmt.split(i, i0, i1, 16); + stmt = insertTemporaries(stmt); + stmt = parallelizeOuterLoop(stmt); + printToFile("fusedTTMFinal", stmt); + + A.compile(stmt); + A.assemble(); + + + ref(i,j,m) = B(i,j,k) * C(k,l) * D(l,m); // TTM->TTM TACO + IndexStmt refStmt = makeReductionNotation(ref.getAssignment()); + refStmt = makeConcreteNotation(refStmt); + refStmt = refStmt + .split(i, i0, i1, 16); + refStmt = insertTemporaries(refStmt); + refStmt = parallelizeOuterLoop(refStmt); + printToFile("tacoFusedTTM", refStmt); + ref.compile(refStmt); + ref.assemble(); + + refn(i,j,m) = B(i,j,k) * C(k,l) * D(l,m); // TTM->TTM TACO + IndexStmt refnStmt = makeReductionNotation(refn.getAssignment()); + refnStmt = makeConcreteNotation(refnStmt); + refnStmt = refnStmt + .split(i, i0, i1, 16) + .reorder({i0, i1, j, k, l, m}); + refnStmt = insertTemporaries(refnStmt); + refnStmt = parallelizeOuterLoop(refnStmt); + printToFile("tacoFusedTTM", refnStmt); + refn.compile(refnStmt); + refn.assemble(); + + Tensor ref1({B.getDimension(0), B.getDimension(1), ldim}, custom); + ref1(i,j,l) = B(i,j,k) * C(k,l); // TTM1 + IndexStmt ref1Stmt = makeReductionNotation(ref1.getAssignment()); + ref1Stmt = makeConcreteNotation(ref1Stmt); + // ref1Stmt = ref1Stmt.split(i, i0, i1, 16); + ref1Stmt = insertTemporaries(ref1Stmt); + ref1Stmt = parallelizeOuterLoop(ref1Stmt); + ref1.compile(ref1Stmt); + ref1.assemble(); + + Tensor ref2({B.getDimension(0), B.getDimension(1), mdim}, custom); + ref2(i,j,m) = ref1(i,j,l) * D(l,m); // TTM2 + IndexStmt ref2Stmt = makeReductionNotation(ref2.getAssignment()); + ref2Stmt = makeConcreteNotation(ref2Stmt); + // ref2Stmt = ref2Stmt.split(i, i0, i1, 
16); + ref2Stmt = insertTemporaries(ref2Stmt); + ref2Stmt = parallelizeOuterLoop(ref2Stmt); + ref2.compile(ref2Stmt); + ref2.assemble(); + + Tensor ref3({B.getDimension(2), mdim}, rm); + ref3(k,m) = C(k,l) * D(l,m); // GeMM + IndexStmt ref3Stmt = makeReductionNotation(ref3.getAssignment()); + ref3Stmt = makeConcreteNotation(ref3Stmt); + ref3Stmt = ref3Stmt + .split(k, k0, k1, 32) + .split(l, l0, l1, 32) + .split(m, m0, m1, 32) + .reorder({k0, l0, m0, k1, l1, m1}); + ref3Stmt = insertTemporaries(ref3Stmt); + ref3Stmt = parallelizeOuterLoop(ref3Stmt); + ref3.compile(ref3Stmt); + ref3.assemble(); + + Tensor ref4({B.getDimension(0), B.getDimension(1), mdim}, custom); + ref4(i,j,m) = B(i,j,k) * ref3(k,m); // TTM1 + IndexStmt ref4Stmt = makeReductionNotation(ref4.getAssignment()); + ref4Stmt = makeConcreteNotation(ref4Stmt); + // ref4Stmt = ref4Stmt + // .split(i, i0, i1, 16); + // // .split(k, k0, k1, 16) + // .split(m, m0, m1, 16) + // .reorder({i0, i1, j, m0, k, m1}); + ref4Stmt = insertTemporaries(ref4Stmt); + ref4Stmt = parallelizeOuterLoop(ref4Stmt); + ref4.compile(ref4Stmt); + ref4.assemble(); + + std::cout << "compute start\n"; + taco::util::TimeResults timevalue; + bool time = true; + + int r = rand(); + for (int64_t i=0; iTTM: ", timevalue); + if (statfile.is_open()) { + statfile << "fused time: "; + statfile << timevalue.mean << std::endl; + } else { std::cout << " stat file is not open\n"; } + + r = rand(); + for (int64_t i=0; iTTM: ", timevalue); + if (statfile.is_open()) { + statfile << "reference time: "; + statfile << timevalue.mean << std::endl; + } else { std::cout << " stat file is not open\n"; } + + r = rand(); + for (int64_t i=0; iTTM: ", timevalue); + if (statfile.is_open()) { + statfile << "reference new time: "; + statfile << timevalue.mean << std::endl; + } else { std::cout << " stat file is not open\n"; } + + statfile << "\nschedule 1\n"; + + r = rand(); + for (int64_t i=0; ivals); + double* ref_vals = (double*) (ref.getTacoTensorT()->vals); + 
double* ref2_vals = (double*) (ref2.getTacoTensorT()->vals); + double* ref4_vals = (double*) (ref4.getTacoTensorT()->vals); + + // int* A2_pos = (double*) (ref.getTacoTensorT()->vals); + + // for (size_t q=0; q < B.getStorage().getValues().getSize(); q++) { + // if ( abs(A_vals[q] - ref_vals[q])/abs(ref_vals[q]) > ERROR_MARGIN) { + // std::cout << "error: results don't match i: " << q << ", avals: " << A_vals[q] << " " + // << "refvals: " << ref_vals[q] << std::endl; + // ASSERT_TRUE(false); + // } + // } + + // std::cout << "our fused vs taco original fused check\n"; + // for (size_t q=0; q < A.getStorage().getValues().getSize(); q++) { + // if ( abs(A_vals[q] - ref_vals[q])/abs(ref_vals[q]) > ERROR_MARGIN) { + // std::cout << "error: results don't match i: " << q << ", avals: " << A_vals[q] << " " + // << "refvals: " << ref_vals[q] << std::endl; + // ASSERT_TRUE(false); + // } + // } + // std::cout << "taco original fused vs TTM1, TTM2 check\n"; + // for (size_t q=0; q < A.getStorage().getValues().getSize(); q++) { + // if ( abs(ref_vals[q] - ref2_vals[q])/abs(ref2_vals[q]) > ERROR_MARGIN) { + // std::cout << "error: results don't match i: " << q << ", avals: " << ref_vals[q] << " " + // << "refvals: " << ref2_vals[q] << std::endl; + // ASSERT_TRUE(false); + // } + // } + // std::cout << "taco original fused vs GeMM, TTM1 check\n"; + // for (size_t q=0; q < A.getStorage().getValues().getSize(); q++) { + // if ( abs(ref_vals[q] - ref4_vals[q])/abs(ref4_vals[q]) > ERROR_MARGIN) { + // std::cout << "error: results don't match i: " << q << ", avals: " << ref_vals[q] << " " + // << "refvals: " << ref4_vals[q] << std::endl; + // ASSERT_TRUE(false); + // } + // } + + } // end of forloop + + // Release the dummy_array_to_flush_cache buffer that was malloc'ed before the loop. + free(dummy_array_to_flush_cache); + + if (statfile.is_open()) { + statfile.close(); + } + +} + + + + +TEST(scheduling_eval, spmmFusedWithSyntheticData) { + if (should_use_CUDA_codegen()) { + return; + } + + taco_set_num_threads(NUM_THREADS_TO_USE); + + std::default_random_engine gen(0); + std::uniform_real_distribution 
unif(0.0, 1.0); + + Format csr({dense, sparse}); + Format rm({dense, dense}); + int ldim = 32; + int kdim = 64; + + // uncomment this for reading the csr matrix saved in mtx file + std::cout << "reading B mat mtx\n"; + + int NUM_I = 128; + int NUM_J = 96; + int NUM_K = 64; + float SPARSITY = .3; + Tensor B("B", {NUM_I, NUM_J}, csr); + srand(75883); + for (int i = 0; i < NUM_I; i++) { + for (int j = 0; j < NUM_J; j++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + if (rand_float < SPARSITY) { + B.insert({i, j}, (double) ((int) (rand_float*3/SPARSITY))); + } + } + } + B.pack(); + + // Second sparse operand. The entries must go into C itself; inserting into B here + // would leave C empty and make the result comparison below trivially all-zero. + Tensor C("C", {NUM_J, NUM_K}, csr); + for (int j = 0; j < NUM_J; j++) { + for (int k = 0; k < NUM_K; k++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + if (rand_float < SPARSITY) { + C.insert({j, k}, (double) ((int) (rand_float*3/SPARSITY))); + } + } + } + C.pack(); + // write("/home/min/a/kadhitha/ispc-examples/data/suitesparse/synthetic/synthetic.mtx", B); + + std::cout << "B dim0: " << B.getDimension(0) << ", dim1: " << B.getDimension(1) << std::endl; + std::cout << "adding c mat\n"; + Tensor D({C.getDimension(1), ldim}, rm); + for (int i = 0; i < D.getDimension(0); ++i) { + for (int j = 0; j < D.getDimension(1); ++j) { + D.insert({i,j}, unif(gen)); + } + } + std::cout << "packing C mat\n"; + D.pack(); + + // Tensor E({B.getDimension(1), kdim}, rm); + // for (int i = 0; i < D.getDimension(0); ++i) { + // for (int j = 0; j < D.getDimension(1); ++j) { + // D.insert({i,j}, unif(gen)); + // } + // } + // std::cout << "packing D mat\n"; + // D.pack(); + + // Tensor F({B.getDimension(1), ldim}, rm); + // for (int i = 0; i < F.getDimension(0); ++i) { + // for (int j = 0; j < F.getDimension(1); ++j) { + // F.insert({i,j}, unif(gen)); + // } + // } + // std::cout << "packing F mat\n"; + // F.pack(); + + Tensor A({B.getDimension(0), ldim}, rm); + Tensor ref({B.getDimension(0), ldim}, rm); + IndexVar i, j, k, l; + A(i,l)=B(i,j)*C(j,k)*D(k,l); + + // IndexStmt stmt = 
A.getAssignment().concretize(); + IndexStmt stmt = makeReductionNotation(A.getAssignment()); + stmt = makeConcreteNotation(stmt); + printToFile("fusedMMConcrete", stmt); + + stmt = reorderLoopsTopologically(stmt); + printToFile("fusedMMOrdered", stmt); + + stmt = loopFusionOverFission(stmt, A.getAssignment(), "b", 1); + printToFile("fusedMMFused", stmt); + + stmt = insertTemporaries(stmt); + printToFile("fusedMMWithTemps", stmt); + stmt = parallelizeOuterLoop(stmt); + printToFile("fusedMMFusedPar", stmt); + + A.compile(stmt); + // Assemble the index structures of the output; the values themselves are + // produced later by A.compute(). + A.assemble(); + + + // Reference: the same expression scheduled without reorderLoopsTopologically / loopFusionOverFission. + ref(i,l)=B(i,j)*C(j,k)*D(k,l); + IndexStmt refStmt = makeReductionNotation(ref.getAssignment()); + refStmt = makeConcreteNotation(refStmt); + refStmt = insertTemporaries(refStmt); + refStmt = parallelizeOuterLoop(refStmt); + ref.compile(refStmt); + ref.assemble(); + + // Tensor ref1({B.getDimension(0), B.getDimension(1)}, csr); + // Tensor ref2({B.getDimension(0), ldim}, rm); + // ref1(i,j)=B(i,j)*C(i,k)*D(j,k); + // ref2(i,l)=ref1(i,j)*F(j,l); + + // IndexStmt ref1Stmt = makeReductionNotation(ref1.getAssignment()); + // ref1Stmt = makeConcreteNotation(ref1Stmt); + // ref1Stmt = insertTemporaries(ref1Stmt); + // ref1Stmt = parallelizeOuterLoop(ref1Stmt); + // ref1.compile(ref1Stmt); + // ref1.assemble(); + + // IndexStmt ref2Stmt = makeReductionNotation(ref2.getAssignment()); + // ref2Stmt = makeConcreteNotation(ref2Stmt); + // ref2Stmt = insertTemporaries(ref2Stmt); + // ref2Stmt = parallelizeOuterLoop(ref2Stmt); + // ref2.compile(ref2Stmt); + // ref2.assemble(); + + std::cout << "compute start\n"; + taco::util::TimeResults timevalue; + bool time = true; + TOOL_BENCHMARK_TIMER(ref.compute(), "\n\nReference Kernel: ", timevalue); + TOOL_BENCHMARK_TIMER(A.compute(), "\n\nFused Kernel: ", timevalue); + + // check results + for (int q = 0; q < A.getDimension(0); ++q) { + for (int w = 0; w 
< A.getDimension(1); ++w) { + if ( abs(A(q,w) - ref(q,w))/abs(ref(q,w)) > ERROR_MARGIN) { + std::cout << "error: results don't match A("<< q << "," << w << "): " + << A(q,w) << ", ref: " << ref(q,w) << std::endl; + ASSERT_TRUE(false); + } + } + } + // // ASSERT_TENSOR_EQ(A, ref); + // TOOL_BENCHMARK_TIMER(ref1.compute(), "\n\nSDDMM Kernel: ", timevalue); + // TOOL_BENCHMARK_TIMER(ref2.compute(), "\n\nSpMM Kernel: ", timevalue); + + // for (int q = 0; q < ref2.getDimension(0); ++q) { + // for (int w = 0; w < ref2.getDimension(1); ++w) { + // if ( abs(ref2(q,w) - ref(q,w))/abs(ref(q,w)) > ERROR_MARGIN) { + // std::cout << "error: results don't match A("<< q << "," << w << "): " + // << ref2(q,w) << ", ref: " << ref(q,w) << std::endl; + // ASSERT_TRUE(false); + // } + // } + // } + +} + + +TEST(scheduling_eval, spmmFused) { + if (should_use_CUDA_codegen()) { + return; + } + + taco_set_num_threads(NUM_THREADS_TO_USE); + + ofstream statfile; + statfile.open( + "/home/min/a/kadhitha/workspace/my_taco/taco/test/stats/spmm-gemm.txt", std::ios::app); + if (statfile.is_open()) { + statfile << "\nspmm-spmm execution\n"; + statfile << "\n-----------------------------------------\n"; + } + + std::default_random_engine gen(0); + std::uniform_real_distribution unif(0.0, 1.0); + + Format csr({dense, sparse}); + Format rm({dense, dense}); + int kdim = 128; + int ldim = 64; + + // vector filenums = {2,3,4,5,6,7,8,9,10,12,15}; + vector filenums = {0}; + + for (auto filenum : filenums) { + + + statfile << "filenum: " << filenum << std::endl; + statfile << "---------------------------------\n"; + // int filenum = 7; + + std::vector matfiles = { + "/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx", + "/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/amazon.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/synthetic/synthetic.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx", + 
"/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx", // 2 + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rma10/rma10.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/cant/cant.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/consph/consph.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/shipsec1/shipsec1.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/scircuit/scircuit.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx", // 10 + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/wtk/pwtk.mtx", + "/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx", // 12 + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/wiki-Talk/wiki-Talk.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/com-Orkut/com-Orkut.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx", // 15 + "/home/min/a/kadhitha/workspace/my_taco/FusedMM/dataset/harvard.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/twitter7/twitter7.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k.mtx", + }; + std::vector matfilesrw = { + "/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/rw/cora.mtx", + "/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/rw/amazon.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/synthetic.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/cage3.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/bcsstk17.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/pdb1HYS.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/rma10.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/cant.mtx", + 
"/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/consph.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/cop20k_A.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/shipsec1.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/scircuit.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/wtk/pwtk.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/webbase-1M.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/wiki-Talk.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/com-Orkut.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/circuit5M.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/harvard.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/twitter7.mtx" + }; + + std::string matfile = matfiles[filenum]; + std::cout << "reading B mat mtx\n"; + Tensor B = read(matfile, csr); + B.pack(); + // write(matfilesrw[filenum], B); + + if (statfile.is_open()) { + statfile << matfile << std::endl; + } + + std::cout << "B dim0: " << B.getDimension(0) << ", dim1: " << B.getDimension(1) << std::endl; + std::cout << "adding c mat\n"; + // Tensor C = read(matfiles2[filenum], csr, true); + // std::cout << "packing C mat\n"; + + std::cout << "B dim0: " << B.getDimension(0) << ", dim1: " << B.getDimension(1) << std::endl; + std::cout << "adding c mat\n"; + Tensor C("C", {B.getDimension(1), kdim}, rm); + for (int i = 0; i < C.getDimension(0); ++i) { + for (int j = 0; j < C.getDimension(1); ++j) { + C.insert({i,j}, unif(gen)); + } + } + std::cout << "packing C mat\n"; + C.pack(); + + Tensor D({C.getDimension(1), ldim}, rm); + for (int i = 0; i < D.getDimension(0); ++i) { + for (int j = 0; j < D.getDimension(1); ++j) { + D.insert({i,j}, unif(gen)); + } + } + std::cout << "packing D mat\n"; + D.pack(); + + // Tensor F({B.getDimension(1), ldim}, rm); + // for (int i 
= 0; i < F.getDimension(0); ++i) { + // for (int j = 0; j < F.getDimension(1); ++j) { + // F.insert({i,j}, unif(gen)); + // } + // } + // std::cout << "packing F mat\n"; + // F.pack(); + + Tensor A({B.getDimension(0), ldim}, rm); + Tensor ref({B.getDimension(0), ldim}, rm); + Tensor refn({B.getDimension(0), ldim}, rm); + IndexVar i, j, k, l; + IndexVar i0, i1, j0, j1, k0, k1, l0, l1; + + A(i,l)=B(i,j)*C(j,k)*D(k,l); + // Record the expression that is actually benchmarked, together with the operand sizes. + if (statfile.is_open()) { + statfile + << "A(i,l)=B(i,j)*C(j,k)*D(k,l);" << std::endl + << "B1_dimension: " << B.getDimension(0) << ", B2_dimension: " << B.getDimension(1) << ", vals: " << B.getStorage().getValues().getSize() << std::endl + << "C1_dimension: " << C.getDimension(0) << ", C2_dimension: " << C.getDimension(1) << ", vals: " << C.getStorage().getValues().getSize() << std::endl + << "D1_dimension: " << D.getDimension(0) << ", D2_dimension: " << D.getDimension(1) << ", vals: " << D.getStorage().getValues().getSize() << std::endl + // << "E1_dimension: " << F.getDimension(0) << ", E2_dimension: " << F.getDimension(1) << ", vals: " << F.getStorage().getValues().getSize() << std::endl + << std::endl; + } + + // IndexStmt stmt = A.getAssignment().concretize(); + IndexStmt stmt = makeReductionNotation(A.getAssignment()); + stmt = makeConcreteNotation(stmt); + stmt = reorderLoopsTopologically(stmt); + stmt = loopFusionOverFission(stmt, A.getAssignment(), "b", 1); + stmt = stmt.split(i, i0, i1, 16); + stmt = insertTemporaries(stmt); + stmt = parallelizeOuterLoop(stmt); + + A.compile(stmt); + A.assemble(); + + + ref(i,l)=B(i,j)*C(j,k)*D(k,l); + refn(i,l)=B(i,j)*C(j,k)*D(k,l); + // IndexStmt refStmt = ref.getAssignment().concretize(); + + // ref1Stmt = ref1Stmt.split(i, i0, i1, 16); + // .pos(j, jpos, B(i,j)); + // .split(k, k0, k1, 8); + // .reorder({i0, i1, jpos0, k, jpos1}); + // .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces) + // .parallelize(jpos1, ParallelUnit::CPUVector, OutputRaceStrategy::ParallelReduction); + 
IndexStmt refStmt = makeReductionNotation(ref.getAssignment()); + refStmt = makeConcreteNotation(refStmt); + refStmt = refStmt + .split(i, i0, i1, 16) + .split(k, k0, k1, 32) + .split(l, l0, l1, 32) + .reorder({i0, i1, j, k0, l0, k1, l1}); + refStmt = insertTemporaries(refStmt); + refStmt = parallelizeOuterLoop(refStmt); + ref.compile(refStmt); + ref.assemble(); + + IndexStmt refnStmt = makeReductionNotation(refn.getAssignment()); + refnStmt = makeConcreteNotation(refnStmt); + refnStmt = refnStmt + .split(i, i0, i1, 16); + refnStmt = insertTemporaries(refnStmt); + refnStmt = parallelizeOuterLoop(refnStmt); + refn.compile(refnStmt); + refn.assemble(); + + // SpMM , GEMM + // ref1 = B * C (SpMM); ref2 and ref2_2 both compute ref1 * D (GeMM) under different schedules. + + Tensor ref1({B.getDimension(0), kdim}, rm); + Tensor ref2({B.getDimension(0), ldim}, rm); + Tensor ref2_2({B.getDimension(0), ldim}, rm); + + ref1(i,k)=B(i,j)*C(j,k); + ref2(i,l)=ref1(i,k)*D(k,l); + ref2_2(i,l)=ref1(i,k)*D(k,l); + + IndexStmt ref1Stmt = makeReductionNotation(ref1.getAssignment()); + ref1Stmt = makeConcreteNotation(ref1Stmt); + ref1Stmt = insertTemporaries(ref1Stmt); + ref1Stmt = parallelizeOuterLoop(ref1Stmt); + ref1.compile(ref1Stmt); + ref1.assemble(); + + IndexStmt ref2Stmt = makeReductionNotation(ref2.getAssignment()); + ref2Stmt = makeConcreteNotation(ref2Stmt); + ref2Stmt = insertTemporaries(ref2Stmt); + ref2Stmt = ref2Stmt.split(i, i0, i1, 16); + ref2Stmt = parallelizeOuterLoop(ref2Stmt); + ref2.compile(ref2Stmt); + ref2.assemble(); + + // NOTE(review): ref2Stmt2 only splits i, k and l, so the j0 handed to parallelize() below is + // not one of its loop variables; i0 may have been intended -- confirm before relying on this timing. + IndexStmt ref2Stmt2 = makeReductionNotation(ref2_2.getAssignment()); + ref2Stmt2 = makeConcreteNotation(ref2Stmt2); + ref2Stmt2 = ref2Stmt2 + .split(i, i0, i1, 32) + .split(k,k0,k1, 32) + .split(l, l0, l1, 32) + .reorder({i0, k0, l0, i1, k1, l1}) + .parallelize(j0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces); + ref2Stmt2 = insertTemporaries(ref2Stmt2); + // ref2Stmt2 = parallelizeOuterLoop(ref2Stmt2); + ref2_2.compile(ref2Stmt2); + ref2_2.assemble(); + + + // -------------- GeMM and SpMM + + Tensor ref3({C.getDimension(0), ldim}, 
rm); + // ref4 is indexed by (i,l), so its first dimension has to follow B's rows rather than C's rows (= B's columns). + Tensor ref4({B.getDimension(0), ldim}, rm); + ref3(j,l)=C(j,k)*D(k,l); // GEMM + ref4(i,l) = B(i,j)*ref3(j,l); // SpMM + + IndexStmt ref3Stmt = ref3.getAssignment().concretize(); + ref3Stmt = ref3Stmt + .split(j, j0, j1, 32) // changed to 32 + .split(k, k0, k1, 32) + .split(l, l0, l1, 32) + .reorder({j0, k0, l0, j1, k1, l1}) + .parallelize(j0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces); + // NOTE(review): the next statement rewrites ref2Stmt2 although this section schedules ref3Stmt; + // ref3Stmt may have been intended -- confirm. + ref2Stmt2 = insertTemporaries(ref2Stmt2); + ref3.compile(ref3Stmt); + ref3.assemble(); + + IndexStmt ref4Stmt = makeReductionNotation(ref4.getAssignment()); // SpMM operation + ref4Stmt = makeConcreteNotation(ref4Stmt); + ref4Stmt = ref4Stmt.split(i, i0, i1, 16); + ref4Stmt = insertTemporaries(ref4Stmt); + ref4Stmt = parallelizeOuterLoop(ref4Stmt); + ref4.compile(ref4Stmt); + ref4.assemble(); + + + std::cout << "compute start\n"; + taco::util::TimeResults timevalue; + bool time = true; + + statfile << "\n--------- 1st pattern computation TTM, GEMM\n"; + + TOOL_BENCHMARK_TIMER(ref1.compute(statfile), "\n\nSpMM Kernel: ", timevalue); + if (statfile.is_open()) { + statfile << "SpMM time: "; + statfile << timevalue.mean << std::endl; + } else { std::cout << " stat file is not open\n"; } + + std::string sofile_spmm_template = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/csr_dense_spmm.so"; + TOOL_BENCHMARK_TIMER(ref1.compute(statfile, sofile_spmm_template), "\n\nSpMM template Kernel: ", timevalue); + if (statfile.is_open()) { + statfile << "SpMM template time: "; + statfile << timevalue.mean << std::endl; + } else { std::cout << " stat file is not open\n"; } + + TOOL_BENCHMARK_TIMER(ref2.compute(statfile), "\n\nGeMM Kernel: ", timevalue); + if (statfile.is_open()) { + statfile << "GeMM time: "; + statfile << timevalue.mean << std::endl; + } else { std::cout << " stat file is not open\n"; } + + TOOL_BENCHMARK_TIMER(ref2_2.compute(statfile), "\n\nref GeMM template Kernel: ", timevalue); + if (statfile.is_open()) { + statfile << "ref 2 GeMM 
template time: "; + statfile << timevalue.mean << std::endl; + } else { std::cout << " stat file is not open\n"; } + + // std::string sofile_gemm_template = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/spmm_gemm/spmm_template.so"; + statfile << "\n--------- 2nd pattern computation GEMM, SpMM\n"; + TOOL_BENCHMARK_TIMER(ref3.compute(statfile), "\n\nGeMM template ref3 Kernel: ", timevalue); + if (statfile.is_open()) { + statfile << "ref3 GeMM template time: "; + statfile << timevalue.mean << std::endl; + } else { std::cout << " stat file is not open\n"; } + + TOOL_BENCHMARK_TIMER(ref4.compute(statfile, sofile_spmm_template), "\n\nSpMM template Kernel ref4: ", timevalue); + if (statfile.is_open()) { + statfile << "SpMM template time ref4: "; + statfile << timevalue.mean << std::endl; + } else { std::cout << " stat file is not open\n"; } + + + statfile << "\n-------- reference pattern computation\n"; + + TOOL_BENCHMARK_TIMER(ref.compute(statfile), "\n\nReference Kernel: ", timevalue); + if (statfile.is_open()) { + statfile << "taco reference time: "; + statfile << timevalue << std::endl; + } else { std::cout << " stat file is not open\n"; } + + TOOL_BENCHMARK_TIMER(refn.compute(statfile), "\n\nReference new Kernel: ", timevalue); + if (statfile.is_open()) { + statfile << "taco reference new time: "; + statfile << timevalue << std::endl; + } else { std::cout << " stat file is not open\n"; } + + + TOOL_BENCHMARK_TIMER(A.compute(statfile), "\n\nFused Kernel: ", timevalue); + if (statfile.is_open()) { + statfile << "fused time: "; + statfile << timevalue.mean << std::endl; + } else { std::cout << " stat file is not open\n"; } + + + double* A_vals = (double*) (A.getTacoTensorT()->vals); + double* ref_vals = (double*) (ref.getTacoTensorT()->vals); + double* ref2_vals = (double*) (ref2.getTacoTensorT()->vals); + double* ref4_vals = (double*) (ref2.getTacoTensorT()->vals); + + // int* A2_pos = (double*) (ref.getTacoTensorT()->vals); + + // for (size_t q=0; q < 
B.getStorage().getValues().getSize(); q++) { + // if ( abs(A_vals[q] - ref_vals[q])/abs(ref_vals[q]) > ERROR_MARGIN) { + // std::cout << "error: results don't match i: " << q << ", avals: " << A_vals[q] << " " + // << "refvals: " << ref_vals[q] << std::endl; + // ASSERT_TRUE(false); + // } + // } + + for (int q=0; q < A.getDimension(0)* A.getDimension(1); q++) { + if ( abs(A_vals[q] - ref_vals[q])/abs(ref_vals[q]) > ERROR_MARGIN) { + std::cout << "error: results don't match i: " << q << ", avals: " << A_vals[q] << " " + << "refvals: " << ref_vals[q] << std::endl; + ASSERT_TRUE(false); + } + } + for (int q=0; q < A.getDimension(0)* A.getDimension(1); q++) { + if ( abs(A_vals[q] - ref2_vals[q])/abs(ref2_vals[q]) > ERROR_MARGIN) { + std::cout << "error: results don't match i: " << q << ", avals: " << A_vals[q] << " " + << "refvals: " << ref2_vals[q] << std::endl; + ASSERT_TRUE(false); + } + } + for (int q=0; q < A.getDimension(0)* A.getDimension(1); q++) { + if ( abs(A_vals[q] - ref4_vals[q])/abs(ref4_vals[q]) > ERROR_MARGIN) { + std::cout << "error: results don't match i: " << q << ", avals: " << A_vals[q] << " " + << "refvals: " << ref4_vals[q] << std::endl; + ASSERT_TRUE(false); + } + } + + } // end of file num for loop + + if (statfile.is_open()) { + statfile.close(); + } + +} + + + + + + +TEST(scheduling_eval, sddmmspmmFused) { + if (should_use_CUDA_codegen()) { + return; + } + + taco_set_num_threads(NUM_THREADS_TO_USE); + + ofstream statfile; + statfile.open( + "/home/min/a/kadhitha/workspace/my_taco/taco/test/stats/sddmm-spmm-gemm.txt", std::ios::app); + if (statfile.is_open()) { + statfile << "\nsddmm-spmm-gemm execution\n"; + statfile << "\n-----------------------------------------\n"; + } + + std::default_random_engine gen(0); + std::uniform_real_distribution unif(0.0, 1.0); + + Format csr({dense, sparse}); + Format rm({dense, dense}); + + int kdim = 64; + int ldim = 64; + int mdim = 64; + + // vector filenums{2, 3,4,5,6,7,8,9,10,12,15}; + vector 
filenums{0}; + + for (auto filenum : filenums) { + + + std::vector matfiles = { + "/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx", + "/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/amazon.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/synthetic/synthetic.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rma10/rma10.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/cant/cant.mtx", // 5 + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/consph/consph.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/shipsec1/shipsec1.mtx", // 8 + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/scircuit/scircuit.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx", // 10 + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/wtk/pwtk.mtx", + "/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx", // 12 + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/wiki-Talk/wiki-Talk.mtx", // 13 + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/com-Orkut/com-Orkut.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx", // 15 + "/home/min/a/kadhitha/workspace/my_taco/FusedMM/dataset/harvard.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/twitter7/twitter7.mtx" + }; + std::vector matfilesrw = { + "/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/rw/cora.mtx", + "/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/rw/amazon.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/synthetic.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/cage3.mtx", + 
"/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/bcsstk17.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/pdb1HYS.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/rma10.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/cant.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/consph.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/cop20k_A.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/shipsec1.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/scircuit.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/wtk/pwtk.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/webbase-1M.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/wiki-Talk.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/com-Orkut.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/circuit5M.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/harvard.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/twitter7.mtx" + }; + + std::string matfile = matfiles[filenum]; + std::cout << "reading B mat mtx\n"; + Tensor B = read(matfile, csr, true); + B.setName("B"); + B.pack(); + // write(matfilesrw[filenum], B); + + if (statfile.is_open()) { + statfile << matfile << std::endl; + } + + std::cout << "B dim0: " << B.getDimension(0) << ", dim1: " << B.getDimension(1) << std::endl; + std::cout << "adding c mat\n"; + Tensor C({B.getDimension(0), kdim}, rm); + for (int i = 0; i < C.getDimension(0); ++i) { + for (int j = 0; j < C.getDimension(1); ++j) { + C.insert({i,j}, unif(gen)); + } + } + std::cout << "packing C mat\n"; + C.pack(); + + Tensor D({B.getDimension(1), kdim}, rm); + for (int i = 0; i < D.getDimension(0); ++i) { + for (int j = 0; j < D.getDimension(1); ++j) { + D.insert({i,j}, unif(gen)); + } + } + std::cout << "packing D 
mat\n"; + D.pack(); + + Tensor F({B.getDimension(1), ldim}, rm); // dense operand consumed by the SpMM stage (ref2 below) + for (int i = 0; i < F.getDimension(0); ++i) { + for (int j = 0; j < F.getDimension(1); ++j) { + F.insert({i,j}, unif(gen)); + } + } + std::cout << "packing F mat\n"; + F.pack(); + + Tensor G({ldim, mdim}, rm); // dense operand consumed by the GeMM stage (ref3 below) + for (int i = 0; i < G.getDimension(0); ++i) { + for (int j = 0; j < G.getDimension(1); ++j) { + G.insert({i,j}, unif(gen)); + } + } + std::cout << "packing G mat\n"; + G.pack(); + + Tensor A({B.getDimension(0), mdim}, rm); + Tensor ref({B.getDimension(0), mdim}, rm); + IndexVar i, j, k, l, m; + IndexVar i0("i0"), i1("i1"), jpos("jpos"), jpos0("jpos0"), jpos1("jpos1"), k0("k0"), k1("k1"); + IndexVar l0("l0"), l1("l1"), m0("m0"), m1("m1"); + + A(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); // fused expression; ref1/ref2/ref3 below compute the same chain stage by stage + + if (statfile.is_open()) { + statfile + << "ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m);" << std::endl + << "B1_dimension: " << B.getDimension(0) << ", B2_dimension: " << B.getDimension(1) << ", vals: " << B.getStorage().getValues().getSize() << std::endl + << "C1_dimension: " << C.getDimension(0) << ", C2_dimension: " << C.getDimension(1) << ", vals: " << C.getStorage().getValues().getSize() << std::endl + << "D1_dimension: " << D.getDimension(0) << ", D2_dimension: " << D.getDimension(1) << ", vals: " << D.getStorage().getValues().getSize() << std::endl + << "F1_dimension: " << F.getDimension(0) << ", F2_dimension: " << F.getDimension(1) << ", vals: " << F.getStorage().getValues().getSize() << std::endl + << "G1_dimension: " << G.getDimension(0) << ", G2_dimension: " << G.getDimension(1) << ", vals: " << G.getStorage().getValues().getSize() << std::endl + << std::endl; + } + + // IndexStmt stmt = A.getAssignment().concretize(); + IndexStmt stmt = makeReductionNotation(A.getAssignment()); + stmt = makeConcreteNotation(stmt); + stmt = reorderLoopsTopologically(stmt); + stmt = loopFusionOverFission(stmt, A.getAssignment(), "b", 2); + stmt = stmt.split(i, i0, i1, 16); + + stmt = insertTemporaries(stmt); + stmt 
= parallelizeOuterLoop(stmt); + printToFile("sddmmSpMMGeMM", stmt); + + A.compile(stmt); + A.assemble(); + + + ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); + IndexStmt refStmt = makeReductionNotation(ref.getAssignment()); + refStmt = makeConcreteNotation(refStmt); + refStmt = refStmt.split(i, i0, i1, 16); + refStmt = insertTemporaries(refStmt); + refStmt = parallelizeOuterLoop(refStmt); + ref.compile(refStmt); + ref.assemble(); + + Tensor ref1({B.getDimension(0), B.getDimension(1)}, csr); + Tensor ref2({B.getDimension(0), ldim}, rm); + Tensor ref3({B.getDimension(0), mdim}, rm); + ref1(i,j)=B(i,j)*C(i,k)*D(j,k); + ref2(i,l)=ref1(i,j)*F(j,l); + ref3(i,m)=ref2(i,l)*G(l,m); + + IndexStmt ref1Stmt = ref1.getAssignment().concretize(); + + ref1Stmt = ref1Stmt.split(i, i0, i1, 16); + // // .pos(j, jpos, B(i,j)); + // // .split(k, k0, k1, 8); + // // .reorder({i0, i1, jpos0, k, jpos1}); + // // .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces) + // // .parallelize(jpos1, ParallelUnit::CPUVector, OutputRaceStrategy::ParallelReduction); + // // ref1Stmt.split(i, ); + // // stmt = scheduleSDDMMCPU_forfuse(ref1Stmt, B); + // IndexStmt ref1Stmt = makeReductionNotation(ref1.getAssignment()); + // ref1Stmt = makeConcreteNotation(ref1Stmt); + ref1Stmt = insertTemporaries(ref1Stmt); + ref1Stmt = parallelizeOuterLoop(ref1Stmt); + ref1.compile(ref1Stmt); + ref1.assemble(); + + IndexStmt ref2Stmt = makeReductionNotation(ref2.getAssignment()); + ref2Stmt = makeConcreteNotation(ref2Stmt); + ref2Stmt = insertTemporaries(ref2Stmt); + ref2Stmt = parallelizeOuterLoop(ref2Stmt); + ref2.compile(ref2Stmt); + ref2.assemble(); + + // ref3(i,m)=ref2(i,l)*G(l,m); + IndexStmt ref3Stmt = makeReductionNotation(ref3.getAssignment()); + ref3Stmt = makeConcreteNotation(ref3Stmt); + ref3Stmt = ref3Stmt + .split(i, i0, i1, 32) + .split(l, l0, l1, 32) + .split(m, m0, m1, 32) + .reorder({i0, l0, m0, i1, l1, m1}); + ref3Stmt = insertTemporaries(ref3Stmt); + ref3Stmt = 
parallelizeOuterLoop(ref3Stmt); + ref3.compile(ref3Stmt); + ref3.assemble(); + + std::cout << "compute start\n"; + taco::util::TimeResults timevalue; + bool time = true; + + // std::string sofile_fused = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/fused_kernel.so"; + TOOL_BENCHMARK_TIMER(A.compute(statfile), "\n\nFused Kernel: ", timevalue); + if (statfile.is_open()) { + statfile << "fused time: "; + statfile << timevalue.mean << std::endl; + } else { std::cout << " stat file is not open\n"; } + + // std::string sofile_sddmm = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/csr_dense_spmm.so"; + std::string sofile_sddmm = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/csr_dense_dense_sddmm.so"; + TOOL_BENCHMARK_TIMER(ref1.compute(statfile, sofile_sddmm), "\n\nSDDMM Kernel: ", timevalue); + if (statfile.is_open()) { + statfile << "sddmm time: "; + statfile << timevalue.mean << std::endl; + } else { std::cout << " stat file is not open\n"; } + + std::string sofile_sddmm_ryan = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/sddmm_ryan.so"; + TOOL_BENCHMARK_TIMER(ref1.compute(statfile, sofile_sddmm_ryan), "\n\nSDDMM ryan Kernel: ", timevalue); + if (statfile.is_open()) { + statfile << "sddmm ryan time: "; + statfile << timevalue.mean << std::endl; + } else { std::cout << " stat file is not open\n"; } + + std::string sofile_spmm = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/csr_dense_spmm.so"; + TOOL_BENCHMARK_TIMER(ref2.compute(statfile, sofile_spmm), "\n\nSpMM ryan Kernel: ", timevalue); + if (statfile.is_open()) { + statfile << "spmm ryan time: "; + statfile << timevalue.mean << std::endl; + } else { std::cout << " stat file is not open\n"; } + + // std::string sofile_spmm = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/csr_dense_spmm.so"; + TOOL_BENCHMARK_TIMER(ref3.compute(statfile), "\n\nGeMM Kernel: ", timevalue); + if 
(statfile.is_open()) { + statfile << "gemm time: "; + statfile << timevalue.mean << std::endl; + } else { std::cout << " stat file is not open\n"; } + + // std::string sofile_original = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/taco_original.so"; + TOOL_BENCHMARK_TIMER(ref.compute(statfile), "\n\nReference Kernel: ", timevalue); + if (statfile.is_open()) { + statfile << "taco reference time: "; + statfile << timevalue << std::endl; + } else { std::cout << " stat file is not open\n"; } + + double* A_vals = (double*) (A.getTacoTensorT()->vals); + double* ref_vals = (double*) (ref.getTacoTensorT()->vals); + double* ref3_vals = (double*) (ref3.getTacoTensorT()->vals); + + // int* A2_pos = (double*) (ref.getTacoTensorT()->vals); + + for (int q=0; q < A.getDimension(0)* A.getDimension(1); q++) { + if ( abs(A_vals[q] - ref_vals[q])/abs(ref_vals[q]) > ERROR_MARGIN) { + std::cout << "error: results don't match i: " << q << ", avals: " << A_vals[q] << " " + << "refvals: " << ref_vals[q] << std::endl; + ASSERT_TRUE(false); + } + } + + for (int q=0; q < A.getDimension(0)* A.getDimension(1); q++) { + if ( abs(ref3_vals[q] - ref_vals[q])/abs(ref_vals[q]) > ERROR_MARGIN) { + std::cout << "error: results don't match i: " << q << ", avals: " << ref3_vals[q] << " " + << "refvals: " << ref_vals[q] << std::endl; + ASSERT_TRUE(false); + } + } + + + + } + + // int filenum = 3; + + + // for (size_t q=0; q < A.getDimension(0)* A.getDimension(1); q++) { + // if ( abs(A_vals[q] - ref3_vals[q])/abs(ref3_vals[q]) > ERROR_MARGIN) { + // std::cout << "error: results don't match i: " << q << ", avals: " << A_vals[q] << " " + // << "refvals: " << ref3_vals[q] << std::endl; + // ASSERT_TRUE(false); + // } + // } + // for (int q= 0; q< A_vals + // for (int q = 0; q < A.getDimension(0); ++q) { + // for (int w = 0; w < A.getDimension(1); ++w) { + // if ( abs(A(q,w) - ref(q,w))/abs(ref(q,w)) > ERROR_MARGIN) { + // std::cout << "error: results don't match A("<< q << "," << w 
<< "): " + // << A(q,w) << ", ref: " << ref(q,w) << std::endl; + // ASSERT_TRUE(false); + // } + // } + // } + // ASSERT_TENSOR_EQ(A, ref); + + if (statfile.is_open()) { + statfile.close(); + } + +} \ No newline at end of file diff --git a/test/tests-transformation.cpp b/test/tests-transformation.cpp index abfec3d45..9a472906f 100644 --- a/test/tests-transformation.cpp +++ b/test/tests-transformation.cpp @@ -255,6 +255,8 @@ INSTANTIATE_TEST_CASE_P(parallelize, apply, struct reorderLoopsTopologically : public TestWithParam {}; + +// TEST_P(reorderLoopsTopologically, test) { IndexStmt actual = taco::reorderLoopsTopologically(GetParam().actual); ASSERT_NOTATION_EQ(GetParam().expected, actual); diff --git a/test/util.h b/test/util.h new file mode 100644 index 000000000..0f8b633e6 --- /dev/null +++ b/test/util.h @@ -0,0 +1,86 @@ +#ifndef __SCHEDULE_UTIL_HH__ +#define __SCHEDULE_UTIL_HH__ + +#include +#include +#include +#include +#include +#include +#include +#include "taco/cuda.h" +#include "test.h" +#include "test_tensors.h" +#include "taco/tensor.h" +#include "taco/index_notation/index_notation.h" +#include "taco/index_notation/transformations.h" +#include "codegen/codegen.h" +#include "taco/lower/lower.h" +#include "taco/util/timers.h" + +using namespace taco; + +#define ERROR_MARGIN (1.0e-2) + +#define TOOL_BENCHMARK_TIMER(CODE,NAME,TIMER) { \ + if (time) { \ + taco::util::Timer timer; \ + timer.start(); \ + CODE; \ + timer.stop(); \ + taco::util::TimeResults result = timer.getResult(); \ + cout << NAME << " " << result << " ms" << endl; \ + TIMER=result; \ + } \ + else { \ + CODE; \ + } \ +} + +#define TOOL_BENCHMARK_TIMER2(CODE,NAME,TIMER) { \ + if (time) { \ + taco::util::Timer timer; \ + timer.start(); \ + CODE; \ + timer.stop(); \ + taco::util::TimeResults result = timer.getResult(); \ + if (statfile.is_open()) { \ + statfile << NAME << " " << result << " ms" << endl; \ + } else { \ + cout << NAME << " " << result << " ms" << endl; \ + } \ + TIMER=result; \ + 
} \ + else { \ + CODE; \ + } \ +} + +static void printToCout(IndexStmt stmt); +static void printToFile(string filename, IndexStmt stmt); + + +static void printToCout(IndexStmt stmt) { + std::shared_ptr codegen = ir::CodeGen::init_default(cout, ir::CodeGen::ImplementationGen); + ir::Stmt compute = lower(stmt, "compute", false, true); + codegen->compile(compute, true); +} + +void printToFile(string filename, IndexStmt stmt) { + stringstream source; + + string file_path = "eval_generated/"; + mkdir(file_path.c_str(), 0777); + + std::shared_ptr codegen = ir::CodeGen::init_default(source, ir::CodeGen::ImplementationGen); + ir::Stmt compute = lower(stmt, "compute", false, true); + codegen->compile(compute, true); + + ofstream source_file; + string file_ending = should_use_CUDA_codegen() ? ".cu" : ".c"; + source_file.open(file_path + filename + file_ending); + source_file << source.str(); + source_file.close(); +} + +#endif // __SCHEDULE_UTIL_HH__ \ No newline at end of file diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt index 922f7e52e..41699d3fd 100644 --- a/tools/CMakeLists.txt +++ b/tools/CMakeLists.txt @@ -4,6 +4,7 @@ foreach(TOOL_SOURCE ${TOOL_SOURCES}) get_filename_component(TOOL ${TOOL_SOURCE} NAME_WE) add_executable("${TOOL}-tool" ${TOOL_SOURCE}) target_link_libraries("${TOOL}-tool" taco) + target_link_libraries("${TOOL}-tool" papi) target_include_directories("${TOOL}-tool" PRIVATE "${CMAKE_BINARY_DIR}/include") SET_TARGET_PROPERTIES("${TOOL}-tool" PROPERTIES OUTPUT_NAME ${TOOL}) install(TARGETS "${TOOL}-tool" DESTINATION bin) diff --git a/tools/taco.cpp b/tools/taco.cpp index cd351a203..1c22fc368 100644 --- a/tools/taco.cpp +++ b/tools/taco.cpp @@ -9,6 +9,7 @@ #include "taco.h" #include "taco/error.h" +#include "taco/index_notation/index_notation.h" #include "taco/parser/lexer.h" #include "taco/parser/parser.h" #include "taco/parser/schedule_parser.h" @@ -308,7 +309,9 @@ static void printCommandLine(ostream& os, int argc, char* argv[]) { } } -static 
bool setSchedulingCommands(vector> scheduleCommands, parser::Parser& parser, IndexStmt& stmt) { +static int setSchedulingCommands(vector> scheduleCommands, + parser::Parser& parser, IndexStmt& stmt, Assignment assignment) { + auto findVar = [&stmt](string name) { ProvenanceGraph graph(stmt); for (auto v : graph.getAllIndexVars()) { @@ -352,6 +355,16 @@ static bool setSchedulingCommands(vector> scheduleCommands, parse IndexVar fused(f); stmt = stmt.fuse(findVar(i), findVar(j), fused); + } else if (command == "loopfuse") { + taco_uassert(scheduleCommand.size() == 2) + << "'loopfuse' scheduling directive takes 2 parameters: loopfuse(b, 2)"; + std::string side = scheduleCommand[0]; + taco_uassert(side == "b" || side == "f") + << "first parameter must be either 'f' or 'b'"; + + int iters = std::stoi(scheduleCommand[1]); + + stmt = loopFusionOverFission(stmt, assignment, side, iters); } else if (command == "split") { taco_uassert(scheduleCommand.size() == 4) << "'split' scheduling directive takes 4 parameters: split(i, i1, i2, splitFactor)"; @@ -536,7 +549,8 @@ static bool setSchedulingCommands(vector> scheduleCommands, parse parallel_unit = ParallelUnit::CPUThread; } else if (unit == "CPUVector") { parallel_unit = ParallelUnit::CPUVector; - } else { + } + else { taco_uerror << "Parallel hardware not defined."; goto end; } @@ -1009,9 +1023,11 @@ int main(int argc, char* argv[]) { } // pre-parse expression, to determine existence and order of loaded tensors + std::cout << "pre-parse expression, to determine existence and order of loaded tensors\n"; map loadedTensors; TensorBase temp_tensor; parser::Parser temp_parser(exprStr, formats, dataTypes, tensorsDimensions, loadedTensors, 42); + std::cout << exprStr << std::endl; try { temp_parser.parse(); temp_tensor = temp_parser.getResultTensor(); @@ -1112,17 +1128,29 @@ int main(int argc, char* argv[]) { taco_set_parallel_schedule(sched, chunkSize); taco_set_num_threads(nthreads); - IndexStmt stmt = - 
makeConcreteNotation(makeReductionNotation(tensor.getAssignment())); + Assignment assignment = tensor.getAssignment(); + std::cout << "tensor.getAssignment(): " << assignment << std::endl; + + IndexStmt stmt2 = makeReductionNotation(tensor.getAssignment()); + std::cout << "reducedNotation: " << stmt2 << std::endl; + // IndexStmt stmt = + // makeConcreteNotation(makeReductionNotation(tensor.getAssignment())); + IndexStmt stmt = makeConcreteNotation(stmt2); + std::cout << "concrete index statement: " << stmt << std::endl; stmt = reorderLoopsTopologically(stmt); + std::cout << "topologically reordered loops statement: " << stmt << std::endl; + if (setSchedule) { - cuda |= setSchedulingCommands(scheduleCommands, parser, stmt); + cuda |= setSchedulingCommands(scheduleCommands, parser, stmt, tensor.getAssignment()); } else { + // stmt = loopFusionOverFission(stmt, tensor.getAssignment()); stmt = insertTemporaries(stmt); stmt = parallelizeOuterLoop(stmt); } + std::cout << "after setting the scheduling commands\n"; + std::cout << stmt << std::endl; if (cuda) { if (!CUDA_BUILT && benchmark) { @@ -1134,7 +1162,10 @@ int main(int argc, char* argv[]) { set_CUDA_codegen_enabled(false); } + std::cout << "running scalar promote\n" << std::endl; // stmt = scalarPromote(stmt); + std::cout << "\nafter scalar promote: \n" << stmt << std::endl << std::endl; + if (printConcrete) { cout << stmt << endl; }