From 2edd480ba9eed293e0d9c9eab8ee1bb3792286ea Mon Sep 17 00:00:00 2001 From: Adhhitha Dias Date: Mon, 28 Jun 2021 16:30:46 -0400 Subject: [PATCH 01/10] initial commit of ispc codegen files --- src/codegen/codegen_ispc.cpp | 606 +++++++++++++++++++++++++++++++++++ src/codegen/codegen_ispc.h | 63 ++++ 2 files changed, 669 insertions(+) create mode 100644 src/codegen/codegen_ispc.cpp create mode 100644 src/codegen/codegen_ispc.h diff --git a/src/codegen/codegen_ispc.cpp b/src/codegen/codegen_ispc.cpp new file mode 100644 index 000000000..4b0e82903 --- /dev/null +++ b/src/codegen/codegen_ispc.cpp @@ -0,0 +1,606 @@ +#include +#include +#include +#include +#include +#include + +#include "taco/ir/ir_visitor.h" +#include "codegen_ispc.h" +#include "taco/error.h" +#include "taco/util/strings.h" +#include "taco/util/collections.h" + +using namespace std; + +namespace taco { +namespace ir { + +// Some helper functions +namespace { + +// Include stdio.h for printf +// stdlib.h for malloc/realloc +// math.h for sqrt +// MIN preprocessor macro +// This *must* be kept in sync with taco_tensor_t.h +const string cHeaders = + "#ifndef TACO_C_HEADERS\n" + "#define TACO_C_HEADERS\n" + "#include \n" + "#include \n" + "#include \n" + "#include \n" + "#include \n" + "#include \n" + "#include \n" + "#if _OPENMP\n" + "#include \n" + "#endif\n" + "#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b))\n" + "#define TACO_MAX(_a,_b) ((_a) > (_b) ? 
(_a) : (_b))\n" + "#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a)\n" + "#ifndef TACO_TENSOR_T_DEFINED\n" + "#define TACO_TENSOR_T_DEFINED\n" + "typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t;\n" + "typedef struct {\n" + " int32_t order; // tensor order (number of modes)\n" + " int32_t* dimensions; // tensor dimensions\n" + " int32_t csize; // component size\n" + " int32_t* mode_ordering; // mode storage ordering\n" + " taco_mode_t* mode_types; // mode storage types\n" + " uint8_t*** indices; // tensor index data (per mode)\n" + " uint8_t* vals; // tensor values\n" + " int32_t vals_size; // values array size\n" + "} taco_tensor_t;\n" + "#endif\n" + "#if !_OPENMP\n" + "int omp_get_thread_num() { return 0; }\n" + "int omp_get_max_threads() { return 1; }\n" + "#endif\n" + "int cmp(const void *a, const void *b) {\n" + " return *((const int*)a) - *((const int*)b);\n" + "}\n" + "int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) {\n" + " if (array[arrayStart] >= target) {\n" + " return arrayStart;\n" + " }\n" + " int lowerBound = arrayStart; // always < target\n" + " int upperBound = arrayEnd; // always >= target\n" + " while (upperBound - lowerBound > 1) {\n" + " int mid = (upperBound + lowerBound) / 2;\n" + " int midValue = array[mid];\n" + " if (midValue < target) {\n" + " lowerBound = mid;\n" + " }\n" + " else if (midValue > target) {\n" + " upperBound = mid;\n" + " }\n" + " else {\n" + " return mid;\n" + " }\n" + " }\n" + " return upperBound;\n" + "}\n" + "int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) {\n" + " if (array[arrayEnd] <= target) {\n" + " return arrayEnd;\n" + " }\n" + " int lowerBound = arrayStart; // always <= target\n" + " int upperBound = arrayEnd; // always > target\n" + " while (upperBound - lowerBound > 1) {\n" + " int mid = (upperBound + lowerBound) / 2;\n" + " int midValue = array[mid];\n" + " if (midValue < target) {\n" + " lowerBound = mid;\n" + " }\n" + 
" else if (midValue > target) {\n" + " upperBound = mid;\n" + " }\n" + " else {\n" + " return mid;\n" + " }\n" + " }\n" + " return lowerBound;\n" + "}\n" + "taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize,\n" + " int32_t* dimensions, int32_t* mode_ordering,\n" + " taco_mode_t* mode_types) {\n" + " taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t));\n" + " t->order = order;\n" + " t->dimensions = (int32_t *) malloc(order * sizeof(int32_t));\n" + " t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t));\n" + " t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t));\n" + " t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***));\n" + " t->csize = csize;\n" + " for (int32_t i = 0; i < order; i++) {\n" + " t->dimensions[i] = dimensions[i];\n" + " t->mode_ordering[i] = mode_ordering[i];\n" + " t->mode_types[i] = mode_types[i];\n" + " switch (t->mode_types[i]) {\n" + " case taco_mode_dense:\n" + " t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **));\n" + " break;\n" + " case taco_mode_sparse:\n" + " t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **));\n" + " break;\n" + " }\n" + " }\n" + " return t;\n" + "}\n" + "void deinit_taco_tensor_t(taco_tensor_t* t) {\n" + " for (int i = 0; i < t->order; i++) {\n" + " free(t->indices[i]);\n" + " }\n" + " free(t->indices);\n" + " free(t->dimensions);\n" + " free(t->mode_ordering);\n" + " free(t->mode_types);\n" + " free(t);\n" + "}\n" + "#endif\n"; +} // anonymous namespace + +// find variables for generating declarations +// generates a single var for each GetProperty +class CodeGen_ISPC::FindVars : public IRVisitor { +public: + map varMap; + + // the variables for which we need to add declarations + map varDecls; + + vector localVars; + + // this maps from tensor, property, mode, index to the unique var + map, string> canonicalPropertyVar; + + // this is for convenience, recording just the properties unpacked + // from the output tensor so we can re-save 
them at the end + map, string> outputProperties; + + // TODO: should replace this with an unordered set + vector outputTensors; + vector inputTensors; + + CodeGen_ISPC *codeGen; + + // copy inputs and outputs into the map + FindVars(vector inputs, vector outputs, CodeGen_ISPC *codeGen) + : codeGen(codeGen) { + for (auto v: inputs) { + auto var = v.as(); + taco_iassert(var) << "Inputs must be vars in codegen"; + taco_iassert(varMap.count(var)==0) << "Duplicate input found in codegen"; + inputTensors.push_back(v); + varMap[var] = var->name; + } + for (auto v: outputs) { + auto var = v.as(); + taco_iassert(var) << "Outputs must be vars in codegen"; + taco_iassert(varMap.count(var)==0) << "Duplicate output found in codegen"; + outputTensors.push_back(v); + varMap[var] = var->name; + } + } + +protected: + using IRVisitor::visit; + + virtual void visit(const Var *op) { + if (varMap.count(op) == 0) { + varMap[op] = op->is_ptr? op->name : codeGen->genUniqueName(op->name); + } + } + + virtual void visit(const VarDecl *op) { + if (!util::contains(localVars, op->var)) { + localVars.push_back(op->var); + } + op->var.accept(this); + op->rhs.accept(this); + } + + virtual void visit(const For *op) { + if (!util::contains(localVars, op->var)) { + localVars.push_back(op->var); + } + op->var.accept(this); + op->start.accept(this); + op->end.accept(this); + op->increment.accept(this); + op->contents.accept(this); + } + + virtual void visit(const GetProperty *op) { + if (!util::contains(inputTensors, op->tensor) && + !util::contains(outputTensors, op->tensor)) { + // Don't create header unpacking code for temporaries + return; + } + + if (varMap.count(op) == 0) { + auto key = + tuple(op->tensor,op->property, + (size_t)op->mode, + (size_t)op->index); + if (canonicalPropertyVar.count(key) > 0) { + varMap[op] = canonicalPropertyVar[key]; + } else { + auto unique_name = codeGen->genUniqueName(op->name); + canonicalPropertyVar[key] = unique_name; + varMap[op] = unique_name; + varDecls[op] 
= unique_name; + if (util::contains(outputTensors, op->tensor)) { + outputProperties[key] = unique_name; + } + } + } + } +}; + +CodeGen_ISPC::CodeGen_ISPC(std::ostream &dest, OutputKind outputKind, bool simplify) + : CodeGen(dest, false, simplify, C), out(dest), outputKind(outputKind) {} + +CodeGen_ISPC::~CodeGen_ISPC() {} + +void CodeGen_ISPC::compile(Stmt stmt, bool isFirst) { + varMap = {}; + localVars = {}; + + if (isFirst) { + // output the headers + out << cHeaders; + } + out << endl; + // generate code for the Stmt + stmt.accept(this); +} + +void CodeGen_ISPC::visit(const Function* func) { + // if generating a header, protect the function declaration with a guard + if (outputKind == HeaderGen) { + out << "#ifndef TACO_GENERATED_" << func->name << "\n"; + out << "#define TACO_GENERATED_" << func->name << "\n"; + } + + int numYields = countYields(func); + emittingCoroutine = (numYields > 0); + funcName = func->name; + labelCount = 0; + + resetUniqueNameCounters(); + FindVars inputVarFinder(func->inputs, {}, this); + func->body.accept(&inputVarFinder); + FindVars outputVarFinder({}, func->outputs, this); + func->body.accept(&outputVarFinder); + + // output function declaration + doIndent(); + out << printFuncName(func, inputVarFinder.varDecls, outputVarFinder.varDecls); + + // if we're just generating a header, this is all we need to do + if (outputKind == HeaderGen) { + out << ";\n"; + out << "#endif\n"; + return; + } + + out << " {\n"; + + indent++; + + // find all the vars that are not inputs or outputs and declare them + resetUniqueNameCounters(); + FindVars varFinder(func->inputs, func->outputs, this); + func->body.accept(&varFinder); + varMap = varFinder.varMap; + localVars = varFinder.localVars; + + // Print variable declarations + out << printDecls(varFinder.varDecls, func->inputs, func->outputs) << endl; + + if (emittingCoroutine) { + out << printContextDeclAndInit(varMap, localVars, numYields, func->name) + << endl; + } + + // output body + 
print(func->body); + + // output repack only if we allocated memory + if (checkForAlloc(func)) + out << endl << printPack(varFinder.outputProperties, func->outputs); + + if (emittingCoroutine) { + out << printCoroutineFinish(numYields, funcName); + } + + doIndent(); + out << "return 0;\n"; + indent--; + + doIndent(); + out << "}\n"; +} + +void CodeGen_ISPC::visit(const VarDecl* op) { + if (emittingCoroutine) { + doIndent(); + op->var.accept(this); + parentPrecedence = Precedence::TOP; + stream << " = "; + op->rhs.accept(this); + stream << ";"; + stream << endl; + } else { + IRPrinter::visit(op); + } +} + +void CodeGen_ISPC::visit(const Yield* op) { + printYield(op, localVars, varMap, labelCount, funcName); +} + +// For Vars, we replace their names with the generated name, +// since we match by reference (not name) +void CodeGen_ISPC::visit(const Var* op) { + taco_iassert(varMap.count(op) > 0) << + "Var " << op->name << " not found in varMap"; + if (emittingCoroutine) { +// out << "TACO_DEREF("; + } + out << varMap[op]; + if (emittingCoroutine) { +// out << ")"; + } +} + +static string genVectorizePragma(int width) { + stringstream ret; + ret << "#pragma clang loop interleave(enable) "; + if (!width) + ret << "vectorize(enable)"; + else + ret << "vectorize_width(" << width << ")"; + + return ret.str(); +} + +static string getParallelizePragma(LoopKind kind) { + stringstream ret; + ret << "#pragma omp parallel for schedule"; + switch (kind) { + case LoopKind::Static: + ret << "(static, 1)"; + break; + case LoopKind::Dynamic: + ret << "(dynamic, 1)"; + break; + case LoopKind::Runtime: + ret << "(runtime)"; + break; + case LoopKind::Static_Chunked: + ret << "(static)"; + break; + default: + break; + } + return ret.str(); +} + +static string getUnrollPragma(size_t unrollFactor) { + return "#pragma unroll " + std::to_string(unrollFactor); +} + +static string getAtomicPragma() { + return "#pragma omp atomic"; +} + +// The next two need to output the correct pragmas 
depending +// on the loop kind (Serial, Static, Dynamic, Vectorized) +// +// Docs for vectorization pragmas: +// http://clang.llvm.org/docs/LanguageExtensions.html#extensions-for-loop-hint-optimizations +void CodeGen_ISPC::visit(const For* op) { + switch (op->kind) { + case LoopKind::Vectorized: + doIndent(); + out << genVectorizePragma(op->vec_width); + out << "\n"; + break; + case LoopKind::Static: + case LoopKind::Dynamic: + case LoopKind::Runtime: + case LoopKind::Static_Chunked: + doIndent(); + out << getParallelizePragma(op->kind); + out << "\n"; + break; + default: + if (op->unrollFactor > 0) { + doIndent(); + out << getUnrollPragma(op->unrollFactor) << endl; + } + break; + } + + doIndent(); + stream << keywordString("for") << " ("; + if (!emittingCoroutine) { + stream << keywordString(util::toString(op->var.type())) << " "; + } + op->var.accept(this); + stream << " = "; + op->start.accept(this); + stream << keywordString("; "); + op->var.accept(this); + stream << " < "; + parentPrecedence = BOTTOM; + op->end.accept(this); + stream << keywordString("; "); + op->var.accept(this); + + auto lit = op->increment.as(); + if (lit != nullptr && ((lit->type.isInt() && lit->equalsScalar(1)) || + (lit->type.isUInt() && lit->equalsScalar(1)))) { + stream << "++"; + } + else { + stream << " += "; + op->increment.accept(this); + } + stream << ") {\n"; + + op->contents.accept(this); + doIndent(); + stream << "}"; + stream << endl; +} + +void CodeGen_ISPC::visit(const While* op) { + // it's not clear from documentation that clang will vectorize + // while loops + // however, we'll output the pragmas anyway + if (op->kind == LoopKind::Vectorized) { + doIndent(); + out << genVectorizePragma(op->vec_width); + out << "\n"; + } + + IRPrinter::visit(op); +} + +void CodeGen_ISPC::visit(const GetProperty* op) { + taco_iassert(varMap.count(op) > 0) << + "Property " << Expr(op) << " of " << op->tensor << " not found in varMap"; + out << varMap[op]; +} + +void 
CodeGen_ISPC::visit(const Min* op) { + if (op->operands.size() == 1) { + op->operands[0].accept(this); + return; + } + for (size_t i=0; ioperands.size()-1; i++) { + stream << "TACO_MIN("; + op->operands[i].accept(this); + stream << ","; + } + op->operands.back().accept(this); + for (size_t i=0; ioperands.size()-1; i++) { + stream << ")"; + } +} + +void CodeGen_ISPC::visit(const Max* op) { + if (op->operands.size() == 1) { + op->operands[0].accept(this); + return; + } + for (size_t i=0; ioperands.size()-1; i++) { + stream << "TACO_MAX("; + op->operands[i].accept(this); + stream << ","; + } + op->operands.back().accept(this); + for (size_t i=0; ioperands.size()-1; i++) { + stream << ")"; + } +} + +void CodeGen_ISPC::visit(const Allocate* op) { + string elementType = printCType(op->var.type(), false); + + doIndent(); + op->var.accept(this); + stream << " = ("; + stream << elementType << "*"; + stream << ")"; + if (op->is_realloc) { + stream << "realloc("; + op->var.accept(this); + stream << ", "; + } + else { + // If the allocation was requested to clear the allocated memory, + // use calloc instead of malloc. 
+ if (op->clear) { + stream << "calloc(1, "; + } else { + stream << "malloc("; + } + } + stream << "sizeof(" << elementType << ")"; + stream << " * "; + parentPrecedence = MUL; + op->num_elements.accept(this); + parentPrecedence = TOP; + stream << ");"; + stream << endl; +} + +void CodeGen_ISPC::visit(const Sqrt* op) { + taco_tassert(op->type.isFloat() && op->type.getNumBits() == 64) << + "Codegen doesn't currently support non-double sqrt"; + stream << "sqrt("; + op->a.accept(this); + stream << ")"; +} + +void CodeGen_ISPC::visit(const Assign* op) { + if (op->use_atomics) { + doIndent(); + stream << getAtomicPragma() << endl; + } + IRPrinter::visit(op); +} + +void CodeGen_ISPC::visit(const Store* op) { + if (op->use_atomics) { + doIndent(); + stream << getAtomicPragma() << endl; + } + IRPrinter::visit(op); +} + +void CodeGen_ISPC::generateShim(const Stmt& func, stringstream &ret) { + const Function *funcPtr = func.as(); + + ret << "int _shim_" << funcPtr->name << "(void** parameterPack) {\n"; + ret << " return " << funcPtr->name << "("; + + size_t i=0; + string delimiter = ""; + + const auto returnType = funcPtr->getReturnType(); + if (returnType.second != Datatype()) { + ret << "(void**)(parameterPack[0]), "; + ret << "(char*)(parameterPack[1]), "; + ret << "(" << returnType.second << "*)(parameterPack[2]), "; + ret << "(int32_t*)(parameterPack[3])"; + + i = 4; + delimiter = ", "; + } + + for (auto output : funcPtr->outputs) { + auto var = output.as(); + auto cast_type = var->is_tensor ? "taco_tensor_t*" + : printCType(var->type, var->is_ptr); + + ret << delimiter << "(" << cast_type << ")(parameterPack[" << i++ << "])"; + delimiter = ", "; + } + for (auto input : funcPtr->inputs) { + auto var = input.as(); + auto cast_type = var->is_tensor ? 
"taco_tensor_t*" + : printCType(var->type, var->is_ptr); + ret << delimiter << "(" << cast_type << ")(parameterPack[" << i++ << "])"; + delimiter = ", "; + } + ret << ");\n"; + ret << "}\n"; +} +} +} diff --git a/src/codegen/codegen_ispc.h b/src/codegen/codegen_ispc.h new file mode 100644 index 000000000..e3c87ece5 --- /dev/null +++ b/src/codegen/codegen_ispc.h @@ -0,0 +1,63 @@ +#ifndef TACO_BACKEND_C_H +#define TACO_BACKEND_C_H +#include +#include + +#include "taco/ir/ir.h" +#include "taco/ir/ir_printer.h" +#include "codegen.h" + +namespace taco { +namespace ir { + + +class CodeGen_ISPC : public CodeGen { +public: + /// Initialize a code generator that generates code to an + /// output stream. + CodeGen_ISPC(std::ostream &dest, OutputKind outputKind, bool simplify=true); + ~CodeGen_ISPC(); + + /// Compile a lowered function + void compile(Stmt stmt, bool isFirst=false); + + /// Generate shims that unpack an array of pointers representing + /// a mix of taco_tensor_t* and scalars into a function call + static void generateShim(const Stmt& func, std::stringstream &stream); + +protected: + using IRPrinter::visit; + + void visit(const Function*); + void visit(const VarDecl*); + void visit(const Yield*); + void visit(const Var*); + void visit(const For*); + void visit(const While*); + void visit(const GetProperty*); + void visit(const Min*); + void visit(const Max*); + void visit(const Allocate*); + void visit(const Sqrt*); + void visit(const Store*); + void visit(const Assign*); + + std::map varMap; + std::vector localVars; + std::ostream &out; + + OutputKind outputKind; + + std::string funcName; + int labelCount; + bool emittingCoroutine; + + class FindVars; + +private: + virtual std::string restrictKeyword() const { return "restrict"; } +}; + +} // namespace ir +} // namespace taco +#endif From 7d4b8b66415709d996061a6311ea2d6fdba78cf5 Mon Sep 17 00:00:00 2001 From: Adhhitha Dias Date: Mon, 28 Jun 2021 17:36:53 -0400 Subject: [PATCH 02/10] minimal changes to support 
ispc exec --- .gitignore | 3 ++ CMakeLists.txt | 7 ++++ include/taco/cuda.h | 10 +++++ include/taco/version.h.in | 1 + src/codegen/codegen.cpp | 4 ++ src/codegen/codegen_ispc.h | 4 +- src/codegen/module.cpp | 7 ++++ src/cuda.cpp | 11 ++++++ test/tests-scheduling-eval.cpp | 70 +++++++++++++++++++++++++++++++++- tools/taco.cpp | 19 +++++++++ 10 files changed, 132 insertions(+), 4 deletions(-) diff --git a/.gitignore b/.gitignore index 16389f34e..9abc3adc7 100644 --- a/.gitignore +++ b/.gitignore @@ -12,3 +12,6 @@ CMakeCache.txt doc apps/tensor_times_vector/tensor_times_vector + +.cache +compile_commands.json diff --git a/CMakeLists.txt b/CMakeLists.txt index a6a80d9d1..7e9359e01 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -10,10 +10,12 @@ project(taco LANGUAGES C CXX ) option(CUDA "Build for NVIDIA GPU (CUDA must be preinstalled)" OFF) +option(ISPC "Build for Intel ISPC Compiler (ISPC Compiler must be preinstalled)" OFF) option(PYTHON "Build TACO for python environment" OFF) option(OPENMP "Build with OpenMP execution support" OFF) option(COVERAGE "Build with code coverage analysis" OFF) set(TACO_FEATURE_CUDA 0) +set(TACO_FEATURE_ISPC 0) set(TACO_FEATURE_OPENMP 0) set(TACO_FEATURE_PYTHON 0) if(CUDA) @@ -22,6 +24,11 @@ if(CUDA) add_definitions(-DCUDA_BUILT) set(TACO_FEATURE_CUDA 1) endif(CUDA) +if(ISPC) + message("-- Searching for ISPC Installation") + add_definitions(-DISPC_BUILT) + set(TACO_FEATURE_ISPC 1) +endif(ISPC) if(OPENMP) message("-- Will use OpenMP for parallel execution") add_definitions(-DUSE_OPENMP) diff --git a/include/taco/cuda.h b/include/taco/cuda.h index aad6b5229..7ed545c6d 100644 --- a/include/taco/cuda.h +++ b/include/taco/cuda.h @@ -9,7 +9,17 @@ #define CUDA_BUILT false #endif +#ifndef ISPC_BUILT + #define ISPC_BUILT false +#endif + namespace taco { + +/// Functions used by taco to interface with ISPC +bool should_use_ISPC_codegen(); +void set_ISPC_codegen_enabled(bool enabled); + + /// Functions used by taco to interface with CUDA 
(especially unified memory) /// Check if should use CUDA codegen bool should_use_CUDA_codegen(); diff --git a/include/taco/version.h.in b/include/taco/version.h.in index bc5559d7d..8ef507598 100644 --- a/include/taco/version.h.in +++ b/include/taco/version.h.in @@ -20,5 +20,6 @@ #define TACO_FEATURE_OPENMP @TACO_FEATURE_OPENMP@ #define TACO_FEATURE_PYTHON @TACO_FEATURE_PYTHON@ #define TACO_FEATURE_CUDA @TACO_FEATURE_CUDA@ +#define TACO_FEATURE_ISPC @TACO_FEATURE_ISPC@ #endif /* TACO_VERSION_H */ diff --git a/src/codegen/codegen.cpp b/src/codegen/codegen.cpp index f0c09d98a..f57f9950f 100644 --- a/src/codegen/codegen.cpp +++ b/src/codegen/codegen.cpp @@ -2,6 +2,7 @@ #include "taco/cuda.h" #include "codegen_cuda.h" #include "codegen_c.h" +#include "codegen_ispc.h" #include #include @@ -26,6 +27,9 @@ shared_ptr CodeGen::init_default(std::ostream &dest, OutputKind outputK if (should_use_CUDA_codegen()) { return make_shared(dest, outputKind); } + else if (should_use_ISPC_codegen()) { + return make_shared(dest, outputKind); + } else { return make_shared(dest, outputKind); } diff --git a/src/codegen/codegen_ispc.h b/src/codegen/codegen_ispc.h index e3c87ece5..35da5a01b 100644 --- a/src/codegen/codegen_ispc.h +++ b/src/codegen/codegen_ispc.h @@ -1,5 +1,5 @@ -#ifndef TACO_BACKEND_C_H -#define TACO_BACKEND_C_H +#ifndef TACO_BACKEND_ISPC_H +#define TACO_BACKEND_ISPC_H #include #include diff --git a/src/codegen/module.cpp b/src/codegen/module.cpp index bd0f487b1..409ed4a83 100644 --- a/src/codegen/module.cpp +++ b/src/codegen/module.cpp @@ -13,6 +13,7 @@ #include "taco/util/strings.h" #include "taco/util/env.h" #include "codegen/codegen_c.h" +#include "codegen/codegen_ispc.h" #include "codegen/codegen_cuda.h" #include "taco/cuda.h" @@ -89,6 +90,9 @@ void writeShims(vector funcs, string path, string prefix) { if (should_use_CUDA_codegen()) { CodeGen_CUDA::generateShim(func, shims); } + else if (should_use_ISPC_codegen()) { + CodeGen_ISPC::generateShim(func, shims); + } else { 
CodeGen_C::generateShim(func, shims); } @@ -98,6 +102,9 @@ void writeShims(vector funcs, string path, string prefix) { if (should_use_CUDA_codegen()) { shims_file.open(path+prefix+"_shims.cpp"); } + else if (should_use_ISPC_codegen()) { + shims_file.open(path+prefix+".ispc", ios::app); + } else { shims_file.open(path+prefix+".c", ios::app); } diff --git a/src/cuda.cpp b/src/cuda.cpp index 059c60105..85139f874 100644 --- a/src/cuda.cpp +++ b/src/cuda.cpp @@ -7,6 +7,17 @@ using namespace std; namespace taco { + +static bool ISPC_codegen_enabled = ISPC_BUILT; +bool should_use_ISPC_codegen() { + return ISPC_codegen_enabled; +} + +void set_ISPC_codegen_enabled(bool enabled) { + ISPC_codegen_enabled = enabled; +} + + /// Functions used by taco to interface with CUDA (especially unified memory) static bool CUDA_codegen_enabled = CUDA_BUILT; static bool CUDA_unified_memory_enabled = CUDA_BUILT; diff --git a/test/tests-scheduling-eval.cpp b/test/tests-scheduling-eval.cpp index 52bd74ab4..f59359081 100644 --- a/test/tests-scheduling-eval.cpp +++ b/test/tests-scheduling-eval.cpp @@ -1,5 +1,7 @@ +#include #include #include +#include #include #include #include "test.h" @@ -44,6 +46,14 @@ IndexStmt scheduleSpMVCPU(IndexStmt stmt, int CHUNK_SIZE=16) { .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces); } +IndexStmt scheduleSpMVISPC(IndexStmt stmt, int CHUNK_SIZE=16) { + IndexVar i0("i0"), i1("i1"), kpos("kpos"), kpos0("kpos0"), kpos1("kpos1"); + return stmt; + // return stmt.split(i, i0, i1, CHUNK_SIZE) + // .reorder({i0, i1, j}) + // .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces); +} + IndexStmt scheduleSpMMCPU(IndexStmt stmt, Tensor A, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { IndexVar i0("i0"), i1("i1"), kbounded("kbounded"), k0("k0"), k1("k1"), jpos("jpos"), jpos0("jpos0"), jpos1("jpos1"); return stmt.split(i, i0, i1, CHUNK_SIZE) @@ -1463,7 +1473,63 @@ TEST(scheduling_eval, mttkrpGPU) { ASSERT_TENSOR_EQ(expected, A); } 
-TEST(generate_evaluation_files, DISABLED_cpu) { + + +TEST(generate_ispc_evaluation_files, ispc) { + std::cout << "Hi Adhitha!\n" << std::endl ; + set_CUDA_codegen_enabled(false); + set_ISPC_codegen_enabled(true); + + vector> spmv_parameters = {{32}}; + vector> spmspv_parameters = {{8}}; + + // 4 to 512 and 4, 8, 16 + vector> spmm_dcsr_parameters = {{16, 8}}; + vector> spmm_parameters = {{16,4}}; + + vector> mttkrp_parameters = {}; + mttkrp_parameters.push_back({64,0}); + + vector> sddmm_parameters = {{8, 8}}; + vector> ttv_parameters = {{32}}; + + int NUM_I = 100; + int NUM_J = 100; + + string file_ending = ".ispc"; + string file_path = "eval_prepared_ispc/"; + mkdir(file_path.c_str(), 0777); + + // spmv + { + stringstream source; + std::shared_ptr codegen = ir::CodeGen::init_default(source, ir::CodeGen::ImplementationGen); + Tensor A("A", {NUM_I, NUM_J}, CSR); + Tensor x("x", {NUM_J}, {Dense}); + Tensor y("y", {NUM_I}, {Dense}); + y(i) = A(i, j) * x(j); + std::cout << "concretizing the assignment statement\n"; + IndexStmt stmt = y.getAssignment().concretize(); + std::cout << "Printing the original IndexStmt: " << stmt << std::endl; + for (auto paramSet : spmv_parameters) { + std::cout << "param set: " << paramSet[0] << std::endl; + IndexStmt scheduled = scheduleSpMVISPC(stmt, paramSet[0]); + std::cout << "scheduled IndexStmt: " << scheduled << std::endl; + ir::Stmt compute = lower(scheduled, "spmv_csr_ispc_taco", false, true); + std::cout << "computed statement: \n" << compute << std::endl; + codegen->compile(compute, false); + } + ofstream source_file; + source_file.open(file_path + "spmv_csr_ispc_taco.h"); + source_file << source.str(); + source_file.close(); + } + + + return; +} + +TEST(generate_evaluation_files, cpu) { if (should_use_CUDA_codegen()) { return; } @@ -1779,7 +1845,7 @@ TEST(generate_evaluation_files, DISABLED_cpu) { } } -TEST(generate_evaluation_files, DISABLED_gpu) { +TEST(generate_evaluation_files, gpu) { if (!should_use_CUDA_codegen()) { 
return; } diff --git a/tools/taco.cpp b/tools/taco.cpp index cd351a203..ce03b61e1 100644 --- a/tools/taco.cpp +++ b/tools/taco.cpp @@ -20,6 +20,7 @@ #include "taco/lower/lower.h" #include "taco/codegen/module.h" #include "codegen/codegen_c.h" +#include "codegen/codegen_ispc.h" #include "codegen/codegen_cuda.h" #include "codegen/codegen.h" #include "taco/util/strings.h" @@ -188,6 +189,8 @@ static void printUsageInfo() { cout << endl; printFlag("print-nocolor", "Print without colors."); cout << endl; + printFlag("ispc", "Generate ISPC code for Intel CPUs"); + cout << endl; printFlag("cuda", "Generate CUDA code for NVIDIA GPUs"); cout << endl; printFlag("schedule", "Specify parallel execution schedule"); @@ -279,6 +282,8 @@ static void printVersionInfo() { cout << "Built with Python support." << endl; if(TACO_FEATURE_CUDA) cout << "Built with CUDA support." << endl; + if(TACO_FEATURE_ISPC) + cout << "Built with ISPC support." << endl; cout << endl; cout << "Built on: " << TACO_BUILD_DATE << endl; cout << "CMake build type: " << TACO_BUILD_TYPE << endl; @@ -641,6 +646,7 @@ int main(int argc, char* argv[]) { bool color = true; bool readKernels = false; bool cuda = false; + bool ispc = false; bool setSchedule = false; @@ -949,6 +955,10 @@ int main(int argc, char* argv[]) { else if ("-cuda" == argName) { cuda = true; } + else if ("-ispc" == argName) { + std::cout << "ispc true\n"; + ispc = true; + } else if ("-schedule" == argName) { vector descriptor = util::split(argValue, ","); if (descriptor.size() > 2 || descriptor.empty()) { @@ -1129,9 +1139,18 @@ int main(int argc, char* argv[]) { return reportError("TACO must be built for CUDA (cmake -DCUDA=ON ..) to benchmark", 2); } set_CUDA_codegen_enabled(true); + set_ISPC_codegen_enabled(false); + } + else if (ispc) { + if (!ISPC_BUILT && benchmark) { + return reportError("TACO must be built for ISPC (cmake -DISPC=ON .. 
to benchmark", 2); + } + set_CUDA_codegen_enabled(false); + set_ISPC_codegen_enabled(true); } else { set_CUDA_codegen_enabled(false); + set_ISPC_codegen_enabled(false); } stmt = scalarPromote(stmt); From dd693feb9a56c0ab528fb602e0f30c3d014e3648 Mon Sep 17 00:00:00 2001 From: Adhhitha Dias Date: Mon, 12 Jul 2021 14:10:46 -0400 Subject: [PATCH 03/10] separate ispc code to another stream and smaller conversions to match ispc code --- include/taco/cuda.h | 2 + include/taco/ir/ir.h | 2 +- include/taco/ir/ir_printer.h | 3 + include/taco/util/strings.h | 22 + src/codegen/codegen.cpp | 141 ++++- src/codegen/codegen.h | 15 +- src/codegen/codegen_ispc.cpp | 278 ++++++--- src/codegen/codegen_ispc.h | 3 + src/codegen/module.cpp | 13 + src/cuda.cpp | 8 + src/ir/ir_printer.cpp | 824 +++++++++++++++++++------- src/ir/ir_rewriter.cpp | 1 + src/lower/lowerer_impl_imperative.cpp | 69 ++- src/tensor.cpp | 7 + test/tests-scheduling-eval.cpp | 79 ++- tools/taco.cpp | 2 + 16 files changed, 1127 insertions(+), 342 deletions(-) diff --git a/include/taco/cuda.h b/include/taco/cuda.h index 7ed545c6d..9c4a7aae9 100644 --- a/include/taco/cuda.h +++ b/include/taco/cuda.h @@ -18,6 +18,8 @@ namespace taco { /// Functions used by taco to interface with ISPC bool should_use_ISPC_codegen(); void set_ISPC_codegen_enabled(bool enabled); +bool is_ISPC_code_stream_enabled(); +void set_ISPC_code_stream_enabled(bool enabled); /// Functions used by taco to interface with CUDA (especially unified memory) diff --git a/include/taco/ir/ir.h b/include/taco/ir/ir.h index f852f26b1..cb46b5142 100644 --- a/include/taco/ir/ir.h +++ b/include/taco/ir/ir.h @@ -591,7 +591,7 @@ struct Switch : public StmtNode { static const IRNodeType _type_info = IRNodeType::Switch; }; -enum class LoopKind {Serial, Static, Dynamic, Runtime, Vectorized, Static_Chunked}; +enum class LoopKind {Serial, Static, Dynamic, Runtime, Vectorized, Static_Chunked, Foreach}; /** A for loop from start to end by increment. 
* A vectorized loop will require the increment to be 1 and the diff --git a/include/taco/ir/ir_printer.h b/include/taco/ir/ir_printer.h index 4e50764e9..c2c505bf5 100644 --- a/include/taco/ir/ir_printer.h +++ b/include/taco/ir/ir_printer.h @@ -16,6 +16,7 @@ class IRPrinter : public IRVisitorStrict { public: IRPrinter(std::ostream& stream); IRPrinter(std::ostream& stream, bool color, bool simplify); + IRPrinter(std::ostream& stream, std::ostream& stream2, bool color, bool simplify); virtual ~IRPrinter(); void setColor(bool color); @@ -72,6 +73,7 @@ class IRPrinter : public IRVisitorStrict { virtual void visit(const Break*); std::ostream &stream; + std::ostream &stream2; int indent; bool color; bool simplify; @@ -109,6 +111,7 @@ class IRPrinter : public IRVisitorStrict { void doIndent(); void printBinOp(Expr a, Expr b, std::string op, Precedence precedence); bool needsParentheses(Precedence precedence); + void sendToStream(std::stringstream &stream); std::string keywordString(std::string); std::string commentString(std::string); diff --git a/include/taco/util/strings.h b/include/taco/util/strings.h index 5dfb2f174..a3c3d863f 100644 --- a/include/taco/util/strings.h +++ b/include/taco/util/strings.h @@ -1,6 +1,7 @@ #ifndef TACO_UTIL_STRINGS_H #define TACO_UTIL_STRINGS_H +#include "taco/cuda.h" #include #include #include @@ -8,6 +9,8 @@ #include #include +#include "taco/type.h" + // To get the value of a compiler macro variable #define STRINGIFY(x) #x #define TO_STRING(x) STRINGIFY(x) @@ -15,6 +18,25 @@ namespace taco { namespace util { +// /// Turn anything except floating points that can be written to a stream +// /// into a string. 
+// template +// typename std::enable_if::value, std::string>::type +// toStringISPC(const T &val) { + +// std::stringstream sstream; +// if (val == Int32) { +// sstream << "int32"; +// } +// else if (val == Int64) { +// sstream << "int64"; +// } +// else { +// sstream << val; +// } +// return sstream.str(); +// } + /// Turn anything except floating points that can be written to a stream /// into a string. template diff --git a/src/codegen/codegen.cpp b/src/codegen/codegen.cpp index f57f9950f..750f33516 100644 --- a/src/codegen/codegen.cpp +++ b/src/codegen/codegen.cpp @@ -35,6 +35,18 @@ shared_ptr CodeGen::init_default(std::ostream &dest, OutputKind outputK } } +shared_ptr CodeGen::init_default(std::ostream &dest, std::ostream &dest2, OutputKind outputKind) { + if (should_use_CUDA_codegen()) { + return make_shared(dest, outputKind); + } + else if (should_use_ISPC_codegen()) { + return make_shared(dest, dest2, outputKind); + } + else { + return make_shared(dest, outputKind); + } +} + int CodeGen::countYields(const Function *func) { struct CountYields : public IRVisitor { int yields = 0; @@ -233,6 +245,49 @@ string CodeGen::printTensorProperty(string varname, const GetProperty* op, bool return ret.str(); } +string CodeGen::getUnpackedTensorArgument(string varname, const GetProperty* op, + bool is_output_prop) { + stringstream ret; + ret << ""; + + auto tensor = op->tensor.as(); + if (op->property == TensorProperty::Values) { + // for the values, it's in the last slot + ret << "uniform " << printType(tensor->type, false) << " " << varname << "[]"; + return ret.str(); + } else if (op->property == TensorProperty::ValuesSize) { + ret << "int32 " << varname; + return ret.str(); + } + + // for a Dense level, nnz is an int + // for a Fixed level, ptr is an int + // all others are int* + if (op->property == TensorProperty::Dimension) { + if (op->type == Int32) { + ret << "int32 "; + } else if (op->type == Int64) { + ret << "int64 "; + } else { + ret << "int "; + } + ret << 
varname; + + } else { + taco_iassert(op->property == TensorProperty::Indices); + if (op->type == Int32) { + ret << "uniform int32 "; + } else if (op->type == Int64) { + ret << "uniform int64 "; + } else { + ret << "uniform int "; + } + ret << varname << "[]"; + } + + return ret.str(); +} + string CodeGen::unpackTensorProperty(string varname, const GetProperty* op, bool is_output_prop) { stringstream ret; @@ -314,13 +369,9 @@ string CodeGen::pointTensorProperty(std::string varname) { return ret.str(); } -// helper to print declarations -string CodeGen::printDecls(map varMap, - vector inputs, vector outputs) { - stringstream ret; - unordered_set propsAlreadyGenerated; - - vector sortedProps; +void CodeGen::getSortedProps(map &varMap, + vector &sortedProps, vector &inputs, + vector &outputs) { for (auto const& p: varMap) { if (p.first.as()) @@ -359,6 +410,17 @@ string CodeGen::printDecls(map varMap, return a->index < b->index; }); +} + +// helper to print declarations +string CodeGen::printDecls(map varMap, + vector inputs, vector outputs) { + stringstream ret; + unordered_set propsAlreadyGenerated; + + vector sortedProps; + getSortedProps(varMap, sortedProps, inputs, outputs); + for (auto prop: sortedProps) { bool isOutputProp = (find(outputs.begin(), outputs.end(), prop->tensor) != outputs.end()); @@ -379,6 +441,71 @@ string CodeGen::printDecls(map varMap, return ret.str(); } +string CodeGen::printCallISPCFunc(const Function *func, map varMap, + vector &sortedProps) { + std::stringstream ret; + ret << " "; + unordered_set propsAlreadyGenerated; + + ret << "__" << func->name << "("; + + vector inputs = func->inputs; + vector outputs = func->outputs; + getSortedProps(varMap, sortedProps, inputs, outputs); + + for (unsigned long i=0; i < sortedProps.size(); i++) { + ret << varMap[sortedProps[i]]; + if (i != sortedProps.size()-1) { + ret << ", "; + } + propsAlreadyGenerated.insert(varMap[sortedProps[i]]); + } + + ret << ");\n"; + return ret.str(); +} + +string 
CodeGen::printISPCFunc(const Function *func, map varMap, + vector &sortedProps) { + std::stringstream ret; + ret << "export void "; + unordered_set propsAlreadyGenerated; + + ret << "__" << func->name << "("; + + vector inputs = func->inputs; + vector outputs = func->outputs; + // getSortedProps(varMap, sortedProps, inputs, outputs); + + for (unsigned long i=0; i < sortedProps.size(); i++) { + auto prop = sortedProps[i]; + bool isOutputProp = (find(outputs.begin(), outputs.end(), + prop->tensor) != outputs.end()); + + auto var = prop->tensor.as(); + if (var->is_parameter) { + if (isOutputProp) { + ret << " " << printTensorProperty(varMap[prop], prop, false) << ";" << endl; + } else { + break; + } + } else { + ret << getUnpackedTensorArgument(varMap[prop], prop, isOutputProp); + } + propsAlreadyGenerated.insert(varMap[prop]); + + if (i!=sortedProps.size()-1) { + ret << ", "; + } + if (i%2==0) { + ret << "\n\t"; + } + } + ret << ") {\n"; + + return ret.str(); +} + string CodeGen::printPack(map, string> outputProperties, vector outputs) { diff --git a/src/codegen/codegen.h b/src/codegen/codegen.h index cc25c80d6..641239834 100644 --- a/src/codegen/codegen.h +++ b/src/codegen/codegen.h @@ -16,9 +16,13 @@ class CodeGen : public IRPrinter { enum CodeGenType { C, CUDA }; CodeGen(std::ostream& stream, CodeGenType type) : IRPrinter(stream), codeGenType(type) {}; - CodeGen(std::ostream& stream, bool color, bool simplify, CodeGenType type) : IRPrinter(stream, color, simplify), codeGenType(type) {}; + CodeGen(std::ostream& stream, bool color, bool simplify, CodeGenType type) + : IRPrinter(stream, color, simplify), codeGenType(type) {}; + CodeGen(std::ostream& stream, std::ostream& stream2, bool color, bool simplify, CodeGenType type) + : IRPrinter(stream, stream2, color, simplify), codeGenType(type) {}; /// Initialize the default code generator static std::shared_ptr init_default(std::ostream &dest, OutputKind outputKind); + static std::shared_ptr init_default(std::ostream 
&dest, std::ostream &dest2, OutputKind outputKind); /// Compile a lowered function virtual void compile(Stmt stmt, bool isFirst=false) =0; @@ -26,6 +30,9 @@ class CodeGen : public IRPrinter { protected: static bool checkForAlloc(const Function *func); static int countYields(const Function *func); + void getSortedProps(std::map &varMap, + std::vector &sortedProps, std::vector &inputs, + std::vector &outputs); static std::string printCType(Datatype type, bool is_ptr); static std::string printCUDAType(Datatype type, bool is_ptr); @@ -42,6 +49,10 @@ class CodeGen : public IRPrinter { std::string printContextDeclAndInit(std::map varMap, std::vector localVars, int labels, std::string funcName); + std::string printCallISPCFunc(const Function *func, std::map varMap, + std::vector &sortedProps); + std::string printISPCFunc(const Function *func, std::map varMap, + std::vector &sortedProps); std::string printDecls(std::map varMap, std::vector inputs, std::vector outputs); std::string printPack(std::map, @@ -64,6 +75,8 @@ class CodeGen : public IRPrinter { std::string printTensorProperty(std::string varname, const GetProperty* op, bool is_ptr); std::string unpackTensorProperty(std::string varname, const GetProperty* op, bool is_output_prop); + std::string getUnpackedTensorArgument(std::string varname, const GetProperty* op, + bool is_output_prop); std::string packTensorProperty(std::string varname, Expr tnsr, TensorProperty property, int mode, int index); std::string pointTensorProperty(std::string varname); diff --git a/src/codegen/codegen_ispc.cpp b/src/codegen/codegen_ispc.cpp index 4b0e82903..f107728cc 100644 --- a/src/codegen/codegen_ispc.cpp +++ b/src/codegen/codegen_ispc.cpp @@ -5,6 +5,7 @@ #include #include +#include "taco/cuda.h" #include "taco/ir/ir_visitor.h" #include "codegen_ispc.h" #include "taco/error.h" @@ -240,7 +241,10 @@ class CodeGen_ISPC::FindVars : public IRVisitor { }; CodeGen_ISPC::CodeGen_ISPC(std::ostream &dest, OutputKind outputKind, bool simplify) - 
: CodeGen(dest, false, simplify, C), out(dest), outputKind(outputKind) {} + : CodeGen(dest, false, simplify, C), out(dest), out2(dest), outputKind(outputKind) {} + +CodeGen_ISPC::CodeGen_ISPC(std::ostream &dest, std::ostream &dest2, OutputKind outputKind, bool simplify) + : CodeGen(dest, dest2, false, simplify, C), out(dest), out2(dest2), outputKind(outputKind) {} CodeGen_ISPC::~CodeGen_ISPC() {} @@ -254,9 +258,19 @@ void CodeGen_ISPC::compile(Stmt stmt, bool isFirst) { } out << endl; // generate code for the Stmt + std::cout << "Compiling the code\n"; stmt.accept(this); } +void CodeGen_ISPC::sendToStream(std::stringstream &stream) { + if (is_ISPC_code_stream_enabled()) { + this->out2 << stream.str(); + } + else { + this->out << stream.str(); + } +} + void CodeGen_ISPC::visit(const Function* func) { // if generating a header, protect the function declaration with a guard if (outputKind == HeaderGen) { @@ -300,14 +314,14 @@ void CodeGen_ISPC::visit(const Function* func) { // Print variable declarations out << printDecls(varFinder.varDecls, func->inputs, func->outputs) << endl; + vector sortedProps; + out << printCallISPCFunc(func, varFinder.varDecls, sortedProps); + if (emittingCoroutine) { out << printContextDeclAndInit(varMap, localVars, numYields, func->name) << endl; } - // output body - print(func->body); - // output repack only if we allocated memory if (checkForAlloc(func)) out << endl << printPack(varFinder.outputProperties, func->outputs); @@ -321,21 +335,50 @@ void CodeGen_ISPC::visit(const Function* func) { indent--; doIndent(); - out << "}\n"; + out << "}\n\n"; + + set_ISPC_code_stream_enabled(true); + out2 << printISPCFunc(func, varFinder.varDecls, sortedProps); + indent++; + doIndent(); + // output body + print(func->body); + indent--; + out2 << "}\n"; + set_ISPC_code_stream_enabled(false); + } void CodeGen_ISPC::visit(const VarDecl* op) { - if (emittingCoroutine) { - doIndent(); - op->var.accept(this); - parentPrecedence = Precedence::TOP; - stream << 
" = "; - op->rhs.accept(this); - stream << ";"; - stream << endl; - } else { - IRPrinter::visit(op); + // std::stringstream stream; + if (is_ISPC_code_stream_enabled()) { + if (emittingCoroutine) { + doIndent(); + op->var.accept(this); + parentPrecedence = Precedence::TOP; + stream2 << " = "; + op->rhs.accept(this); + stream2 << ";"; + stream2 << endl; + } else { + IRPrinter::visit(op); + } } + else { + if (emittingCoroutine) { + doIndent(); + op->var.accept(this); + parentPrecedence = Precedence::TOP; + stream << " = "; + op->rhs.accept(this); + stream << ";"; + stream << endl; + } else { + IRPrinter::visit(op); + } + } + + // sendToStream(stream); } void CodeGen_ISPC::visit(const Yield* op) { @@ -345,14 +388,27 @@ void CodeGen_ISPC::visit(const Yield* op) { // For Vars, we replace their names with the generated name, // since we match by reference (not name) void CodeGen_ISPC::visit(const Var* op) { - taco_iassert(varMap.count(op) > 0) << - "Var " << op->name << " not found in varMap"; - if (emittingCoroutine) { -// out << "TACO_DEREF("; + if (is_ISPC_code_stream_enabled()) { + taco_iassert(varMap.count(op) > 0) << + "Var " << op->name << " not found in varMap"; + if (emittingCoroutine) { + // out << "TACO_DEREF("; + } + out2 << varMap[op]; + if (emittingCoroutine) { + // out << ")"; + } } - out << varMap[op]; - if (emittingCoroutine) { -// out << ")"; + else { + taco_iassert(varMap.count(op) > 0) << + "Var " << op->name << " not found in varMap"; + if (emittingCoroutine) { + // out << "TACO_DEREF("; + } + out << varMap[op]; + if (emittingCoroutine) { + // out << ")"; + } } } @@ -367,31 +423,31 @@ static string genVectorizePragma(int width) { return ret.str(); } -static string getParallelizePragma(LoopKind kind) { - stringstream ret; - ret << "#pragma omp parallel for schedule"; - switch (kind) { - case LoopKind::Static: - ret << "(static, 1)"; - break; - case LoopKind::Dynamic: - ret << "(dynamic, 1)"; - break; - case LoopKind::Runtime: - ret << "(runtime)"; - 
break; - case LoopKind::Static_Chunked: - ret << "(static)"; - break; - default: - break; - } - return ret.str(); -} - -static string getUnrollPragma(size_t unrollFactor) { - return "#pragma unroll " + std::to_string(unrollFactor); -} +// static string getParallelizePragma(LoopKind kind) { +// stringstream ret; +// ret << "#pragma omp parallel for schedule"; +// switch (kind) { +// case LoopKind::Static: +// ret << "(static, 1)"; +// break; +// case LoopKind::Dynamic: +// ret << "(dynamic, 1)"; +// break; +// case LoopKind::Runtime: +// ret << "(runtime)"; +// break; +// case LoopKind::Static_Chunked: +// ret << "(static)"; +// break; +// default: +// break; +// } +// return ret.str(); +// } + +// static string getUnrollPragma(size_t unrollFactor) { +// return "#pragma unroll " + std::to_string(unrollFactor); +// } static string getAtomicPragma() { return "#pragma omp atomic"; @@ -404,58 +460,75 @@ static string getAtomicPragma() { // http://clang.llvm.org/docs/LanguageExtensions.html#extensions-for-loop-hint-optimizations void CodeGen_ISPC::visit(const For* op) { switch (op->kind) { + // TODO - add ISPC based multi threaded execution handling case LoopKind::Vectorized: - doIndent(); - out << genVectorizePragma(op->vec_width); - out << "\n"; - break; case LoopKind::Static: case LoopKind::Dynamic: case LoopKind::Runtime: case LoopKind::Static_Chunked: - doIndent(); - out << getParallelizePragma(op->kind); - out << "\n"; - break; default: - if (op->unrollFactor > 0) { - doIndent(); - out << getUnrollPragma(op->unrollFactor) << endl; - } break; } doIndent(); - stream << keywordString("for") << " ("; - if (!emittingCoroutine) { - stream << keywordString(util::toString(op->var.type())) << " "; - } - op->var.accept(this); - stream << " = "; - op->start.accept(this); - stream << keywordString("; "); - op->var.accept(this); - stream << " < "; - parentPrecedence = BOTTOM; - op->end.accept(this); - stream << keywordString("; "); - op->var.accept(this); - auto lit = 
op->increment.as(); - if (lit != nullptr && ((lit->type.isInt() && lit->equalsScalar(1)) || - (lit->type.isUInt() && lit->equalsScalar(1)))) { - stream << "++"; - } - else { - stream << " += "; - op->increment.accept(this); + if (op->kind == LoopKind::Foreach) { + stream2 << keywordString("foreach") << " ("; + // if (!emittingCoroutine) { + // if (op->var.type() == Int32) { + // stream << "int32 "; + // } + // else if (op->var.type() == Int64) { + // stream << "int64 "; + // } + + // } + op->var.accept(this); + stream2 << " = "; + op->start.accept(this); + stream2 << keywordString(" ... "); + op->end.accept(this); + stream2 << ") {\n"; + + } else { + stream2 << keywordString("for") << " ("; + if (!emittingCoroutine) { + if (op->var.type() == Int32) { + stream2 << "int32 "; + } + else if (op->var.type() == Int64) { + stream2 << "int64 "; + } + + } + op->var.accept(this); + stream2 << " = "; + op->start.accept(this); + stream2 << keywordString("; "); + op->var.accept(this); + stream2 << " < "; + parentPrecedence = BOTTOM; + op->end.accept(this); + stream2 << keywordString("; "); + op->var.accept(this); + + auto lit = op->increment.as(); + if (lit != nullptr && ((lit->type.isInt() && lit->equalsScalar(1)) || + (lit->type.isUInt() && lit->equalsScalar(1)))) { + stream2 << "++"; + } + else { + stream2 << " += "; + op->increment.accept(this); + } + stream2 << ") {\n"; } - stream << ") {\n"; op->contents.accept(this); doIndent(); - stream << "}"; - stream << endl; + stream2 << "}"; + stream2 << endl; + } void CodeGen_ISPC::visit(const While* op) { @@ -474,7 +547,13 @@ void CodeGen_ISPC::visit(const While* op) { void CodeGen_ISPC::visit(const GetProperty* op) { taco_iassert(varMap.count(op) > 0) << "Property " << Expr(op) << " of " << op->tensor << " not found in varMap"; - out << varMap[op]; + if (is_ISPC_code_stream_enabled()) { + out2 << varMap[op]; + } + else { + out << varMap[op]; + } + } void CodeGen_ISPC::visit(const Min* op) { @@ -549,17 +628,34 @@ void 
CodeGen_ISPC::visit(const Sqrt* op) { } void CodeGen_ISPC::visit(const Assign* op) { - if (op->use_atomics) { - doIndent(); - stream << getAtomicPragma() << endl; + if (is_ISPC_code_stream_enabled()) { + if (op->use_atomics) { + doIndent(); + stream2 << getAtomicPragma() << endl; + } + } + else { + if (op->use_atomics) { + doIndent(); + stream << getAtomicPragma() << endl; + } } + IRPrinter::visit(op); } void CodeGen_ISPC::visit(const Store* op) { - if (op->use_atomics) { - doIndent(); - stream << getAtomicPragma() << endl; + if (is_ISPC_code_stream_enabled()) { + if (op->use_atomics) { + doIndent(); + stream2 << getAtomicPragma() << endl; + } + } + else { + if (op->use_atomics) { + doIndent(); + stream << getAtomicPragma() << endl; + } } IRPrinter::visit(op); } diff --git a/src/codegen/codegen_ispc.h b/src/codegen/codegen_ispc.h index 35da5a01b..8abd1cc09 100644 --- a/src/codegen/codegen_ispc.h +++ b/src/codegen/codegen_ispc.h @@ -16,6 +16,7 @@ class CodeGen_ISPC : public CodeGen { /// Initialize a code generator that generates code to an /// output stream. 
CodeGen_ISPC(std::ostream &dest, OutputKind outputKind, bool simplify=true); + CodeGen_ISPC(std::ostream &dest, std::ostream &dest2, OutputKind outputKind, bool simplify=true); ~CodeGen_ISPC(); /// Compile a lowered function @@ -45,6 +46,7 @@ class CodeGen_ISPC : public CodeGen { std::map varMap; std::vector localVars; std::ostream &out; + std::ostream &out2; OutputKind outputKind; @@ -56,6 +58,7 @@ class CodeGen_ISPC : public CodeGen { private: virtual std::string restrictKeyword() const { return "restrict"; } + void sendToStream(std::stringstream &stream); }; } // namespace ir diff --git a/src/codegen/module.cpp b/src/codegen/module.cpp index 409ed4a83..d9cbe2edc 100644 --- a/src/codegen/module.cpp +++ b/src/codegen/module.cpp @@ -116,6 +116,7 @@ void writeShims(vector funcs, string path, string prefix) { } // anonymous namespace string Module::compile() { + std::cout << "Module::compile\n"; string prefix = tmpdir+libname; string fullpath = prefix + ".so"; @@ -130,6 +131,12 @@ string Module::compile() { file_ending = ".cu"; shims_file = prefix + "_shims.cpp"; } + else if (should_use_ISPC_codegen()) { + cc = util::getFromEnv(target.compiler_env, target.compiler); + cflags = util::getFromEnv("TACO_CFLAGS", + "-O3 -ffast-math -std=c99") + " -shared -fPIC"; + + } else { cc = util::getFromEnv(target.compiler_env, target.compiler); cflags = util::getFromEnv("TACO_CFLAGS", @@ -150,6 +157,12 @@ string Module::compile() { // write out the shims writeShims(funcs, tmpdir, libname); + for (auto &statement : funcs) { + std::cout << "----- statement --------" << std::endl; + std::cout << statement; + std::cout << std::endl; + } + std::cout << tmpdir << std::endl << libname << std::endl; // now compile it int err = system(cmd.data()); diff --git a/src/cuda.cpp b/src/cuda.cpp index 85139f874..68e49fe98 100644 --- a/src/cuda.cpp +++ b/src/cuda.cpp @@ -9,14 +9,22 @@ using namespace std; namespace taco { static bool ISPC_codegen_enabled = ISPC_BUILT; +static bool 
ISPC_code_stream_enabled = false; bool should_use_ISPC_codegen() { return ISPC_codegen_enabled; } +bool is_ISPC_code_stream_enabled() { + return ISPC_code_stream_enabled; +} + void set_ISPC_codegen_enabled(bool enabled) { ISPC_codegen_enabled = enabled; } +void set_ISPC_code_stream_enabled(bool enabled) { + ISPC_code_stream_enabled = enabled; +} /// Functions used by taco to interface with CUDA (especially unified memory) static bool CUDA_codegen_enabled = CUDA_BUILT; diff --git a/src/ir/ir_printer.cpp b/src/ir/ir_printer.cpp index a1997a9b7..f96251c5a 100644 --- a/src/ir/ir_printer.cpp +++ b/src/ir/ir_printer.cpp @@ -1,6 +1,7 @@ #include #include +#include "taco/cuda.h" #include "taco/ir/ir.h" #include "taco/ir/ir_printer.h" #include "taco/ir/simplify.h" @@ -34,7 +35,11 @@ IRPrinter::IRPrinter(ostream &s) : IRPrinter(s, false, false) { } IRPrinter::IRPrinter(ostream &s, bool color, bool simplify) - : stream(s), indent(0), color(color), simplify(simplify) { + : stream(s), stream2(s), indent(0), color(color), simplify(simplify) { +} + +IRPrinter::IRPrinter(ostream &s, ostream &s2, bool color, bool simplify) + : stream(s), stream2(s2), indent(0), color(color), simplify(simplify) { } IRPrinter::~IRPrinter() { @@ -59,79 +64,169 @@ void IRPrinter::print(Stmt stmt) { } void IRPrinter::visit(const Literal* op) { - if (color) { - stream << blue ; - } - - switch (op->type.getKind()) { - case Datatype::Bool: - stream << op->getValue(); - break; - case Datatype::UInt8: - stream << static_cast(op->getValue()); - break; - case Datatype::UInt16: - stream << op->getValue(); - break; - case Datatype::UInt32: - stream << op->getValue(); - break; - case Datatype::UInt64: - stream << op->getValue(); - break; - case Datatype::UInt128: - taco_not_supported_yet; - break; - case Datatype::Int8: - stream << static_cast(op->getValue()); - break; - case Datatype::Int16: - stream << op->getValue(); - break; - case Datatype::Int32: - stream << op->getValue(); - break; - case Datatype::Int64: 
- stream << op->getValue(); - break; - case Datatype::Int128: - taco_not_supported_yet; - break; - case Datatype::Float32: - stream << ((op->getValue() != 0.0) - ? util::toString(op->getValue()) : "0.0"); - break; - case Datatype::Float64: - stream << ((op->getValue()!=0.0) - ? util::toString(op->getValue()) : "0.0"); - break; - case Datatype::Complex64: { - std::complex val = op->getValue>(); - stream << val.real() << " + I*" << val.imag(); - } - break; - case Datatype::Complex128: { - std::complex val = op->getValue>(); - stream << val.real() << " + I*" << val.imag(); - } - break; - case Datatype::Undefined: - taco_ierror << "Undefined type in IR"; - break; - } + if (is_ISPC_code_stream_enabled()) { + if (color) { + stream2 << blue ; + } - if (color) { - stream << nc; + // It seems this is where all the types get printed in the final code generation. + // Come up with a way to generate different values if stream2 is used to generate ispc code + switch (op->type.getKind()) { + case Datatype::Bool: + stream2 << op->getValue(); + break; + case Datatype::UInt8: + stream2 << static_cast(op->getValue()); + break; + case Datatype::UInt16: + stream2 << op->getValue(); + break; + case Datatype::UInt32: + stream2 << op->getValue(); + break; + case Datatype::UInt64: + stream2 << op->getValue(); + break; + case Datatype::UInt128: + taco_not_supported_yet; + break; + case Datatype::Int8: + stream2 << static_cast(op->getValue()); + break; + case Datatype::Int16: + stream2 << op->getValue(); + break; + case Datatype::Int32: + stream2 << op->getValue(); + break; + case Datatype::Int64: + stream2 << op->getValue(); + break; + case Datatype::Int128: + taco_not_supported_yet; + break; + case Datatype::Float32: + stream2 << ((op->getValue() != 0.0) + ? util::toString(op->getValue()) : "0.0"); + break; + case Datatype::Float64: + stream2 << ((op->getValue()!=0.0) + ? 
util::toString(op->getValue()) : "0.0"); + break; + case Datatype::Complex64: { + std::complex val = op->getValue>(); + stream2 << val.real() << " + I*" << val.imag(); + } + break; + case Datatype::Complex128: { + std::complex val = op->getValue>(); + stream2 << val.real() << " + I*" << val.imag(); + } + break; + case Datatype::Undefined: + taco_ierror << "Undefined type in IR"; + break; + } + + if (color) { + stream2 << nc; + } + } + + + + else { + + if (color) { + stream << blue ; + } + + // It seems this is where all the types get printed in the final code generation. + // Come up with a way to generate different values if stream2 is used to generate ispc code + switch (op->type.getKind()) { + case Datatype::Bool: + stream << op->getValue(); + break; + case Datatype::UInt8: + stream << static_cast(op->getValue()); + break; + case Datatype::UInt16: + stream << op->getValue(); + break; + case Datatype::UInt32: + stream << op->getValue(); + break; + case Datatype::UInt64: + stream << op->getValue(); + break; + case Datatype::UInt128: + taco_not_supported_yet; + break; + case Datatype::Int8: + stream << static_cast(op->getValue()); + break; + case Datatype::Int16: + stream << op->getValue(); + break; + case Datatype::Int32: + stream << op->getValue(); + break; + case Datatype::Int64: + stream << op->getValue(); + break; + case Datatype::Int128: + taco_not_supported_yet; + break; + case Datatype::Float32: + stream << ((op->getValue() != 0.0) + ? util::toString(op->getValue()) : "0.0"); + break; + case Datatype::Float64: + stream << ((op->getValue()!=0.0) + ? 
util::toString(op->getValue()) : "0.0"); + break; + case Datatype::Complex64: { + std::complex val = op->getValue>(); + stream << val.real() << " + I*" << val.imag(); + } + break; + case Datatype::Complex128: { + std::complex val = op->getValue>(); + stream << val.real() << " + I*" << val.imag(); + } + break; + case Datatype::Undefined: + taco_ierror << "Undefined type in IR"; + break; + } + + if (color) { + stream << nc; + } + + } + } void IRPrinter::visit(const Var* op) { - if (varNames.contains(op)) { - stream << varNames.get(op); + if (is_ISPC_code_stream_enabled()) { + if (varNames.contains(op)) { + stream2 << varNames.get(op); + } + else { + stream2 << op->name; + } } else { - stream << op->name; + if (varNames.contains(op)) { + stream << varNames.get(op); + } + else { + stream << op->name; + } } + } void IRPrinter::visit(const Neg* op) { @@ -248,41 +343,83 @@ void IRPrinter::visit(const IfThenElse* op) { taco_iassert(op->cond.defined()); taco_iassert(op->then.defined()); doIndent(); - stream << keywordString("if "); - stream << "("; - parentPrecedence = Precedence::TOP; - op->cond.accept(this); - stream << ")"; + if (is_ISPC_code_stream_enabled()) { + stream2 << keywordString("if "); + stream2 << "("; + parentPrecedence = Precedence::TOP; + op->cond.accept(this); + stream2 << ")"; + + Stmt scopedStmt = Stmt(to(op->then)->scopedStmt); + if (isa(scopedStmt)) { + stream2 << " {" << endl; + op->then.accept(this); + doIndent(); + stream2 << "}"; + } + else if (isa(scopedStmt)) { + int tmp = indent; + indent = 0; + stream2 << " "; + scopedStmt.accept(this); + indent = tmp; + } + else { + stream2 << endl; + op->then.accept(this); + } - Stmt scopedStmt = Stmt(to(op->then)->scopedStmt); - if (isa(scopedStmt)) { - stream << " {" << endl; - op->then.accept(this); - doIndent(); - stream << "}"; - } - else if (isa(scopedStmt)) { - int tmp = indent; - indent = 0; - stream << " "; - scopedStmt.accept(this); - indent = tmp; + if (op->otherwise.defined()) { + stream2 << 
"\n"; + doIndent(); + stream2 << keywordString("else"); + stream2 << " {\n"; + op->otherwise.accept(this); + doIndent(); + stream2 << "}"; + } + stream2 << endl; } + + else { - stream << endl; - op->then.accept(this); - } + stream << keywordString("if "); + stream << "("; + parentPrecedence = Precedence::TOP; + op->cond.accept(this); + stream << ")"; - if (op->otherwise.defined()) { - stream << "\n"; - doIndent(); - stream << keywordString("else"); - stream << " {\n"; - op->otherwise.accept(this); - doIndent(); - stream << "}"; + Stmt scopedStmt = Stmt(to(op->then)->scopedStmt); + if (isa(scopedStmt)) { + stream << " {" << endl; + op->then.accept(this); + doIndent(); + stream << "}"; + } + else if (isa(scopedStmt)) { + int tmp = indent; + indent = 0; + stream << " "; + scopedStmt.accept(this); + indent = tmp; + } + else { + stream << endl; + op->then.accept(this); + } + + if (op->otherwise.defined()) { + stream << "\n"; + doIndent(); + stream << keywordString("else"); + stream << " {\n"; + op->otherwise.accept(this); + doIndent(); + stream << "}"; + } + stream << endl; } - stream << endl; + } void IRPrinter::visit(const Case* op) { @@ -345,12 +482,22 @@ void IRPrinter::visit(const Switch* op) { } void IRPrinter::visit(const Load* op) { - parentPrecedence = Precedence::LOAD; - op->arr.accept(this); - stream << "["; - parentPrecedence = Precedence::LOAD; - op->loc.accept(this); - stream << "]"; + if (is_ISPC_code_stream_enabled()) { + parentPrecedence = Precedence::LOAD; + op->arr.accept(this); + stream2 << "["; + parentPrecedence = Precedence::LOAD; + op->loc.accept(this); + stream2 << "]"; + } + else { + parentPrecedence = Precedence::LOAD; + op->arr.accept(this); + stream << "["; + parentPrecedence = Precedence::LOAD; + op->loc.accept(this); + stream << "]"; + } } void IRPrinter::visit(const Malloc* op) { @@ -367,66 +514,149 @@ void IRPrinter::visit(const Sizeof* op) { } void IRPrinter::visit(const Store* op) { - doIndent(); - op->arr.accept(this); - stream << 
"["; - parentPrecedence = Precedence::TOP; - op->loc.accept(this); - stream << "] = "; - parentPrecedence = Precedence::TOP; - op->data.accept(this); - stream << ";"; - stream << endl; + if (is_ISPC_code_stream_enabled()) { + doIndent(); + op->arr.accept(this); + stream2 << "["; + parentPrecedence = Precedence::TOP; + op->loc.accept(this); + stream2 << "] = "; + parentPrecedence = Precedence::TOP; + op->data.accept(this); + stream2 << ";"; + stream2 << endl; + } + else { + doIndent(); + op->arr.accept(this); + stream << "["; + parentPrecedence = Precedence::TOP; + op->loc.accept(this); + stream << "] = "; + parentPrecedence = Precedence::TOP; + op->data.accept(this); + stream << ";"; + stream << endl; + } + } void IRPrinter::visit(const For* op) { - doIndent(); - stream << keywordString("for") << " (" - << keywordString(util::toString(op->var.type())) << " "; - op->var.accept(this); - stream << " = "; - op->start.accept(this); - stream << keywordString("; "); - op->var.accept(this); - stream << " < "; - parentPrecedence = BOTTOM; - op->end.accept(this); - stream << keywordString("; "); - op->var.accept(this); + std::cout << "This is IRPrinter::visit For op method\n"; + if (is_ISPC_code_stream_enabled()) { + doIndent(); + stream2 << keywordString("for") << " (" + << keywordString(util::toString(op->var.type())) << " "; + op->var.accept(this); + stream2 << " = "; + op->start.accept(this); + stream2 << keywordString("; "); + op->var.accept(this); + stream2 << " < "; + parentPrecedence = BOTTOM; + op->end.accept(this); + stream2 << keywordString("; "); + op->var.accept(this); + + auto lit = op->increment.as(); + if (lit != nullptr && ((lit->type.isInt() && lit->equalsScalar(1)) || + (lit->type.isUInt() && lit->equalsScalar(1)))) { + stream2 << "++"; + } + else { + stream2 << " += "; + op->increment.accept(this); + } + stream2 << ") {\n"; - auto lit = op->increment.as(); - if (lit != nullptr && ((lit->type.isInt() && lit->equalsScalar(1)) || - (lit->type.isUInt() && 
lit->equalsScalar(1)))) { - stream << "++"; + op->contents.accept(this); + doIndent(); + stream2 << "}"; + stream2 << endl; } + + else { - stream << " += "; - op->increment.accept(this); + doIndent(); + stream << keywordString("for") << " (" + << keywordString(util::toString(op->var.type())) << " "; + op->var.accept(this); + stream << " = "; + op->start.accept(this); + stream << keywordString("; "); + op->var.accept(this); + stream << " < "; + parentPrecedence = BOTTOM; + op->end.accept(this); + stream << keywordString("; "); + op->var.accept(this); + + auto lit = op->increment.as(); + if (lit != nullptr && ((lit->type.isInt() && lit->equalsScalar(1)) || + (lit->type.isUInt() && lit->equalsScalar(1)))) { + stream << "++"; + } + else { + stream << " += "; + op->increment.accept(this); + } + stream << ") {\n"; + + op->contents.accept(this); + doIndent(); + stream << "}"; + stream << endl; } - stream << ") {\n"; - op->contents.accept(this); - doIndent(); - stream << "}"; - stream << endl; +} + +void IRPrinter::sendToStream(std::stringstream &stream) { + if (is_ISPC_code_stream_enabled()) { + this->stream2 << stream.str(); + } + else { + this->stream << stream.str(); + } } void IRPrinter::visit(const While* op) { - doIndent(); - stream << keywordString("while "); - stream << "("; - parentPrecedence = Precedence::TOP; - op->cond.accept(this); - stream << ")"; - stream << " {\n"; - op->contents.accept(this); - doIndent(); - stream << "}"; - stream << endl; + // std::stringstream stream; + if (is_ISPC_code_stream_enabled()) { + doIndent(); + stream2 << keywordString("while "); + stream2 << "("; + parentPrecedence = Precedence::TOP; + op->cond.accept(this); + stream2 << ")"; + stream2 << " {\n"; + op->contents.accept(this); + doIndent(); + stream2 << "}"; + stream2 << endl; + } + else { + doIndent(); + stream << keywordString("while "); + stream << "("; + parentPrecedence = Precedence::TOP; + op->cond.accept(this); + stream << ")"; + stream << " {\n"; + 
op->contents.accept(this); + doIndent(); + stream << "}"; + stream << endl; + } + // sendToStream(stream); } void IRPrinter::visit(const Block* op) { - acceptJoin(this, stream, op->contents, ""); + if (is_ISPC_code_stream_enabled()) { + acceptJoin(this, stream2, op->contents, ""); + } + else { + acceptJoin(this, stream, op->contents, ""); + } } void IRPrinter::visit(const Scope* op) { @@ -438,85 +668,183 @@ void IRPrinter::visit(const Scope* op) { } void IRPrinter::visit(const Function* op) { - stream << keywordString("void ") << op->name; - stream << "("; - if (op->outputs.size() > 0) stream << "Tensor "; - acceptJoin(this, stream, op->outputs, ", Tensor "); - if (op->outputs.size() > 0 && op->inputs.size()) stream << ", "; - if (op->inputs.size() > 0) stream << "Tensor "; - acceptJoin(this, stream, op->inputs, ", Tensor "); - stream << ") {" << endl; + if (is_ISPC_code_stream_enabled()) { + stream2 << keywordString("void ") << op->name; + stream2 << "("; + if (op->outputs.size() > 0) stream2 << "Tensor "; + acceptJoin(this, stream2, op->outputs, ", Tensor "); + if (op->outputs.size() > 0 && op->inputs.size()) stream2 << ", "; + if (op->inputs.size() > 0) stream2 << "Tensor "; + acceptJoin(this, stream2, op->inputs, ", Tensor "); + stream2 << ") {" << endl; + + resetNameCounters(); + op->body.accept(this); + + doIndent(); + stream2 << "}"; + } + else { + stream << keywordString("void ") << op->name; + stream << "("; + if (op->outputs.size() > 0) stream << "Tensor "; + acceptJoin(this, stream, op->outputs, ", Tensor "); + if (op->outputs.size() > 0 && op->inputs.size()) stream << ", "; + if (op->inputs.size() > 0) stream << "Tensor "; + acceptJoin(this, stream, op->inputs, ", Tensor "); + stream << ") {" << endl; - resetNameCounters(); - op->body.accept(this); + resetNameCounters(); + op->body.accept(this); + + doIndent(); + stream << "}"; + } - doIndent(); - stream << "}"; } void IRPrinter::visit(const VarDecl* op) { - doIndent(); - stream << 
keywordString(util::toString(op->var.type())); - taco_iassert(isa(op->var)); - if (to(op->var)->is_ptr) { - stream << "* restrict"; - } - stream << " "; - string varName = varNameGenerator.getUniqueName(util::toString(op->var)); - varNames.insert({op->var, varName}); - op->var.accept(this); - parentPrecedence = Precedence::TOP; - stream << " = "; - op->rhs.accept(this); - stream << ";"; - stream << endl; + if (is_ISPC_code_stream_enabled()) { + doIndent(); + if (op->var.type() == Int32) { + stream2 << keywordString("int32"); + } + else if (op->var.type() == Int64) { + stream2 << keywordString("int64"); + } else { + stream2 << keywordString(util::toString(op->var.type())); + } + taco_iassert(isa(op->var)); + if (to(op->var)->is_ptr) { + stream2 << "* restrict"; + } + stream2 << " "; + string varName = varNameGenerator.getUniqueName(util::toString(op->var)); + varNames.insert({op->var, varName}); + op->var.accept(this); + parentPrecedence = Precedence::TOP; + stream2 << " = "; + op->rhs.accept(this); + stream2 << ";"; + stream2 << endl; + } + else { + doIndent(); + stream << keywordString(util::toString(op->var.type())); + taco_iassert(isa(op->var)); + if (to(op->var)->is_ptr) { + stream << "* restrict"; + } + stream << " "; + string varName = varNameGenerator.getUniqueName(util::toString(op->var)); + varNames.insert({op->var, varName}); + op->var.accept(this); + parentPrecedence = Precedence::TOP; + stream << " = "; + op->rhs.accept(this); + stream << ";"; + stream << endl; + } + } void IRPrinter::visit(const Assign* op) { - doIndent(); - op->lhs.accept(this); - parentPrecedence = Precedence::TOP; - bool printed = false; - if (simplify) { - if (isa(op->rhs)) { - auto add = to(op->rhs); - if (add->a == op->lhs) { - const Literal* lit = add->b.as(); - if (lit != nullptr && ((lit->type.isInt() && lit->equalsScalar(1)) || - (lit->type.isUInt() && lit->equalsScalar(1)))) { - stream << "++"; + if (is_ISPC_code_stream_enabled()) { + doIndent(); + op->lhs.accept(this); + 
parentPrecedence = Precedence::TOP; + bool printed = false; + if (simplify) { + if (isa(op->rhs)) { + auto add = to(op->rhs); + if (add->a == op->lhs) { + const Literal* lit = add->b.as(); + if (lit != nullptr && ((lit->type.isInt() && lit->equalsScalar(1)) || + (lit->type.isUInt() && lit->equalsScalar(1)))) { + stream2 << "++"; + } + else { + stream2 << " += "; + add->b.accept(this); + } + printed = true; } - else { - stream << " += "; - add->b.accept(this); + } + else if (isa(op->rhs)) { + auto mul = to(op->rhs); + if (mul->a == op->lhs) { + stream2 << " *= "; + mul->b.accept(this); + printed = true; } - printed = true; } - } - else if (isa(op->rhs)) { - auto mul = to(op->rhs); - if (mul->a == op->lhs) { - stream << " *= "; - mul->b.accept(this); - printed = true; + else if (isa(op->rhs)) { + auto bitOr = to(op->rhs); + if (bitOr->a == op->lhs) { + stream2 << " |= "; + bitOr->b.accept(this); + printed = true; + } } } - else if (isa(op->rhs)) { - auto bitOr = to(op->rhs); - if (bitOr->a == op->lhs) { - stream << " |= "; - bitOr->b.accept(this); - printed = true; - } + if (!printed) { + stream2 << " = "; + op->rhs.accept(this); } + + stream2 << ";"; + stream2 << endl; } - if (!printed) { - stream << " = "; - op->rhs.accept(this); + + + + else { + doIndent(); + op->lhs.accept(this); + parentPrecedence = Precedence::TOP; + bool printed = false; + if (simplify) { + if (isa(op->rhs)) { + auto add = to(op->rhs); + if (add->a == op->lhs) { + const Literal* lit = add->b.as(); + if (lit != nullptr && ((lit->type.isInt() && lit->equalsScalar(1)) || + (lit->type.isUInt() && lit->equalsScalar(1)))) { + stream << "++"; + } + else { + stream << " += "; + add->b.accept(this); + } + printed = true; + } + } + else if (isa(op->rhs)) { + auto mul = to(op->rhs); + if (mul->a == op->lhs) { + stream << " *= "; + mul->b.accept(this); + printed = true; + } + } + else if (isa(op->rhs)) { + auto bitOr = to(op->rhs); + if (bitOr->a == op->lhs) { + stream << " |= "; + bitOr->b.accept(this); 
+ printed = true; + } + } + } + if (!printed) { + stream << " = "; + op->rhs.accept(this); + } + + stream << ";"; + stream << endl; } - stream << ";"; - stream << endl; } void IRPrinter::visit(const Yield* op) { @@ -559,17 +887,32 @@ void IRPrinter::visit(const Comment* op) { } void IRPrinter::visit(const BlankLine*) { - stream << endl; + if (is_ISPC_code_stream_enabled()) { + stream2 << endl; + } + else { + stream << endl; + } } void IRPrinter::visit(const Continue*) { doIndent(); - stream << "continue;" << endl; + if (!is_ISPC_code_stream_enabled()) { + stream << "continue;" << endl; + } + else { + stream2 << "continue;" << endl; + } } void IRPrinter::visit(const Break*) { doIndent(); - stream << "break;" << endl; + if (!is_ISPC_code_stream_enabled()) { + stream << "break;" << endl; + } + else { + stream2 << "break;" << endl; + } } void IRPrinter::visit(const Print* op) { @@ -585,7 +928,12 @@ void IRPrinter::visit(const Print* op) { } void IRPrinter::visit(const GetProperty* op) { - stream << op->name; + if (is_ISPC_code_stream_enabled()) { + stream2 << op->name; + } + else { + stream << op->name; + } } void IRPrinter::visit(const Sort* op) { @@ -643,23 +991,47 @@ void IRPrinter::resetNameCounters() { } void IRPrinter::doIndent() { - for (int i=0; ivar); Expr start = rewrite(op->start); Expr end = rewrite(op->end); diff --git a/src/lower/lowerer_impl_imperative.cpp b/src/lower/lowerer_impl_imperative.cpp index b4c9ea710..53ffd936f 100644 --- a/src/lower/lowerer_impl_imperative.cpp +++ b/src/lower/lowerer_impl_imperative.cpp @@ -1,4 +1,5 @@ #include +#include "taco/cuda.h" #include "taco/lower/lowerer_impl_imperative.h" #include "taco/lower/lowerer_impl.h" @@ -26,6 +27,7 @@ class LowererImplImperative::Visitor : public IndexNotationVisitorStrict { public: Visitor(LowererImplImperative* impl) : impl(impl) {} Stmt lower(IndexStmt stmt) { + std::cout << "lowering IndexStmt to ir:Stmt - IndexStmt: " << stmt << std::endl; this->stmt = Stmt(); 
impl->accessibleIterators.scope(); IndexStmtVisitorStrict::visit(stmt); @@ -200,6 +202,7 @@ static std::set hasSparseInserts(IndexStmt stmt, Iterators iterators, return ret; } + Stmt LowererImplImperative::lower(IndexStmt stmt, string name, bool assemble, bool compute, bool pack, bool unpack) @@ -586,19 +589,27 @@ LowererImplImperative::splitAppenderAndInserters(const vector& results } +// important function +/* +* This is the for loop lowering part +*/ Stmt LowererImplImperative::lowerForall(Forall forall) { + std::cout << "doing lowerForall: " << forall << std::endl; bool hasExactBound = provGraph.hasExactBound(forall.getIndexVar()); bool forallNeedsUnderivedGuards = !hasExactBound && emitUnderivedGuards; if (!ignoreVectorize && forallNeedsUnderivedGuards && (forall.getParallelUnit() == ParallelUnit::CPUVector || forall.getUnrollFactor() > 0)) { + std::cout << "calling lowerForallCloned(forall)\n"; return lowerForallCloned(forall); } + std::cout << "inParallelLoopDepth: " << inParallelLoopDepth << "========================\n"; if (forall.getParallelUnit() != ParallelUnit::NotParallel) { inParallelLoopDepth++; } + std::cout << "inParallelLoopDepth: " << inParallelLoopDepth << "========================\n"; // Recover any available parents that were not recoverable previously vector recoverySteps; @@ -786,19 +797,23 @@ Stmt LowererImplImperative::lowerForall(Forall forall) } if (!isWhereProducer && hasPosDescendant && underivedAncestors.size() > 1 && provGraph.isPosVariable(iterator.getIndexVar()) && posDescendant == forall.getIndexVar()) { + std::cout << "calling lowerForallFusedPosition(forall\n"; loops = lowerForallFusedPosition(forall, iterator, locators, inserters, appenders, reducedAccesses, recoveryStmt); } else if (canAccelWithSparseIteration) { + std::cout << "calling lowerForallDenseAcceleration(forall\n"; loops = lowerForallDenseAcceleration(forall, locators, inserters, appenders, reducedAccesses, recoveryStmt); } // Emit dimension coordinate iteration 
loop else if (iterator.isDimensionIterator()) { + std::cout << "calling lowerForallDimension(forall\n"; loops = lowerForallDimension(forall, point.locators(), inserters, appenders, reducedAccesses, recoveryStmt); } // Emit position iteration loop else if (iterator.hasPosIter()) { + std::cout << "calling lowerForallPosition(forall\n"; loops = lowerForallPosition(forall, iterator, locators, inserters, appenders, reducedAccesses, recoveryStmt); } @@ -816,6 +831,10 @@ Stmt LowererImplImperative::lowerForall(Forall forall) loops = lowerMergeLattice(lattice, underivedAncestors[0], forall.getStmt(), reducedAccesses); } + + std::cout << "printing loops ----------------------------------------------------------------------------------------------\n"; + std::cout << loops << std::endl; + std::cout << "loops printed -----------------------------------------------------------------------------------------------\n"; // taco_iassert(loops.defined()); if (!generateComputeCode() && !hasStores(loops)) { @@ -832,6 +851,7 @@ Stmt LowererImplImperative::lowerForall(Forall forall) parallelUnitIndexVars.erase(forall.getParallelUnit()); parallelUnitSizes.erase(forall.getParallelUnit()); } + return Block::blanks(preInitValues, temporaryValuesInitFree[0], loops, @@ -1136,6 +1156,7 @@ Stmt LowererImplImperative::lowerForallDimension(Forall forall, set reducedAccesses, ir::Stmt recoveryStmt) { + std::cout << "1 Stmt LowererImplImperative::lowerForallDimension\n"; Expr coordinate = getCoordinateVar(forall.getIndexVar()); if (forall.getParallelUnit() != ParallelUnit::NotParallel && forall.getOutputRaceStrategy() == OutputRaceStrategy::Atomics) { @@ -1143,6 +1164,8 @@ Stmt LowererImplImperative::lowerForallDimension(Forall forall, atomicParallelUnit = forall.getParallelUnit(); } + std::cout << "original forall : " << forall << std::endl; + std::cout << "inside IndexStmt: " << forall.getStmt() << std::endl; Stmt body = lowerForallBody(coordinate, forall.getStmt(), locators, inserters, appenders, 
reducedAccesses); @@ -1158,7 +1181,13 @@ Stmt LowererImplImperative::lowerForallDimension(Forall forall, std::vector bounds = provGraph.deriveIterBounds(forall.getIndexVar(), definedIndexVarsOrdered, underivedBounds, indexVarToExprMap, iterators); LoopKind kind = LoopKind::Serial; - if (forall.getParallelUnit() == ParallelUnit::CPUVector && !ignoreVectorize) { + if (should_use_ISPC_codegen()) { + std::cout << "Foreach compatible loop\n"; + if (forall.getParallelUnit() == ParallelUnit::CPUVector && !ignoreVectorize) { + kind = LoopKind::Foreach; + } + } + else if (forall.getParallelUnit() == ParallelUnit::CPUVector && !ignoreVectorize) { kind = LoopKind::Vectorized; } else if (forall.getParallelUnit() != ParallelUnit::NotParallel @@ -1166,6 +1195,7 @@ Stmt LowererImplImperative::lowerForallDimension(Forall forall, kind = LoopKind::Runtime; } + std::cout << "2 Stmt LowererImplImperative::lowerForallDimension\n"; return Block::blanks(For::make(coordinate, bounds[0], bounds[1], 1, body, kind, ignoreVectorize ? ParallelUnit::NotParallel : forall.getParallelUnit(), ignoreVectorize ? 
0 : forall.getUnrollFactor()), @@ -1179,6 +1209,7 @@ Stmt LowererImplImperative::lowerForallDimension(Forall forall, set reducedAccesses, ir::Stmt recoveryStmt) { + std::cout << "1 Stmt LowererImplImperative::lowerForallDenseAcceleration\n"; taco_iassert(locators.size() == 1) << "Optimizing a dense workspace is only supported when the consumer is the only RHS tensor"; taco_iassert(provGraph.isFullyDerived(forall.getIndexVar())) << "Sparsely accelerating a dense workspace only works with fully derived index vars"; taco_iassert(forall.getParallelUnit() == ParallelUnit::NotParallel) << "Sparsely accelerating a dense workspace only works within serial loops"; @@ -1204,6 +1235,8 @@ Stmt LowererImplImperative::lowerForallDimension(Forall forall, } Stmt declareVar = VarDecl::make(coordinate, Load::make(indexList, loopVar)); + std::cout << "original forall : " << forall << std::endl; + std::cout << "inside IndexStmt: " << forall.getStmt() << std::endl; Stmt body = lowerForallBody(coordinate, forall.getStmt(), locators, inserters, appenders, reducedAccesses); Stmt resetGuard = ir::Store::make(bitGuard, coordinate, ir::Literal::make(false), markAssignsAtomicDepth > 0, atomicParallelUnit); @@ -1216,7 +1249,12 @@ Stmt LowererImplImperative::lowerForallDimension(Forall forall, Stmt posAppend = generateAppendPositions(appenders); LoopKind kind = LoopKind::Serial; - if (forall.getParallelUnit() == ParallelUnit::CPUVector && !ignoreVectorize) { + if (should_use_ISPC_codegen()) { + if (forall.getParallelUnit() == ParallelUnit::CPUVector && !ignoreVectorize) { + kind = LoopKind::Foreach; + } + } + else if (forall.getParallelUnit() == ParallelUnit::CPUVector && !ignoreVectorize) { kind = LoopKind::Vectorized; } else if (forall.getParallelUnit() != ParallelUnit::NotParallel @@ -1224,6 +1262,7 @@ Stmt LowererImplImperative::lowerForallDimension(Forall forall, kind = LoopKind::Runtime; } + std::cout << "2 Stmt LowererImplImperative::lowerForallDenseAcceleration\n"; return 
Block::blanks(For::make(loopVar, 0, indexListSize, 1, body, kind, ignoreVectorize ? ParallelUnit::NotParallel : forall.getParallelUnit(), ignoreVectorize ? 0 : forall.getUnrollFactor()), @@ -1247,6 +1286,8 @@ Stmt LowererImplImperative::lowerForallPosition(Forall forall, Iterator iterator set reducedAccesses, ir::Stmt recoveryStmt) { + std::cout << "1 Stmt LowererImplImperative::lowerForallPosition\n" << std::endl; + Expr coordinate = getCoordinateVar(forall.getIndexVar()); Stmt declareCoordinate = Stmt(); Stmt strideGuard = Stmt(); @@ -1278,6 +1319,11 @@ Stmt LowererImplImperative::lowerForallPosition(Forall forall, Iterator iterator markAssignsAtomicDepth++; } + // see we are inside a forall. ex: forall(i, forall(j, y(i) += A(i,j) * x(j))) + // when you call forall.getStmt it returns forall(j, y(i) += A(i,j) * x(j)) which is the + // IndexStmt inside the forall IndexStmt + std::cout << "original forall : " << forall << std::endl; + std::cout << "inside IndexStmt: " << forall.getStmt() << std::endl; Stmt body = lowerForallBody(coordinate, forall.getStmt(), locators, inserters, appenders, reducedAccesses); @@ -1339,6 +1385,7 @@ Stmt LowererImplImperative::lowerForallPosition(Forall forall, Iterator iterator kind = LoopKind::Runtime; } + std::cout << "2 Stmt LowererImplImperative::lowerForallPosition\n" << std::endl; // Loop with preamble and postamble return Block::blanks( boundsCompute, @@ -1357,6 +1404,7 @@ Stmt LowererImplImperative::lowerForallFusedPosition(Forall forall, Iterator ite set reducedAccesses, ir::Stmt recoveryStmt) { + std::cout << "1 Stmt LowererImplImperative::lowerForallFusedPosition\n" << std::endl; Expr coordinate = getCoordinateVar(forall.getIndexVar()); Stmt declareCoordinate = Stmt(); if (provGraph.isCoordVariable(forall.getIndexVar())) { @@ -1447,6 +1495,8 @@ Stmt LowererImplImperative::lowerForallFusedPosition(Forall forall, Iterator ite markAssignsAtomicDepth++; } + std::cout << "original forall : " << forall << std::endl; + std::cout << 
"inside IndexStmt: " << forall.getStmt() << std::endl; Stmt body = lowerForallBody(coordinate, forall.getStmt(), locators, inserters, appenders, reducedAccesses); @@ -1503,6 +1553,8 @@ Stmt LowererImplImperative::lowerForallFusedPosition(Forall forall, Iterator ite && forall.getOutputRaceStrategy() != OutputRaceStrategy::ParallelReduction && !ignoreVectorize) { kind = LoopKind::Runtime; } + + std::cout << "2 Stmt LowererImplImperative::lowerForallFusedPosition\n" << std::endl; // Loop with preamble and postamble return Block::blanks(boundsCompute, Block::make(Block::make(searchForUnderivedStart), @@ -1765,6 +1817,9 @@ Stmt LowererImplImperative::lowerForallBody(Expr coordinate, IndexStmt stmt, vector inserters, vector appenders, const set& reducedAccesses) { + + std::cout << "lowering a forall body----------------------------------------------------\n"; + Stmt initVals = resizeAndInitValues(appenders, reducedAccesses); // Inserter positions @@ -1780,6 +1835,7 @@ Stmt LowererImplImperative::lowerForallBody(Expr coordinate, IndexStmt stmt, // Code of loop body statement Stmt body = lower(stmt); + std::cout << "\nBefore: [" << stmt << "]\nAfter : [" << body << "]\n"; // Code to append coordinates Stmt appendCoords = appendCoordinate(appenders, coordinate); @@ -1889,6 +1945,7 @@ vector LowererImplImperative::codeToInitializeDenseAcceleratorArrays(Where Expr p = Var::make("p" + temporary.getName(), Int()); Stmt guardZeroInit = Store::make(alreadySetArr, p, ir::Literal::zero(bitGuardType)); + std::cout << "vector LowererImplImperative::codeToInitializeDenseAcceleratorArrays\n" << std::endl; Stmt zeroInitLoop = For::make(p, 0, bitGuardSize, 1, guardZeroInit, LoopKind::Serial); Stmt inits = Block::make(alreadySetDecl, indexListDecl, allocateAlreadySet, allocateIndexList, zeroInitLoop); return {inits, freeTemps}; @@ -2203,6 +2260,7 @@ Stmt LowererImplImperative::lowerWhere(Where where) { true, false); Expr size = getTemporarySize(where); Stmt zeroInit = Store::make(values, 
p, ir::Literal::zero(temporary.getType().getDataType())); + std::cout << "Stmt LowererImplImperative::lowerWhere\n"; Stmt loopInit = For::make(p, 0, size, 1, zeroInit, LoopKind::Serial); initializeTemporary = Block::make(initializeTemporary, loopInit); } @@ -2334,6 +2392,7 @@ Stmt LowererImplImperative::lowerAssemble(Assemble assemble) { resultModeOrdering[iter.getMode().getLevel() - 1]); Expr pos = iter.getPosVar(); Stmt initPos = VarDecl::make(pos, iter.locate(locateCoords)[0]); + std::cout << "Stmt LowererImplImperative::lowerAssemble\n"; insertEdgeLoop = For::make(coords.back(), 0, dim, 1, Block::make(initPos, insertEdgeLoop)); } else { @@ -2415,6 +2474,7 @@ Stmt LowererImplImperative::lowerMulti(Multi multi) { } Stmt LowererImplImperative::lowerSuchThat(SuchThat suchThat) { + std::cout << "lowering such that statement\n"; Stmt stmt = lower(suchThat.getStmt()); return Block::make(stmt); } @@ -2942,6 +3002,7 @@ Stmt LowererImplImperative::resizeAndInitValues(const std::vector& app Stmt LowererImplImperative::zeroInitValues(Expr tensor, Expr begin, Expr size) { + std::cout << "1 Stmt LowererImplImperative::zeroInitValues\n"; Expr lower = simplify(ir::Mul::make(begin, size)); Expr upper = simplify(ir::Mul::make(ir::Add::make(begin, 1), size)); Expr p = Var::make("p" + util::toString(tensor), Int()); @@ -2954,6 +3015,10 @@ Stmt LowererImplImperative::zeroInitValues(Expr tensor, Expr begin, Expr size) { return ir::VarDecl::make(ir::Var::make("status", Int()), ir::Call::make("cudaMemset", {values, ir::Literal::make(0, Int()), ir::Mul::make(ir::Sub::make(upper, lower), ir::Literal::make(values.type().getNumBytes()))}, Int())); } + std::cout << "2 Stmt LowererImplImperative::zeroInitValues\n"; + if (should_use_ISPC_codegen()) { + return For::make(p, lower, upper, 1, zeroInit, LoopKind::Foreach); + } return For::make(p, lower, upper, 1, zeroInit, parallel); } diff --git a/src/tensor.cpp b/src/tensor.cpp index fab437ff1..3519456c9 100644 --- a/src/tensor.cpp +++ 
b/src/tensor.cpp @@ -278,6 +278,7 @@ static size_t unpackTensorData(const taco_tensor_t& tensorData, /// Pack coordinates into a data structure given by the tensor format. void TensorBase::pack() { + std::cout << "TensorBase::Pack() method\n"; if (!needsPack()) { return; } @@ -346,6 +347,7 @@ void TensorBase::pack() { taco_iassert((content->coordinateBufferUsed % content->coordinateSize) == 0); const size_t numCoordinates = content->coordinateBufferUsed / content->coordinateSize; + std::cout << "call helperFuncs\n"; const auto helperFuncs = getHelperFunctions(getFormat(), getComponentType(), dimensions); @@ -623,6 +625,7 @@ void TensorBase::compile() { compile(stmt, content->assembleWhileCompute); } void TensorBase::compile(taco::IndexStmt stmt, bool assembleWhileCompute) { + std::cout << "TensorBase::compile\n"; if (!needsCompile()) { return; } @@ -934,6 +937,7 @@ TensorBase::getHelperFunctions(const Format& format, Datatype ctype, }; const auto dims = util::map(dimensions, getDim); + set_ISPC_code_stream_enabled(false); if (format.getOrder() > 0) { const Format bufferFormat = COO(format.getOrder(), false, true, false, format.getModeOrdering()); @@ -951,6 +955,7 @@ TensorBase::getHelperFunctions(const Format& format, Datatype ctype, } // Lower packing and iterator code. + std::cout << "1 Lower packing and iterator code\n"; helperModule->addFunction(lower(packStmt, "pack", true, true)); helperModule->addFunction(lower(iterateStmt, "iterate", false, true)); } else { @@ -964,12 +969,14 @@ TensorBase::getHelperFunctions(const Format& format, Datatype ctype, IndexVar indexVar; IndexStmt assignment = (packedScalar() = bufferVector(indexVar)); IndexStmt packStmt= makeConcreteNotation(makeReductionNotation(assignment)); + std::cout << "2 Lower packing and iterator code\n"; helperModule->addFunction(lower(packStmt, "pack", true, true)); // Define and lower iterator code. 
IndexStmt iterateStmt = Yield({}, packedScalar()); helperModule->addFunction(lower(iterateStmt, "iterate", false, true)); } + std::cout << "Compiling the helperModule\n"; helperModule->compile(); helperFunctionsMutex.lock(); diff --git a/test/tests-scheduling-eval.cpp b/test/tests-scheduling-eval.cpp index f59359081..6a228f38b 100644 --- a/test/tests-scheduling-eval.cpp +++ b/test/tests-scheduling-eval.cpp @@ -4,6 +4,7 @@ #include #include #include +#include "taco/cuda.h" #include "test.h" #include "test_tensors.h" #include "taco/tensor.h" @@ -48,10 +49,10 @@ IndexStmt scheduleSpMVCPU(IndexStmt stmt, int CHUNK_SIZE=16) { IndexStmt scheduleSpMVISPC(IndexStmt stmt, int CHUNK_SIZE=16) { IndexVar i0("i0"), i1("i1"), kpos("kpos"), kpos0("kpos0"), kpos1("kpos1"); - return stmt; - // return stmt.split(i, i0, i1, CHUNK_SIZE) - // .reorder({i0, i1, j}) - // .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces); + // return stmt; + return stmt.split(i, i0, i1, CHUNK_SIZE) + .reorder({i0, i1, j}) + .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces); } IndexStmt scheduleSpMMCPU(IndexStmt stmt, Tensor A, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { @@ -64,6 +65,16 @@ IndexStmt scheduleSpMMCPU(IndexStmt stmt, Tensor A, int CHUNK_SIZE=16, i .parallelize(k, ParallelUnit::CPUVector, OutputRaceStrategy::IgnoreRaces); } +IndexStmt scheduleSpMMISPC(IndexStmt stmt, Tensor A, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { + IndexVar i0("i0"), i1("i1"), kbounded("kbounded"), k0("k0"), k1("k1"), jpos("jpos"), jpos0("jpos0"), jpos1("jpos1"); + return stmt.split(i, i0, i1, CHUNK_SIZE) + .pos(j, jpos, A(i,j)) + .split(jpos, jpos0, jpos1, UNROLL_FACTOR) + .reorder({i0, i1, jpos0, k, jpos1}) + .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces) + .parallelize(k, ParallelUnit::CPUVector, OutputRaceStrategy::IgnoreRaces); +} + IndexStmt scheduleSpGEMMCPU(IndexStmt stmt, bool doPrecompute) { Assignment assign = stmt.as().getStmt().as().getStmt() 
.as().getStmt().as(); @@ -1473,8 +1484,6 @@ TEST(scheduling_eval, mttkrpGPU) { ASSERT_TENSOR_EQ(expected, A); } - - TEST(generate_ispc_evaluation_files, ispc) { std::cout << "Hi Adhitha!\n" << std::endl ; set_CUDA_codegen_enabled(false); @@ -1495,15 +1504,18 @@ TEST(generate_ispc_evaluation_files, ispc) { int NUM_I = 100; int NUM_J = 100; + int NUM_K = 100; + string c_file_ending = ".h"; string file_ending = ".ispc"; string file_path = "eval_prepared_ispc/"; mkdir(file_path.c_str(), 0777); // spmv { - stringstream source; - std::shared_ptr codegen = ir::CodeGen::init_default(source, ir::CodeGen::ImplementationGen); + stringstream source1; + stringstream source2; + std::shared_ptr codegen = ir::CodeGen::init_default(source1, source2, ir::CodeGen::ImplementationGen); Tensor A("A", {NUM_I, NUM_J}, CSR); Tensor x("x", {NUM_J}, {Dense}); Tensor y("y", {NUM_I}, {Dense}); @@ -1511,18 +1523,53 @@ TEST(generate_ispc_evaluation_files, ispc) { std::cout << "concretizing the assignment statement\n"; IndexStmt stmt = y.getAssignment().concretize(); std::cout << "Printing the original IndexStmt: " << stmt << std::endl; + for (auto paramSet : spmv_parameters) { std::cout << "param set: " << paramSet[0] << std::endl; IndexStmt scheduled = scheduleSpMVISPC(stmt, paramSet[0]); std::cout << "scheduled IndexStmt: " << scheduled << std::endl; - ir::Stmt compute = lower(scheduled, "spmv_csr_ispc_taco", false, true); + ir::Stmt compute = lower(scheduled, string("compute_") + util::join(paramSet, "_"), false, true); std::cout << "computed statement: \n" << compute << std::endl; codegen->compile(compute, false); } ofstream source_file; - source_file.open(file_path + "spmv_csr_ispc_taco.h"); - source_file << source.str(); + source_file.open(file_path + "spmv_csr_ispc_taco" + c_file_ending); + source_file << source1.str(); + source_file.close(); + + ofstream ispc_source_file; + ispc_source_file.open(file_path + "__spmv_csr_ispc_taco" + file_ending); + ispc_source_file << source2.str(); + 
ispc_source_file.close(); + + } + + // spmm + { + stringstream source1; + stringstream source2; + std::shared_ptr codegen = ir::CodeGen::init_default(source1, source2, ir::CodeGen::ImplementationGen); + Tensor A("A", {NUM_I, NUM_J}, CSR); + Tensor B("B", {NUM_J, NUM_K}, {Dense, Dense}); + Tensor C("C", {NUM_I, NUM_K}, {Dense, Dense}); + C(i, k) = A(i, j) * B(j, k); + IndexStmt stmt = C.getAssignment().concretize(); + bool isFirst = true; + for (auto paramSet : spmm_parameters) { + IndexStmt scheduled = scheduleSpMMISPC(stmt, A, paramSet[0], paramSet[1]); + ir::Stmt compute = lower(scheduled, string("compute_") + util::join(paramSet, "_"), false, true); + codegen->compile(compute, isFirst); + isFirst = false; + } + ofstream source_file; + source_file.open(file_path + "spmm_csr_ispc_taco" + c_file_ending); + source_file << source1.str(); source_file.close(); + + ofstream ispc_source_file; + ispc_source_file.open(file_path + "__spmm_csr_ispc_taco" + file_ending); + ispc_source_file << source2.str(); + ispc_source_file.close(); } @@ -1846,9 +1893,13 @@ TEST(generate_evaluation_files, cpu) { } TEST(generate_evaluation_files, gpu) { - if (!should_use_CUDA_codegen()) { - return; - } + // if (!should_use_CUDA_codegen()) { + // return; + // } + set_CUDA_codegen_enabled(true); + set_ISPC_codegen_enabled(false); + + std::cout << "executing generate_evaluation_file.gpu\n"; vector> spmv_parameters = {}; // {NNZ_PER_THREAD, BLOCK_SIZE} for (int i = 3; i <= 20; i++) { diff --git a/tools/taco.cpp b/tools/taco.cpp index ce03b61e1..9a864a699 100644 --- a/tools/taco.cpp +++ b/tools/taco.cpp @@ -1297,6 +1297,7 @@ int main(int argc, char* argv[]) { } bool hasPrinted = false; + std::shared_ptr codegen = ir::CodeGen::init_default(cout, ir::CodeGen::ImplementationGen); codegen->setColor(color); if (printAssemble) { @@ -1317,6 +1318,7 @@ int main(int argc, char* argv[]) { } if (compute.defined()) { + std::cout << "Code generation\n"; codegen->compile(compute, false); } else { From 
4e7bd6879c5f7ca1f43397dff5cc92259a7e1eda Mon Sep 17 00:00:00 2001 From: Adhhitha Dias Date: Mon, 19 Jul 2021 15:13:47 -0400 Subject: [PATCH 04/10] add CPUSpmd directive partially --- include/taco/index_notation/transformations.h | 2 + include/taco/ir/ir.h | 2 +- include/taco/ir_tags.h | 2 +- include/taco/lower/lowerer_impl_imperative.h | 3 + src/codegen/codegen.cpp | 66 ----- src/codegen/codegen.h | 13 +- src/codegen/codegen_cuda.cpp | 1 + src/codegen/codegen_ispc.cpp | 257 +++++++++++++++++- src/codegen/codegen_ispc.h | 7 + src/index_notation/index_notation_printer.cpp | 4 +- src/index_notation/transformations.cpp | 62 ++++- src/ir/ir_printer.cpp | 43 --- src/ir_tags.cpp | 2 +- src/lower/lowerer_impl_imperative.cpp | 76 +++++- src/tensor.cpp | 1 + test/tests-scheduling-eval.cpp | 207 +++++++++++++- tools/taco.cpp | 47 +++- 17 files changed, 647 insertions(+), 148 deletions(-) diff --git a/include/taco/index_notation/transformations.h b/include/taco/index_notation/transformations.h index 7aa2579ad..6bf277d5c 100644 --- a/include/taco/index_notation/transformations.h +++ b/include/taco/index_notation/transformations.h @@ -223,6 +223,8 @@ IndexStmt parallelizeOuterLoop(IndexStmt stmt); */ IndexStmt reorderLoopsTopologically(IndexStmt stmt); +IndexStmt justTraverseThroughTheIndexStmt(IndexStmt stmt); + /** * Performs scalar promotion so that reductions are done by accumulating into * scalar temporaries whenever possible. diff --git a/include/taco/ir/ir.h b/include/taco/ir/ir.h index cb46b5142..651faff4e 100644 --- a/include/taco/ir/ir.h +++ b/include/taco/ir/ir.h @@ -591,7 +591,7 @@ struct Switch : public StmtNode { static const IRNodeType _type_info = IRNodeType::Switch; }; -enum class LoopKind {Serial, Static, Dynamic, Runtime, Vectorized, Static_Chunked, Foreach}; +enum class LoopKind {Serial, Static, Dynamic, Runtime, Vectorized, Static_Chunked, Foreach, Mul_Thread}; /** A for loop from start to end by increment. 
* A vectorized loop will require the increment to be 1 and the diff --git a/include/taco/ir_tags.h b/include/taco/ir_tags.h index 5858a13e3..6a74be173 100644 --- a/include/taco/ir_tags.h +++ b/include/taco/ir_tags.h @@ -9,7 +9,7 @@ namespace taco { /// ParallelUnit::GPUWarp can be optionally used to allow for GPU warp-level primitives /// ParallelUnit::GPUThread causes for every iteration to be executed on a separate GPU thread enum class ParallelUnit { - NotParallel, DefaultUnit, GPUBlock, GPUWarp, GPUThread, CPUThread, CPUVector, CPUThreadGroupReduction, GPUBlockReduction, GPUWarpReduction + NotParallel, DefaultUnit, GPUBlock, GPUWarp, GPUThread, CPUThread, CPUVector, CPUThreadGroupReduction, GPUBlockReduction, GPUWarpReduction, CPUSimd, CPUSpmd }; extern const char *ParallelUnit_NAMES[]; diff --git a/include/taco/lower/lowerer_impl_imperative.h b/include/taco/lower/lowerer_impl_imperative.h index 65f069fda..d743f5875 100644 --- a/include/taco/lower/lowerer_impl_imperative.h +++ b/include/taco/lower/lowerer_impl_imperative.h @@ -499,10 +499,13 @@ class LowererImplImperative : public LowererImpl { bool emitUnderivedGuards = true; + int loopDepth = 0; int inParallelLoopDepth = 0; std::map parallelUnitSizes; std::map parallelUnitIndexVars; + std::map forUnits; // + std::map whereTempsWithLoopDepth; /// Keep track of what IndexVars have already been defined std::set definedIndexVars; diff --git a/src/codegen/codegen.cpp b/src/codegen/codegen.cpp index 750f33516..7081bc195 100644 --- a/src/codegen/codegen.cpp +++ b/src/codegen/codegen.cpp @@ -441,72 +441,6 @@ string CodeGen::printDecls(map varMap, return ret.str(); } -string CodeGen::printCallISPCFunc(const Function *func, map varMap, - vector &sortedProps) { - std::stringstream ret; - ret << " "; - unordered_set propsAlreadyGenerated; - - ret << "__" << func->name << "("; - - vector inputs = func->inputs; - vector outputs = func->outputs; - getSortedProps(varMap, sortedProps, inputs, outputs); - - for (unsigned long 
i=0; i < sortedProps.size(); i++) { - ret << varMap[sortedProps[i]]; - if (i != sortedProps.size()-1) { - ret << ", "; - } - propsAlreadyGenerated.insert(varMap[sortedProps[i]]); - } - - ret << ");\n"; - return ret.str(); -} - -string CodeGen::printISPCFunc(const Function *func, map varMap, - vector &sortedProps) { - std::stringstream ret; - ret << "export void "; - unordered_set propsAlreadyGenerated; - - ret << "__" << func->name << "("; - - vector inputs = func->inputs; - vector outputs = func->outputs; - // getSortedProps(varMap, sortedProps, inputs, outputs); - - for (unsigned long i=0; i < sortedProps.size(); i++) { - auto prop = sortedProps[i]; - bool isOutputProp = (find(outputs.begin(), outputs.end(), - prop->tensor) != outputs.end()); - - auto var = prop->tensor.as(); - if (var->is_parameter) { - if (isOutputProp) { - ret << " " << printTensorProperty(varMap[prop], prop, false) << ";" << endl; - } else { - break; - } - } else { - ret << getUnpackedTensorArgument(varMap[prop], prop, isOutputProp); - } - propsAlreadyGenerated.insert(varMap[prop]); - - if (i!=sortedProps.size()-1) { - ret << ", "; - } - if (i%2==0) { - ret << "\n\t"; - } - } - ret << ") {\n"; - - return ret.str(); -} - - string CodeGen::printPack(map, string> outputProperties, vector outputs) { stringstream ret; diff --git a/src/codegen/codegen.h b/src/codegen/codegen.h index 641239834..db891f995 100644 --- a/src/codegen/codegen.h +++ b/src/codegen/codegen.h @@ -49,10 +49,6 @@ class CodeGen : public IRPrinter { std::string printContextDeclAndInit(std::map varMap, std::vector localVars, int labels, std::string funcName); - std::string printCallISPCFunc(const Function *func, std::map varMap, - std::vector &sortedProps); - std::string printISPCFunc(const Function *func, std::map varMap, - std::vector &sortedProps); std::string printDecls(std::map varMap, std::vector inputs, std::vector outputs); std::string printPack(std::map, @@ -63,6 +59,10 @@ class CodeGen : public IRPrinter { std::string 
printFuncName(const Function *func, std::map inputMap={}, std::map outputMap={}); + + std::string printTensorProperty(std::string varname, const GetProperty* op, bool is_ptr); + std::string getUnpackedTensorArgument(std::string varname, const GetProperty* op, + bool is_output_prop); void resetUniqueNameCounters(); std::string genUniqueName(std::string name); @@ -72,11 +72,8 @@ class CodeGen : public IRPrinter { private: virtual std::string restrictKeyword() const { return ""; } - std::string printTensorProperty(std::string varname, const GetProperty* op, bool is_ptr); std::string unpackTensorProperty(std::string varname, const GetProperty* op, - bool is_output_prop); - std::string getUnpackedTensorArgument(std::string varname, const GetProperty* op, - bool is_output_prop); + bool is_output_prop); std::string packTensorProperty(std::string varname, Expr tnsr, TensorProperty property, int mode, int index); std::string pointTensorProperty(std::string varname); diff --git a/src/codegen/codegen_cuda.cpp b/src/codegen/codegen_cuda.cpp index 77cf0cd88..14505f740 100644 --- a/src/codegen/codegen_cuda.cpp +++ b/src/codegen/codegen_cuda.cpp @@ -646,6 +646,7 @@ void CodeGen_CUDA::printDeviceFunctions(const Function* func) { // Collect device functions resetUniqueNameCounters(); deviceFunctionLoopDepth = 0; + // here they calculate the device FunctionCollecor DeviceFunctionCollector deviceFunctionCollector(func->inputs, func->outputs, this); func->body.accept(&deviceFunctionCollector); deviceFunctions = deviceFunctionCollector.blockFors; diff --git a/src/codegen/codegen_ispc.cpp b/src/codegen/codegen_ispc.cpp index f107728cc..c8480cd25 100644 --- a/src/codegen/codegen_ispc.cpp +++ b/src/codegen/codegen_ispc.cpp @@ -7,6 +7,9 @@ #include "taco/cuda.h" #include "taco/ir/ir_visitor.h" +#include "taco/ir/ir_rewriter.h" +#include "taco/ir/simplify.h" + #include "codegen_ispc.h" #include "taco/error.h" #include "taco/util/strings.h" @@ -240,6 +243,121 @@ class CodeGen_ISPC::FindVars 
: public IRVisitor { } }; + +// Finds all for loops tagged with accelerator and adds statements to deviceFunctions +// Also tracks scope of when device function is called and +// tracks which variables must be passed to function. +class CodeGen_ISPC::DeviceFunctionCollector : public IRVisitor { +public: + vector blockFors; + vector threadFors; // contents is device function + vector warpFors; + map scopeMap; + + // the variables to pass to each device function + vector>> functionParameters; + vector> currentParameters; // keep as vector so code generation is deterministic + set currentParameterSet; + + set variablesDeclaredInKernel; + + vector> threadIDVars; + vector> blockIDVars; + vector> warpIDVars; + vector numThreads; + vector numWarps; + + CodeGen_ISPC *codeGen; + // copy inputs and outputs into the map + DeviceFunctionCollector(vector inputs, vector outputs, CodeGen_ISPC *codeGen) : codeGen(codeGen) { + inDeviceFunction = false; + for (auto v: inputs) { + auto var = v.as(); + taco_iassert(var) << "Inputs must be vars in codegen"; + taco_iassert(scopeMap.count(var) == 0) << + "Duplicate input found in codegen"; + scopeMap[var] = var->name; + } + for (auto v: outputs) { + auto var = v.as(); + taco_iassert(var) << "Outputs must be vars in codegen"; + taco_iassert(scopeMap.count(var) == 0) << + "Duplicate output found in codegen"; + + scopeMap[var] = var->name; + } + } + +protected: + bool inDeviceFunction; + using IRVisitor::visit; + + virtual void visit(const For *op) { + if (op->parallel_unit == ParallelUnit::CPUSpmd) { + std::cout << "ParallelUnit::CPUSpmd directive found\n"; + inDeviceFunction = false; + op->var.accept(this); + inDeviceFunction = true; + + threadFors.push_back(op); + std::cout << "scopeMap: [" << scopeMap[op->var] << "], varExpr: [" << op->var << "]\n"; + threadIDVars.push_back(pair(scopeMap[op->var], op->var)); + Expr blockSize = ir::simplify(ir::Div::make(ir::Sub::make(op->end, op->start), op->increment)); + 
numThreads.push_back(blockSize); + + } + else if (op->parallel_unit == ParallelUnit::CPUSimd) { + + } + else{ + op->var.accept(this); + } + op->start.accept(this); + op->end.accept(this); + op->increment.accept(this); + op->contents.accept(this); + } + + virtual void visit(const Var *op) { + if (scopeMap.count(op) == 0) { + string name = codeGen->genUniqueName(op->name); + if (!inDeviceFunction) { + scopeMap[op] = name; + } + } + else if (scopeMap.count(op) == 1 && inDeviceFunction && currentParameterSet.count(op) == 0 + && (threadIDVars.empty() || op != threadIDVars.back().second) + && !variablesDeclaredInKernel.count(op)) { + currentParameters.push_back(pair(scopeMap[op], op)); + currentParameterSet.insert(op); + } + } + + virtual void visit(const VarDecl *op) { + if (inDeviceFunction) { + variablesDeclaredInKernel.insert(op->var); + } + op->var.accept(this); + op->rhs.accept(this); + } + + virtual void visit(const GetProperty *op) { + if (scopeMap.count(op->tensor) == 0 && !inDeviceFunction) { + auto key = + tuple(op->tensor,op->property, + (size_t)op->mode, + (size_t)op->index); + auto unique_name = codeGen->genUniqueName(op->name); + scopeMap[op->tensor] = unique_name; + } + else if (scopeMap.count(op->tensor) == 1 && inDeviceFunction && currentParameterSet.count(op->tensor) == 0) { + currentParameters.push_back(pair(op->tensor.as()->name, op->tensor)); + currentParameterSet.insert(op->tensor); + } + } +}; + + CodeGen_ISPC::CodeGen_ISPC(std::ostream &dest, OutputKind outputKind, bool simplify) : CodeGen(dest, false, simplify, C), out(dest), out2(dest), outputKind(outputKind) {} @@ -262,6 +380,76 @@ void CodeGen_ISPC::compile(Stmt stmt, bool isFirst) { stmt.accept(this); } +string CodeGen_ISPC::printCallISPCFunc(const Function *func, map varMap, + vector &sortedProps) { + std::stringstream ret; + ret << " "; + unordered_set propsAlreadyGenerated; + + ret << "__" << func->name << "("; + + vector inputs = func->inputs; + vector outputs = func->outputs; + 
getSortedProps(varMap, sortedProps, inputs, outputs); + + for (unsigned long i=0; i < sortedProps.size(); i++) { + ret << varMap[sortedProps[i]]; + if (i != sortedProps.size()-1) { + ret << ", "; + } + propsAlreadyGenerated.insert(varMap[sortedProps[i]]); + } + + ret << ");\n"; + return ret.str(); +} + +string CodeGen_ISPC::printISPCFunc(const Function *func, map varMap, + vector &sortedProps) { + + DeviceFunctionCollector deviceFunctionCollector(func->inputs, func->outputs, this); + func->body.accept(&deviceFunctionCollector); + + + std::stringstream ret; + ret << "export void "; + unordered_set propsAlreadyGenerated; + + ret << "__" << func->name << "("; + + vector inputs = func->inputs; + vector outputs = func->outputs; + // getSortedProps(varMap, sortedProps, inputs, outputs); + + for (unsigned long i=0; i < sortedProps.size(); i++) { + auto prop = sortedProps[i]; + bool isOutputProp = (find(outputs.begin(), outputs.end(), + prop->tensor) != outputs.end()); + + auto var = prop->tensor.as(); + if (var->is_parameter) { + if (isOutputProp) { + ret << " " << printTensorProperty(varMap[prop], prop, false) << ";" << endl; + } else { + break; + } + } else { + ret << getUnpackedTensorArgument(varMap[prop], prop, isOutputProp); + } + propsAlreadyGenerated.insert(varMap[prop]); + + if (i!=sortedProps.size()-1) { + ret << ", "; + } + if (i%2==0) { + ret << "\n\t"; + } + } + ret << "\n) {\n\n"; + + return ret.str(); +} + void CodeGen_ISPC::sendToStream(std::stringstream &stream) { if (is_ISPC_code_stream_enabled()) { this->out2 << stream.str(); @@ -466,6 +654,21 @@ void CodeGen_ISPC::visit(const For* op) { case LoopKind::Dynamic: case LoopKind::Runtime: case LoopKind::Static_Chunked: + case LoopKind::Mul_Thread: + op->start.accept(this); + stream2 << std::endl; + op->start.accept(this); + stream2 << std::endl; + op->start.accept(this); + stream2 << std::endl; + op->start.accept(this); + stream2 << std::endl; + op->end.accept(this); + stream2 << std::endl; + 
op->end.accept(this); + stream2 << std::endl; + op->end.accept(this); + stream2 << std::endl; default: break; } @@ -629,10 +832,58 @@ void CodeGen_ISPC::visit(const Sqrt* op) { void CodeGen_ISPC::visit(const Assign* op) { if (is_ISPC_code_stream_enabled()) { - if (op->use_atomics) { - doIndent(); - stream2 << getAtomicPragma() << endl; + doIndent(); + op->lhs.accept(this); + parentPrecedence = Precedence::TOP; + bool printed = false; + if (simplify) { + if (isa(op->rhs)) { + auto add = to(op->rhs); + if (add->a == op->lhs) { + const Literal* lit = add->b.as(); + if (lit != nullptr && ((lit->type.isInt() && lit->equalsScalar(1)) || + (lit->type.isUInt() && lit->equalsScalar(1)))) { + stream2 << "++"; + } + else { + if (op->use_atomics) { + stream2 << " += reduce_add("; + add->b.accept(this); + stream2 << ")"; + } + else { + stream2 << " += "; + add->b.accept(this); + } + } + printed = true; + } + } + else if (isa(op->rhs)) { + auto mul = to(op->rhs); + if (mul->a == op->lhs) { + stream2 << " *= "; + mul->b.accept(this); + printed = true; + } + } + else if (isa(op->rhs)) { + auto bitOr = to(op->rhs); + if (bitOr->a == op->lhs) { + stream2 << " |= "; + bitOr->b.accept(this); + printed = true; + } + } + } + if (!printed) { + stream2 << " = "; + op->rhs.accept(this); } + + stream2 << ";"; + stream2 << endl; + } else { if (op->use_atomics) { diff --git a/src/codegen/codegen_ispc.h b/src/codegen/codegen_ispc.h index 8abd1cc09..279d0db7a 100644 --- a/src/codegen/codegen_ispc.h +++ b/src/codegen/codegen_ispc.h @@ -43,6 +43,12 @@ class CodeGen_ISPC : public CodeGen { void visit(const Store*); void visit(const Assign*); + Stmt simplifyFunctionBodies(Stmt stmt); + std::string printCallISPCFunc(const Function *func, std::map varMap, + std::vector &sortedProps); + std::string printISPCFunc(const Function *func, std::map varMap, + std::vector &sortedProps); + std::map varMap; std::vector localVars; std::ostream &out; @@ -55,6 +61,7 @@ class CodeGen_ISPC : public CodeGen { bool 
emittingCoroutine; class FindVars; + class DeviceFunctionCollector; private: virtual std::string restrictKeyword() const { return "restrict"; } diff --git a/src/index_notation/index_notation_printer.cpp b/src/index_notation/index_notation_printer.cpp index 0b41615ad..d7ee998ae 100644 --- a/src/index_notation/index_notation_printer.cpp +++ b/src/index_notation/index_notation_printer.cpp @@ -224,9 +224,9 @@ void IndexNotationPrinter::visit(const YieldNode* op) { void IndexNotationPrinter::visit(const ForallNode* op) { os << "forall(" << op->indexVar << ", "; op->stmt.accept(this); - if (op->parallel_unit != ParallelUnit::NotParallel) { + // if (op->parallel_unit != ParallelUnit::NotParallel) { os << ", " << ParallelUnit_NAMES[(int) op->parallel_unit] << ", " << OutputRaceStrategy_NAMES[(int) op->output_race_strategy]; - } + // } os << ")"; } diff --git a/src/index_notation/transformations.cpp b/src/index_notation/transformations.cpp index 47fc1dd55..011779caf 100644 --- a/src/index_notation/transformations.cpp +++ b/src/index_notation/transformations.cpp @@ -1,8 +1,10 @@ #include "taco/index_notation/transformations.h" +#include "taco/cuda.h" #include "taco/index_notation/index_notation.h" #include "taco/index_notation/index_notation_rewriter.h" #include "taco/index_notation/index_notation_nodes.h" +#include "taco/index_notation/index_notation_printer.h" #include "taco/error/error_messages.h" #include "taco/util/collections.h" #include "taco/lower/iterator.h" @@ -592,7 +594,10 @@ IndexStmt Parallelize::apply(IndexStmt stmt, std::string* reason) const { std::string reason = ""; IndexStmt rewriteParallel(IndexStmt stmt) { + std::cout << "1 rewriting IndexStmt to support parallelize schedule directive\n--------------------------------------------\n"; + std::cout << stmt << std::endl; provGraph = ProvenanceGraph(stmt); + std::cout << "2 rewriting IndexStmt to support parallelize schedule directive\n--------------------------------------------\n"; const auto reductionVars 
= getReductionVars(stmt); reductionIndexVars.clear(); @@ -607,15 +612,22 @@ IndexStmt Parallelize::apply(IndexStmt stmt, std::string* reason) const { tensorVars = createIRTensorVars(stmt); assembledByUngroupedInsert.clear(); + std::cout << "3 rewriting IndexStmt to support parallelize schedule directive\n--------------------------------------------\n"; for (const auto& result : getAssembledByUngroupedInsertion(stmt)) { assembledByUngroupedInsert.push_back(tensorVars[result]); } + std::cout << "4 rewriting IndexStmt to support parallelize schedule directive\n--------------------------------------------\n"; + std::cout << stmt << std::endl; return rewrite(stmt); } void visit(const ForallNode* node) { + std::cout << "transformations.cpp void visit(const ForallNode* node)\n"; + std::cout << "node: \n" << node << std::endl; Forall foralli(node); + std::cout << "foralli: \n" << foralli << std::endl; + std::cout << "before stmt update stmt: \n" << stmt << std::endl; IndexVar i = parallelize.geti(); definedIndexVars.insert(foralli.getIndexVar()); @@ -632,6 +644,7 @@ IndexStmt Parallelize::apply(IndexStmt stmt, std::string* reason) const { Iterators iterators(foralli, tensorVars); MergeLattice lattice = MergeLattice::make(foralli, iterators, provGraph, definedIndexVars); + std::cout << "iter: " << i << ", lattice: \n" << lattice << std::endl; // Precondition 2: No coiteration of modes (i.e., merge lattice has // only one iterator) @@ -660,6 +673,7 @@ IndexStmt Parallelize::apply(IndexStmt stmt, std::string* reason) const { MergeLattice underivedLattice = MergeLattice::make(underivedForall, iterators, provGraph, definedIndexVars); + std::cout << "iter: " << i << ", underivedLattice: \n" << lattice << std::endl; // Precondition 3: Every result iterator must have insert capability for (Iterator iterator : underivedLattice.results()) { @@ -721,6 +735,7 @@ IndexStmt Parallelize::apply(IndexStmt stmt, std::string* reason) const { // build consumer that writes from temporary to 
output, mark consumer as parallel reduction ParallelUnit reductionUnit = ParallelUnit::CPUThreadGroupReduction; if (should_use_CUDA_codegen()) { + std::cout << "should_use_CUDA_codegen() true\n"; if (parentParallelUnits.count(ParallelUnit::GPUWarp)) { reductionUnit = ParallelUnit::GPUWarpReduction; } @@ -728,6 +743,9 @@ IndexStmt Parallelize::apply(IndexStmt stmt, std::string* reason) const { reductionUnit = ParallelUnit::GPUBlockReduction; } } + else { + std::cout << "should_use_CUDA_codegen() false\n"; + } IndexStmt consumer = forall(i, Assignment(assignment->lhs, w(i), assignment->op), reductionUnit, OutputRaceStrategy::ParallelReduction); precomputed_stmt = where(consumer, producer); } @@ -746,8 +764,9 @@ IndexStmt Parallelize::apply(IndexStmt stmt, std::string* reason) const { return; } - + std::cout << "updated stmt: \n"; stmt = forall(i, foralli.getStmt(), parallelize.getParallelUnit(), parallelize.getOutputRaceStrategy(), foralli.getUnrollFactor()); + std::cout << stmt << std::endl; return; } @@ -1181,6 +1200,7 @@ std::ostream& operator<<(std::ostream& os, IndexStmt parallelizeOuterLoop(IndexStmt stmt) { // get outer ForAll + std::cout << "get outer ForAll ----------------- \n"; Forall forall; bool matched = false; match(stmt, @@ -1215,7 +1235,19 @@ IndexStmt parallelizeOuterLoop(IndexStmt stmt) { } return parallelized256; } + else if (should_use_ISPC_codegen()) { + std::cout << "outer loop parallelization for ISPC codegen\n"; + // IndexStmt parallelized = Parallelize(forall.getIndexVar(), ParallelUnit::CPUSpmd, OutputRaceStrategy::NoRaces).apply(stmt, &reason); + // if (parallelized == IndexStmt()) { + // // can't parallelize + // return stmt; + // } + // return parallelized; + + return stmt; + } else { + std::cout << "outer loop parallelization for CPU codgen index statement\n"; IndexStmt parallelized = Parallelize(forall.getIndexVar(), ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces).apply(stmt, &reason); if (parallelized == IndexStmt()) { // can't 
parallelize @@ -1320,8 +1352,25 @@ topologicallySort(map> hardDeps, return sortedVars; } +IndexStmt justTraverseThroughTheIndexStmt(IndexStmt stmt) { + struct IndexStatementTraverse : public IndexNotationPrinter { + IndexStatementTraverse(std::ostream& os) : IndexNotationPrinter(os) {}; + using IndexNotationPrinter::visit; + map forallParallelUnit; + map forallOutputRaceStrategy; + }; + + std::cout << "traversing through the index statement\n"; + IndexNotationPrinter printer(std::cout); + std::cout << std::endl; + stmt.accept(&printer); + return stmt; + +} + IndexStmt reorderLoopsTopologically(IndexStmt stmt) { + std::cout << "executing reorderLoopsTopologically\n"; // Collect tensorLevelVars which stores the pairs of IndexVar and tensor // level that each tensor is accessed at struct DAGBuilder : public IndexNotationVisitor { @@ -1384,6 +1433,8 @@ IndexStmt reorderLoopsTopologically(IndexStmt stmt) { Iterators iterators(stmt); DAGBuilder dagBuilder(iterators); stmt.accept(&dagBuilder); + std::cout << "After DAGBuilder\n"; + std::cout << stmt << std::endl; // Construct tensor dependencies (sorted list of IndexVars) from tensorLevelVars map>> tensorVarOrders; @@ -1414,6 +1465,8 @@ IndexStmt reorderLoopsTopologically(IndexStmt stmt) { }; CollectSoftDependencies collectSoftDeps; stmt.accept(&collectSoftDeps); + std::cout << "After CollectSoftDependencies\n"; + std::cout << stmt << std::endl; const auto sortedVars = topologicallySort(hardDeps, collectSoftDeps.softDeps, dagBuilder.indexVarOriginalOrder); @@ -1450,7 +1503,11 @@ IndexStmt reorderLoopsTopologically(IndexStmt stmt) { }; TopoReorderRewriter rewriter(sortedVars, dagBuilder.innerBody, dagBuilder.forallParallelUnit, dagBuilder.forallOutputRaceStrategy); - return rewriter.rewrite(stmt); + IndexStmt stmtChanged = rewriter.rewrite(stmt); + std::cout << "After TopoReorderRewriter\n"; + std::cout << stmtChanged << std::endl; + + return stmtChanged; } IndexStmt scalarPromote(IndexStmt stmt, ProvenanceGraph provGraph, 
@@ -1478,6 +1535,7 @@ IndexStmt scalarPromote(IndexStmt stmt, ProvenanceGraph provGraph, void visit(const ForallNode* node) { Forall foralli(node); + std::cout << "scalar promote: " << foralli << std::endl; IndexVar i = foralli.getIndexVar(); // Don't allow hoisting out of forall's for GPU warp and block reduction diff --git a/src/ir/ir_printer.cpp b/src/ir/ir_printer.cpp index f96251c5a..ba2bc894b 100644 --- a/src/ir/ir_printer.cpp +++ b/src/ir/ir_printer.cpp @@ -750,50 +750,7 @@ void IRPrinter::visit(const VarDecl* op) { void IRPrinter::visit(const Assign* op) { if (is_ISPC_code_stream_enabled()) { - doIndent(); - op->lhs.accept(this); - parentPrecedence = Precedence::TOP; - bool printed = false; - if (simplify) { - if (isa(op->rhs)) { - auto add = to(op->rhs); - if (add->a == op->lhs) { - const Literal* lit = add->b.as(); - if (lit != nullptr && ((lit->type.isInt() && lit->equalsScalar(1)) || - (lit->type.isUInt() && lit->equalsScalar(1)))) { - stream2 << "++"; - } - else { - stream2 << " += "; - add->b.accept(this); - } - printed = true; - } - } - else if (isa(op->rhs)) { - auto mul = to(op->rhs); - if (mul->a == op->lhs) { - stream2 << " *= "; - mul->b.accept(this); - printed = true; - } - } - else if (isa(op->rhs)) { - auto bitOr = to(op->rhs); - if (bitOr->a == op->lhs) { - stream2 << " |= "; - bitOr->b.accept(this); - printed = true; - } - } - } - if (!printed) { - stream2 << " = "; - op->rhs.accept(this); - } - stream2 << ";"; - stream2 << endl; } diff --git a/src/ir_tags.cpp b/src/ir_tags.cpp index af3dbd775..e7365d6c2 100644 --- a/src/ir_tags.cpp +++ b/src/ir_tags.cpp @@ -2,7 +2,7 @@ namespace taco { -const char *ParallelUnit_NAMES[] = {"NotParallel", "DefaultUnit", "GPUBlock", "GPUWarp", "GPUThread", "CPUThread", "CPUVector", "CPUThreadGroupReduction", "GPUBlockReduction", "GPUWarpReduction"}; +const char *ParallelUnit_NAMES[] = {"NotParallel", "DefaultUnit", "GPUBlock", "GPUWarp", "GPUThread", "CPUThread", "CPUVector", "CPUThreadGroupReduction", 
"GPUBlockReduction", "GPUWarpReduction", "CPUSimd", "CPUSpmd"}; const char *OutputRaceStrategy_NAMES[] = {"IgnoreRaces", "NoRaces", "Atomics", "Temporary", "ParallelReduction"}; const char *BoundType_NAMES[] = {"MinExact", "MinConstraint", "MaxExact", "MaxConstraint"}; const char *AssembleStrategy_NAMES[] = {"Append", "Insert"}; diff --git a/src/lower/lowerer_impl_imperative.cpp b/src/lower/lowerer_impl_imperative.cpp index 53ffd936f..28bd6c7c2 100644 --- a/src/lower/lowerer_impl_imperative.cpp +++ b/src/lower/lowerer_impl_imperative.cpp @@ -1,5 +1,6 @@ #include #include "taco/cuda.h" +#include "taco/ir_tags.h" #include "taco/lower/lowerer_impl_imperative.h" #include "taco/lower/lowerer_impl.h" @@ -417,6 +418,7 @@ LowererImplImperative::lower(IndexStmt stmt, string name, Stmt LowererImplImperative::lowerAssignment(Assignment assignment) { + std::cout << "\n\n converting assignment IndexStmt============================================ Assignment\n"; taco_iassert(generateAssembleCode() || generateComputeCode()); Stmt computeStmt; @@ -424,7 +426,7 @@ Stmt LowererImplImperative::lowerAssignment(Assignment assignment) Expr var = getTensorVar(result); const bool needComputeAssign = util::contains(needCompute, result); - + std::cout << "does assignment need compute assign: " << needComputeAssign << std::endl; Expr rhs; if (needComputeAssign) { rhs = lower(assignment.getRhs()); @@ -432,20 +434,51 @@ Stmt LowererImplImperative::lowerAssignment(Assignment assignment) // Assignment to scalar variables. 
if (isScalar(result.getType())) { + std::cout << "assignment to scalar variables\n"; if (needComputeAssign) { + std::cout << "compute assign\n"; if (!assignment.getOperator().defined()) { + std::cout << "assignment operator is not defined\n"; + std::cout << "var: " << var << ", rhs, : " << rhs << std::endl; computeStmt = Assign::make(var, rhs); } else { taco_iassert(isa(assignment.getOperator())); - bool useAtomics = markAssignsAtomicDepth > 0 && - !util::contains(whereTemps, result); + + std::cout << "assignment depth -- loopDepth: " << loopDepth << std::endl; + std::cout << "is markAssignsAtomicDepth > 0: " << (markAssignsAtomicDepth > 0) << std::endl; + for (auto &tensors_ : whereTemps) { + std::cout << tensors_ << ", "; + } + std::cout << std::endl; + std::cout << result << std::endl; + int tempVarInitLoopDepth = whereTempsWithLoopDepth.find(result)->second; + std::cout << "tempInitLoopDepth: " << tempVarInitLoopDepth << std::endl; + + bool reduction = false; + std::map::iterator itr; + for (itr = forUnits.begin(); itr!=forUnits.end(); ++itr) { + if (itr->first<=loopDepth && itr->first>tempVarInitLoopDepth && itr->second == ParallelUnit::CPUSimd) { + reduction = true; + } + std::cout << itr->first << "\t" << ParallelUnit_NAMES[(int) itr->second] << std::endl; + } + + // less than or equal to loopDepth but greater than temp variable initialized loop depth + bool useAtomics = markAssignsAtomicDepth > 0 && (!util::contains(whereTemps, result) || reduction); + std::cout << "whereTemps and result: " << !util::contains(whereTemps, result) << std::endl; + std::cout << "assignment to scalar variables useAtomics: " << useAtomics << std::endl; computeStmt = compoundAssign(var, rhs, useAtomics, atomicParallelUnit); + std::cout << "computeStatment: " << computeStmt << std::endl; } } + else { + std::cout << "not compute assign\n"; + } } // Assignments to tensor variables (non-scalar). 
else { + std::cout << "assignment to tensor variables\n"; Expr values = getValuesArray(result); Expr loc = generateValueLocExpr(assignment.getLhs()); @@ -479,6 +512,7 @@ Stmt LowererImplImperative::lowerAssignment(Assignment assignment) } if (needComputeAssign && values.defined()) { + std::cout << "assign compute statement\n"; if (!assignment.getOperator().defined()) { computeStmt = Store::make(values, loc, rhs); } @@ -595,9 +629,20 @@ LowererImplImperative::splitAppenderAndInserters(const vector& results */ Stmt LowererImplImperative::lowerForall(Forall forall) { + loopDepth++; + forUnits.insert(std::pair(loopDepth,forall.getParallelUnit())); std::cout << "doing lowerForall: " << forall << std::endl; bool hasExactBound = provGraph.hasExactBound(forall.getIndexVar()); bool forallNeedsUnderivedGuards = !hasExactBound && emitUnderivedGuards; + + + std::cout << "printing temporary variables with their atomic depths\n"; + map::iterator itr; + for (itr = whereTempsWithLoopDepth.begin(); itr != whereTempsWithLoopDepth.end(); ++itr) { + std::cout << itr->first << "\t" << itr->second << "\n"; + } + + if (!ignoreVectorize && forallNeedsUnderivedGuards && (forall.getParallelUnit() == ParallelUnit::CPUVector || forall.getUnrollFactor() > 0)) { @@ -852,6 +897,8 @@ Stmt LowererImplImperative::lowerForall(Forall forall) parallelUnitSizes.erase(forall.getParallelUnit()); } + forUnits.erase(loopDepth); + loopDepth--; return Block::blanks(preInitValues, temporaryValuesInitFree[0], loops, @@ -1157,12 +1204,18 @@ Stmt LowererImplImperative::lowerForallDimension(Forall forall, ir::Stmt recoveryStmt) { std::cout << "1 Stmt LowererImplImperative::lowerForallDimension\n"; + std::cout << "1 Stmt LowererImplImperative::lowerForallDimension markAssignsAtomicDepth: " << markAssignsAtomicDepth << std::endl; Expr coordinate = getCoordinateVar(forall.getIndexVar()); if (forall.getParallelUnit() != ParallelUnit::NotParallel && forall.getOutputRaceStrategy() == OutputRaceStrategy::Atomics) { 
markAssignsAtomicDepth++; + std::cout << "1 Stmt LowererImplImperative::lowerForallDimension getParallelUnit() is Not NotParallel and outputRaceStrategy is Atomics\n"; + std::cout << "markAssignsAtomicDepth: " << markAssignsAtomicDepth << std::endl; atomicParallelUnit = forall.getParallelUnit(); } + else { + std::cout << "1 Stmt LowererImplImperative::lowerForallDimension getParallelUnit() is NotParallel or outputRaceStrategy is not Atomics\n"; + } std::cout << "original forall : " << forall << std::endl; std::cout << "inside IndexStmt: " << forall.getStmt() << std::endl; @@ -1183,9 +1236,14 @@ Stmt LowererImplImperative::lowerForallDimension(Forall forall, LoopKind kind = LoopKind::Serial; if (should_use_ISPC_codegen()) { std::cout << "Foreach compatible loop\n"; - if (forall.getParallelUnit() == ParallelUnit::CPUVector && !ignoreVectorize) { + if (forall.getParallelUnit() == ParallelUnit::CPUSimd) { kind = LoopKind::Foreach; } + else if (forall.getParallelUnit() == ParallelUnit::CPUSpmd + && forall.getOutputRaceStrategy() != OutputRaceStrategy::ParallelReduction + ) { + kind = LoopKind::Mul_Thread; + } } else if (forall.getParallelUnit() == ParallelUnit::CPUVector && !ignoreVectorize) { kind = LoopKind::Vectorized; @@ -1250,7 +1308,7 @@ Stmt LowererImplImperative::lowerForallDimension(Forall forall, LoopKind kind = LoopKind::Serial; if (should_use_ISPC_codegen()) { - if (forall.getParallelUnit() == ParallelUnit::CPUVector && !ignoreVectorize) { + if (forall.getParallelUnit() == ParallelUnit::CPUSimd) { kind = LoopKind::Foreach; } } @@ -2201,6 +2259,7 @@ vector LowererImplImperative::codeToInitializeTemporary(Where where) { } Stmt LowererImplImperative::lowerWhere(Where where) { + std::cout << "\n--------------------------------------- lowering where statement: " << where << "\n\n\n"; TensorVar temporary = where.getTemporary(); bool accelerateDenseWorkSpace, sortAccelerator; std::tie(accelerateDenseWorkSpace, sortAccelerator) = @@ -2237,6 +2296,7 @@ Stmt 
LowererImplImperative::lowerWhere(Where where) { }) ); + std::cout << "\ninitiating lowering of where consumer: " << where.getConsumer() << std::endl; Stmt consumer = lower(where.getConsumer()); if (accelerateDenseWorkSpace && sortAccelerator) { // We need to sort the indices array @@ -2266,6 +2326,7 @@ Stmt LowererImplImperative::lowerWhere(Where where) { } whereConsumers.push_back(consumer); + std::cout << "\nwhere temporaries: " << where.getTemporary() << std::endl; whereTemps.push_back(where.getTemporary()); captureNextLocatePos = true; @@ -2276,6 +2337,9 @@ Stmt LowererImplImperative::lowerWhere(Where where) { restoreAtomicDepth = true; } + whereTempsWithLoopDepth.insert(std::pair(where.getTemporary(), loopDepth)); + + std::cout << "\ninitiating lowering of where producer: " << where.getConsumer() << std::endl; Stmt producer = lower(where.getProducer()); if (accelerateDenseWorkSpace) { const Expr indexListSizeExpr = tempToIndexListSize.at(temporary); @@ -2283,6 +2347,8 @@ Stmt LowererImplImperative::lowerWhere(Where where) { initializeTemporary = Block::make(indexListSizeDecl, initializeTemporary); } + whereTempsWithLoopDepth.erase(where.getTemporary()); + if (restoreAtomicDepth) { markAssignsAtomicDepth++; } diff --git a/src/tensor.cpp b/src/tensor.cpp index 3519456c9..dac2c3fd2 100644 --- a/src/tensor.cpp +++ b/src/tensor.cpp @@ -621,6 +621,7 @@ void TensorBase::compile() { IndexStmt stmt = makeConcreteNotation(makeReductionNotation(assignment)); stmt = reorderLoopsTopologically(stmt); stmt = insertTemporaries(stmt); + std::cout << "calling parallelizeOuterLoop(stmt)\n"; stmt = parallelizeOuterLoop(stmt); compile(stmt, content->assembleWhileCompute); } diff --git a/test/tests-scheduling-eval.cpp b/test/tests-scheduling-eval.cpp index 6a228f38b..93ba7b01e 100644 --- a/test/tests-scheduling-eval.cpp +++ b/test/tests-scheduling-eval.cpp @@ -65,14 +65,31 @@ IndexStmt scheduleSpMMCPU(IndexStmt stmt, Tensor A, int CHUNK_SIZE=16, i .parallelize(k, 
ParallelUnit::CPUVector, OutputRaceStrategy::IgnoreRaces); } -IndexStmt scheduleSpMMISPC(IndexStmt stmt, Tensor A, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { +IndexStmt scheduleSpMMISPC1(IndexStmt stmt, Tensor A, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { IndexVar i0("i0"), i1("i1"), kbounded("kbounded"), k0("k0"), k1("k1"), jpos("jpos"), jpos0("jpos0"), jpos1("jpos1"); return stmt.split(i, i0, i1, CHUNK_SIZE) .pos(j, jpos, A(i,j)) .split(jpos, jpos0, jpos1, UNROLL_FACTOR) .reorder({i0, i1, jpos0, k, jpos1}) .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces) - .parallelize(k, ParallelUnit::CPUVector, OutputRaceStrategy::IgnoreRaces); + .parallelize(k, ParallelUnit::CPUSimd, OutputRaceStrategy::IgnoreRaces); +} + +IndexStmt scheduleSpMMISPC2(IndexStmt stmt, Tensor A, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { + IndexVar i0("i0"), i1("i1"), kbounded("kbounded"), k0("k0"), k1("k1"), jpos("jpos"), jpos0("jpos0"), jpos1("jpos1"); + return stmt + .parallelize(k, ParallelUnit::CPUSimd, OutputRaceStrategy::IgnoreRaces); +} + +IndexStmt scheduleSpMMISPC3(IndexStmt stmt, Tensor A, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { + IndexVar i0("i0"), i1("i1"), kbounded("kbounded"), k0("k0"), k1("k1"), jpos("jpos"), jpos0("jpos0"), jpos1("jpos1"); + return stmt + // .split(i, i0, i1, CHUNK_SIZE) + // .pos(j, jpos, A(i,j)) + // .split(jpos, jpos0, jpos1, UNROLL_FACTOR) + .reorder({j, k}) + .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces) + .parallelize(k, ParallelUnit::CPUSimd, OutputRaceStrategy::IgnoreRaces); } IndexStmt scheduleSpGEMMCPU(IndexStmt stmt, bool doPrecompute) { @@ -128,6 +145,27 @@ IndexStmt scheduleSDDMMCPU(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, .parallelize(kpos1, ParallelUnit::CPUVector, OutputRaceStrategy::ParallelReduction); } +IndexStmt scheduleSDDMMISPC1(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { + IndexVar i0("i0"), i1("i1"), kpos("kpos"), kpos0("kpos0"), kpos1("kpos1"); + return 
stmt.split(i, i0, i1, CHUNK_SIZE) + .pos(k, kpos, B(i,k)) + .split(kpos, kpos0, kpos1, UNROLL_FACTOR) + .reorder({i0, i1, kpos0, j, kpos1}) + .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces) + .parallelize(kpos1, ParallelUnit::CPUSimd, OutputRaceStrategy::ParallelReduction); +} + +IndexStmt scheduleSDDMMISPC2(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { + IndexVar i0("i0"), i1("i1"), kpos("kpos"), kpos0("kpos0"), kpos1("kpos1"); + return stmt; + // .split(i, i0, i1, CHUNK_SIZE) + // .pos(k, kpos, B(i,k)) + // .split(kpos, kpos0, kpos1, UNROLL_FACTOR) + // .reorder({i0, i1, kpos0, j, kpos1}) + // .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces) + // .parallelize(kpos1, ParallelUnit::CPUSimd, OutputRaceStrategy::ParallelReduction); +} + IndexStmt scheduleTTVCPU(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16) { IndexVar f("f"), fpos("fpos"), chunk("chunk"), fpos2("fpos2"); return stmt.fuse(i, j, f) @@ -1550,24 +1588,80 @@ TEST(generate_ispc_evaluation_files, ispc) { stringstream source2; std::shared_ptr codegen = ir::CodeGen::init_default(source1, source2, ir::CodeGen::ImplementationGen); Tensor A("A", {NUM_I, NUM_J}, CSR); - Tensor B("B", {NUM_J, NUM_K}, {Dense, Dense}); - Tensor C("C", {NUM_I, NUM_K}, {Dense, Dense}); - C(i, k) = A(i, j) * B(j, k); - IndexStmt stmt = C.getAssignment().concretize(); + Tensor X("X", {NUM_J, NUM_K}, {Dense, Dense}); + Tensor Y("Y", {NUM_I, NUM_K}, {Dense, Dense}); + Y(i, k) = A(i, j) * X(j, k); + IndexStmt stmt = Y.getAssignment().concretize(); bool isFirst = true; for (auto paramSet : spmm_parameters) { - IndexStmt scheduled = scheduleSpMMISPC(stmt, A, paramSet[0], paramSet[1]); - ir::Stmt compute = lower(scheduled, string("compute_") + util::join(paramSet, "_"), false, true); + IndexStmt scheduled = scheduleSpMMISPC1(stmt, A, paramSet[0], paramSet[1]); + ir::Stmt compute = lower(scheduled, string("compute1_") + util::join(paramSet, "_"), false, true); + 
codegen->compile(compute, isFirst); + isFirst = false; + } + ofstream source_file; + source_file.open(file_path + "spmm_csr_ispc_taco1" + c_file_ending); + source_file << source1.str(); + source_file.close(); + + ofstream ispc_source_file; + ispc_source_file.open(file_path + "__spmm_csr_ispc_taco1" + file_ending); + ispc_source_file << source2.str(); + ispc_source_file.close(); + } + + // spmm + { + stringstream source1; + stringstream source2; + std::shared_ptr codegen = ir::CodeGen::init_default(source1, source2, ir::CodeGen::ImplementationGen); + Tensor A("A", {NUM_I, NUM_J}, CSR); + Tensor X("X", {NUM_J, NUM_K}, {Dense, Dense}); + Tensor Y("Y", {NUM_I, NUM_K}, {Dense, Dense}); + Y(i, k) = A(i, j) * X(j, k); + IndexStmt stmt = Y.getAssignment().concretize(); + bool isFirst = true; + for (auto paramSet : spmm_parameters) { + IndexStmt scheduled = scheduleSpMMISPC2(stmt, A, paramSet[0], paramSet[1]); + ir::Stmt compute = lower(scheduled, string("compute2_") + util::join(paramSet, "_"), false, true); + codegen->compile(compute, isFirst); + isFirst = false; + } + ofstream source_file; + source_file.open(file_path + "spmm_csr_ispc_taco2" + c_file_ending); + source_file << source1.str(); + source_file.close(); + + ofstream ispc_source_file; + ispc_source_file.open(file_path + "__spmm_csr_ispc_taco2" + file_ending); + ispc_source_file << source2.str(); + ispc_source_file.close(); + } + + // spmm + { + stringstream source1; + stringstream source2; + std::shared_ptr codegen = ir::CodeGen::init_default(source1, source2, ir::CodeGen::ImplementationGen); + Tensor A("A", {NUM_I, NUM_J}, CSR); + Tensor X("X", {NUM_J, NUM_K}, {Dense, Dense}); + Tensor Y("Y", {NUM_I, NUM_K}, {Dense, Dense}); + Y(i, k) = A(i, j) * X(j, k); + IndexStmt stmt = Y.getAssignment().concretize(); + bool isFirst = true; + for (auto paramSet : spmm_parameters) { + IndexStmt scheduled = scheduleSpMMISPC3(stmt, A, paramSet[0], paramSet[1]); + ir::Stmt compute = lower(scheduled, string("compute3_") + 
util::join(paramSet, "_"), false, true); codegen->compile(compute, isFirst); isFirst = false; } ofstream source_file; - source_file.open(file_path + "spmm_csr_ispc_taco" + c_file_ending); + source_file.open(file_path + "spmm_csr_ispc_taco3" + c_file_ending); source_file << source1.str(); source_file.close(); ofstream ispc_source_file; - ispc_source_file.open(file_path + "__spmm_csr_ispc_taco" + file_ending); + ispc_source_file.open(file_path + "__spmm_csr_ispc_taco3" + file_ending); ispc_source_file << source2.str(); ispc_source_file.close(); } @@ -1576,6 +1670,99 @@ TEST(generate_ispc_evaluation_files, ispc) { return; } + + +TEST(generate_ispc_sddmm_evaluation_files, ispc) { + std::cout << "Hi Adhitha!\n" << std::endl ; + set_CUDA_codegen_enabled(false); + set_ISPC_codegen_enabled(true); + + vector> spmv_parameters = {{32}}; + vector> spmspv_parameters = {{8}}; + + // 4 to 512 and 4, 8, 16 + vector> spmm_dcsr_parameters = {{16, 8}}; + vector> spmm_parameters = {{16,4}}; + + vector> mttkrp_parameters = {}; + mttkrp_parameters.push_back({64,0}); + + vector> sddmm_parameters = {{8, 8}}; + vector> ttv_parameters = {{32}}; + + int NUM_I = 100; + int NUM_J = 100; + int NUM_K = 100; + + string c_file_ending = ".h"; + string file_ending = ".ispc"; + string file_path = "eval_prepared_ispc/sddmm/"; + mkdir(file_path.c_str(), 0777); + + // sddmm + { + stringstream source1; + stringstream source2; + std::shared_ptr codegen = ir::CodeGen::init_default(source1, source2, ir::CodeGen::ImplementationGen); + Tensor A("A", {NUM_I, NUM_K}, {Dense, Dense}); + Tensor B("B", {NUM_I, NUM_K}, CSR); + Tensor C("C", {NUM_I, NUM_J}, {Dense, Dense}); + Tensor D("D", {NUM_J, NUM_K}, {Dense, Dense}); + A(i,k) = B(i,k) * C(i,j) * D(j,k); + IndexStmt stmt = A.getAssignment().concretize(); + bool isFirst = true; + for (auto paramSet : sddmm_parameters) { + IndexStmt scheduled = scheduleSDDMMISPC1(stmt, B, paramSet[0], paramSet[1]); + ir::Stmt compute = lower(scheduled, string("compute1_") + 
util::join(paramSet, "_"), false, true); + codegen->compile(compute, isFirst); + isFirst = false; + } + ofstream source_file; + source_file.open(file_path + "sddmm_cpu_ispc_taco1" + file_ending); + source_file << source1.str(); + source_file.close(); + + ofstream ispc_source_file; + ispc_source_file.open(file_path + "__sddmm_cpu_ispc_taco1" + file_ending); + ispc_source_file << source2.str(); + ispc_source_file.close(); + } + + + // sddmm + { + stringstream source1; + stringstream source2; + std::shared_ptr codegen = ir::CodeGen::init_default(source1, source2, ir::CodeGen::ImplementationGen); + Tensor Y("Y", {NUM_I, NUM_K}, {Dense, Dense}); + Tensor A("A", {NUM_I, NUM_K}, CSR); + Tensor X("X", {NUM_I, NUM_J}, {Dense, Dense}); + Y(i,j) = A(i,j) * X(i,k) * X(j,k); + IndexStmt stmt = Y.getAssignment().concretize(); + bool isFirst = true; + for (auto paramSet : sddmm_parameters) { + IndexStmt scheduled = scheduleSDDMMISPC2(stmt, A, paramSet[0], paramSet[1]); + ir::Stmt compute = lower(scheduled, string("compute2_") + util::join(paramSet, "_"), false, true); + codegen->compile(compute, isFirst); + isFirst = false; + } + ofstream source_file; + source_file.open(file_path + "sddmm_cpu_ispc_taco2" + file_ending); + source_file << source1.str(); + source_file.close(); + + ofstream ispc_source_file; + ispc_source_file.open(file_path + "__sddmm_cpu_ispc_taco2" + file_ending); + ispc_source_file << source2.str(); + ispc_source_file.close(); + } + + + return; +} + + + TEST(generate_evaluation_files, cpu) { if (should_use_CUDA_codegen()) { return; diff --git a/tools/taco.cpp b/tools/taco.cpp index 9a864a699..bf7e7c9dc 100644 --- a/tools/taco.cpp +++ b/tools/taco.cpp @@ -265,7 +265,7 @@ static void printSchedulingHelp() { "an output race strategy `strat`. Since the other transformations " "expect serial code, parallelize must come last in a series of " "transformations. Possible parallel hardware units are: " - "NotParallel, GPUBlock, GPUWarp, GPUThread, CPUThread, CPUVector. 
" + "NotParallel, GPUBlock, GPUWarp, GPUThread, CPUThread, CPUVector, CPUSimd, CPUSimd. " "Possible output race strategies are: " "IgnoreRaces, NoRaces, Atomics, Temporary, ParallelReduction."); } @@ -313,7 +313,8 @@ static void printCommandLine(ostream& os, int argc, char* argv[]) { } } -static bool setSchedulingCommands(vector> scheduleCommands, parser::Parser& parser, IndexStmt& stmt) { +static int setSchedulingCommands(vector> scheduleCommands, parser::Parser& parser, IndexStmt& stmt) { + std::cout << "setting scheduling commands\n"; auto findVar = [&stmt](string name) { ProvenanceGraph graph(stmt); for (auto v : graph.getAllIndexVars()) { @@ -326,9 +327,15 @@ static bool setSchedulingCommands(vector> scheduleCommands, parse abort(); // to silence a warning: control reaches end of non-void function }; - bool isGPU = false; + int isGPU = 0; + int isISPC = 0; for(vector scheduleCommand : scheduleCommands) { + std::cout << "running schedluing command: "; + for (auto &command : scheduleCommand) { + std::cout << command << " "; + } + std::cout << std::endl; string command = scheduleCommand[0]; scheduleCommand.erase(scheduleCommand.begin()); @@ -541,7 +548,15 @@ static bool setSchedulingCommands(vector> scheduleCommands, parse parallel_unit = ParallelUnit::CPUThread; } else if (unit == "CPUVector") { parallel_unit = ParallelUnit::CPUVector; - } else { + } else if (unit == "CPUSimd") { + isISPC = true; + parallel_unit = ParallelUnit::CPUSimd; + } + else if (unit == "CPUSpmd") { + parallel_unit = ParallelUnit::CPUSpmd; + isISPC = true; + } + else { taco_uerror << "Parallel hardware not defined."; goto end; } @@ -562,6 +577,8 @@ static bool setSchedulingCommands(vector> scheduleCommands, parse goto end; } + std::cout << "stmt before parallelizing the statement: " << stmt << endl; + std::cout << "ParallelUnit: " << ParallelUnit_NAMES[(int) parallel_unit] << ", outputRaceStrategy: " << OutputRaceStrategy_NAMES[(int) output_race_strategy] << std::endl; stmt = 
stmt.parallelize(findVar(i), parallel_unit, output_race_strategy); } else if (command == "assemble") { @@ -617,7 +634,13 @@ static bool setSchedulingCommands(vector> scheduleCommands, parse end:; } - return isGPU; + if (isGPU) { + return 1; + } + else if (isISPC) { + return 2; + } + return 0; } int main(int argc, char* argv[]) { @@ -1011,6 +1034,8 @@ int main(int argc, char* argv[]) { } } + std::cout << "cuda: " << cuda << ", ispc: " << ispc << std::endl; + // Print compute is the default if nothing else was asked for if (!printAssemble && !printEvaluate && !printIterationGraph && !writeCompute && !writeAssemble && !writeKernels && !readKernels && @@ -1019,6 +1044,7 @@ int main(int argc, char* argv[]) { } // pre-parse expression, to determine existence and order of loaded tensors + std::cout << "pre-parse expression, to determine existence and order of loaded tensors\n"; map loadedTensors; TensorBase temp_tensor; parser::Parser temp_parser(exprStr, formats, dataTypes, tensorsDimensions, loadedTensors, 42); @@ -1124,15 +1150,22 @@ int main(int argc, char* argv[]) { IndexStmt stmt = makeConcreteNotation(makeReductionNotation(tensor.getAssignment())); + std::cout << "concrete index statement: " << stmt << std::endl; + stmt = justTraverseThroughTheIndexStmt(stmt); stmt = reorderLoopsTopologically(stmt); + std::cout << "topologically reordered loops statement: " << stmt << std::endl; if (setSchedule) { - cuda |= setSchedulingCommands(scheduleCommands, parser, stmt); + int val = setSchedulingCommands(scheduleCommands, parser, stmt); + cuda |= (val==1); + ispc |= (val==2); } else { stmt = insertTemporaries(stmt); stmt = parallelizeOuterLoop(stmt); } + std::cout << "after setting the scheduling commands\n"; + std::cout << stmt << std::endl; if (cuda) { if (!CUDA_BUILT && benchmark) { @@ -1153,6 +1186,7 @@ int main(int argc, char* argv[]) { set_ISPC_codegen_enabled(false); } + std::cout << "running scalar promote\n" << std::endl; stmt = scalarPromote(stmt); if 
(printConcrete) { cout << stmt << endl; @@ -1240,6 +1274,7 @@ int main(int argc, char* argv[]) { } } else { + std::cout << "lowering stmt: " << stmt << std::endl; compute = lower(stmt, prefix+"compute", computeWithAssemble, true); assemble = lower(stmt, prefix+"assemble", true, false); evaluate = lower(stmt, prefix+"evaluate", true, true); From 0a4169728d9d6bcdfc1b1dabc40a0daf7e7e1e0a Mon Sep 17 00:00:00 2001 From: Adhhitha Dias Date: Mon, 26 Jul 2021 19:43:37 -0400 Subject: [PATCH 05/10] add tests and ispc compilation --- include/taco/codegen/module.h | 1 + src/codegen/codegen.cpp | 4 +- src/codegen/codegen_ispc.cpp | 249 ++++++++++---- src/codegen/codegen_ispc.h | 4 +- src/codegen/module.cpp | 79 ++++- src/tensor.cpp | 6 +- taco-uml.wsd | 411 +++++++++++++++++++++++ test/test.cpp | 14 + test/test.h | 1 + test/tests-scheduling-eval.cpp | 575 ++++++++++++++++++++++++++++++++- 10 files changed, 1263 insertions(+), 81 deletions(-) create mode 100644 taco-uml.wsd diff --git a/include/taco/codegen/module.h b/include/taco/codegen/module.h index 36eb34f1a..3df7c8e0f 100644 --- a/include/taco/codegen/module.h +++ b/include/taco/codegen/module.h @@ -68,6 +68,7 @@ class Module { private: std::stringstream source; + std::stringstream additional_source; std::stringstream header; std::string libname; std::string tmpdir; diff --git a/src/codegen/codegen.cpp b/src/codegen/codegen.cpp index 7081bc195..6ec54a2f8 100644 --- a/src/codegen/codegen.cpp +++ b/src/codegen/codegen.cpp @@ -265,9 +265,9 @@ string CodeGen::getUnpackedTensorArgument(string varname, const GetProperty* op, // all others are int* if (op->property == TensorProperty::Dimension) { if (op->type == Int32) { - ret << "int32 "; + ret << "uniform int32 "; } else if (op->type == Int64) { - ret << "int64 "; + ret << "uniform int64 "; } else { ret << "int "; } diff --git a/src/codegen/codegen_ispc.cpp b/src/codegen/codegen_ispc.cpp index c8480cd25..237bc822d 100644 --- a/src/codegen/codegen_ispc.cpp +++ 
b/src/codegen/codegen_ispc.cpp @@ -6,10 +6,12 @@ #include #include "taco/cuda.h" +#include "taco/ir/ir_printer.h" #include "taco/ir/ir_visitor.h" #include "taco/ir/ir_rewriter.h" #include "taco/ir/simplify.h" +#include "codegen_c.h" #include "codegen_ispc.h" #include "taco/error.h" #include "taco/util/strings.h" @@ -295,6 +297,7 @@ class CodeGen_ISPC::DeviceFunctionCollector : public IRVisitor { virtual void visit(const For *op) { if (op->parallel_unit == ParallelUnit::CPUSpmd) { std::cout << "ParallelUnit::CPUSpmd directive found\n"; + inDeviceFunction = false; op->var.accept(this); inDeviceFunction = true; @@ -380,6 +383,8 @@ void CodeGen_ISPC::compile(Stmt stmt, bool isFirst) { stmt.accept(this); } + + string CodeGen_ISPC::printCallISPCFunc(const Function *func, map varMap, vector &sortedProps) { std::stringstream ret; @@ -388,9 +393,6 @@ string CodeGen_ISPC::printCallISPCFunc(const Function *func, mapname << "("; - vector inputs = func->inputs; - vector outputs = func->outputs; - getSortedProps(varMap, sortedProps, inputs, outputs); for (unsigned long i=0; i < sortedProps.size(); i++) { ret << varMap[sortedProps[i]]; @@ -404,50 +406,123 @@ string CodeGen_ISPC::printCallISPCFunc(const Function *func, map varMap, +// varMap is already sorted <- make sure to pass the sorted varMap +void CodeGen_ISPC::printISPCFunc(const Function *func, map varMap, vector &sortedProps) { DeviceFunctionCollector deviceFunctionCollector(func->inputs, func->outputs, this); func->body.accept(&deviceFunctionCollector); - - std::stringstream ret; - ret << "export void "; - unordered_set propsAlreadyGenerated; - - ret << "__" << func->name << "("; - + std::stringstream variables; vector inputs = func->inputs; vector outputs = func->outputs; - // getSortedProps(varMap, sortedProps, inputs, outputs); + unordered_set propsAlreadyGenerated; - for (unsigned long i=0; i < sortedProps.size(); i++) { - auto prop = sortedProps[i]; - bool isOutputProp = (find(outputs.begin(), outputs.end(), - 
prop->tensor) != outputs.end()); - - auto var = prop->tensor.as(); - if (var->is_parameter) { - if (isOutputProp) { - ret << " " << printTensorProperty(varMap[prop], prop, false) << ";" << endl; + for (unsigned long i=0; i < sortedProps.size(); i++) { + auto prop = sortedProps[i]; + bool isOutputProp = (find(outputs.begin(), outputs.end(), + prop->tensor) != outputs.end()); + + auto var = prop->tensor.as(); + if (var->is_parameter) { + if (isOutputProp) { + variables << " " << printTensorProperty(varMap[prop], prop, false) << ";" << endl; + } else { + break; + } } else { - break; + variables << getUnpackedTensorArgument(varMap[prop], prop, isOutputProp); } - } else { - ret << getUnpackedTensorArgument(varMap[prop], prop, isOutputProp); - } - propsAlreadyGenerated.insert(varMap[prop]); + propsAlreadyGenerated.insert(varMap[prop]); - if (i!=sortedProps.size()-1) { - ret << ", "; - } - if (i%2==0) { - ret << "\n\t"; + if (i!=sortedProps.size()-1) { + variables << ", "; + } + if (i%2==0) { + variables << "\n\t"; + } } + + resetUniqueNameCounters(); + for (size_t i = 0; i < deviceFunctionCollector.threadFors.size(); i++) { + + const For *threadloop = to(deviceFunctionCollector.threadFors[i]); + taco_iassert(threadloop->parallel_unit == ParallelUnit::CPUSpmd); + Stmt function = threadloop->contents; + std::cout << "threadloop function: " << function << std::endl; + + out2 << "static task void __" << func->name << "__ ("; + out2 << variables.str(); + out2 << "\n) {\n\n"; + + indent++; + doIndent(); + // output body + print(threadloop); + indent--; + out2 << "}\n"; + + out2 << "export void __" << func->name << "("; + out2 << variables.str(); + out2 << "\n) {\n\n"; + indent++; + doIndent(); + out2 << "launch[4] " << printCallISPCFunc(func, varMap, sortedProps) << "\n"; + indent--; + out2 << "}\n"; + } - ret << "\n) {\n\n"; - return ret.str(); + if (deviceFunctionCollector.threadFors.size()==0) { + out2 << "export void __" << func->name << " ("; + out2 << variables.str(); + 
out2 << "\n) {\n\n"; + + indent++; + doIndent(); + // output body + print(func->body); + indent--; + out2 << "}\n"; + } + + // out2 << "export void "; + + // out2 << "__" << func->name << "("; + + // for (unsigned long i=0; i < sortedProps.size(); i++) { + // auto prop = sortedProps[i]; + // bool isOutputProp = (find(outputs.begin(), outputs.end(), + // prop->tensor) != outputs.end()); + + // auto var = prop->tensor.as(); + // if (var->is_parameter) { + // if (isOutputProp) { + // out2 << " " << printTensorProperty(varMap[prop], prop, false) << ";" << endl; + // } else { + // break; + // } + // } else { + // out2 << getUnpackedTensorArgument(varMap[prop], prop, isOutputProp); + // } + // propsAlreadyGenerated.insert(varMap[prop]); + + // if (i!=sortedProps.size()-1) { + // out2 << ", "; + // } + // if (i%2==0) { + // out2 << "\n\t"; + // } + // } + // out2 << "\n) {\n\n"; + + // indent++; + // doIndent(); + // // output body + // print(func->body); + // indent--; + // out2 << "}\n"; + } void CodeGen_ISPC::sendToStream(std::stringstream &stream) { @@ -461,6 +536,75 @@ void CodeGen_ISPC::sendToStream(std::stringstream &stream) { void CodeGen_ISPC::visit(const Function* func) { // if generating a header, protect the function declaration with a guard + if (func->name == "assemble") { + if (outputKind == HeaderGen) { + out << "#ifndef TACO_GENERATED_" << func->name << "\n"; + out << "#define TACO_GENERATED_" << func->name << "\n"; + } + + int numYields = countYields(func); + emittingCoroutine = (numYields > 0); + funcName = func->name; + labelCount = 0; + + resetUniqueNameCounters(); + FindVars inputVarFinder(func->inputs, {}, this); + func->body.accept(&inputVarFinder); + FindVars outputVarFinder({}, func->outputs, this); + func->body.accept(&outputVarFinder); + + // output function declaration + doIndent(); + out << printFuncName(func, inputVarFinder.varDecls, outputVarFinder.varDecls); + + // if we're just generating a header, this is all we need to do + if 
(outputKind == HeaderGen) { + out << ";\n"; + out << "#endif\n"; + return; + } + + out << " {\n"; + + indent++; + + // find all the vars that are not inputs or outputs and declare them + resetUniqueNameCounters(); + FindVars varFinder(func->inputs, func->outputs, this); + func->body.accept(&varFinder); + varMap = varFinder.varMap; + localVars = varFinder.localVars; + + // Print variable declarations + out << printDecls(varFinder.varDecls, func->inputs, func->outputs) << endl; + + if (emittingCoroutine) { + out << printContextDeclAndInit(varMap, localVars, numYields, func->name) + << endl; + } + + // output body + print(func->body); + + // output repack only if we allocated memory + if (checkForAlloc(func)) + out << endl << printPack(varFinder.outputProperties, func->outputs); + + if (emittingCoroutine) { + out << printCoroutineFinish(numYields, funcName); + } + + doIndent(); + out << "return 0;\n"; + indent--; + + doIndent(); + out << "}\n"; + return; + + } + + if (outputKind == HeaderGen) { out << "#ifndef TACO_GENERATED_" << func->name << "\n"; out << "#define TACO_GENERATED_" << func->name << "\n"; @@ -503,6 +647,9 @@ void CodeGen_ISPC::visit(const Function* func) { out << printDecls(varFinder.varDecls, func->inputs, func->outputs) << endl; vector sortedProps; + vector inputs = func->inputs; + vector outputs = func->outputs; + getSortedProps(varFinder.varDecls, sortedProps, inputs, outputs); out << printCallISPCFunc(func, varFinder.varDecls, sortedProps); if (emittingCoroutine) { @@ -526,13 +673,7 @@ void CodeGen_ISPC::visit(const Function* func) { out << "}\n\n"; set_ISPC_code_stream_enabled(true); - out2 << printISPCFunc(func, varFinder.varDecls, sortedProps); - indent++; - doIndent(); - // output body - print(func->body); - indent--; - out2 << "}\n"; + printISPCFunc(func, varFinder.varDecls, sortedProps); set_ISPC_code_stream_enabled(false); } @@ -655,20 +796,20 @@ void CodeGen_ISPC::visit(const For* op) { case LoopKind::Runtime: case 
LoopKind::Static_Chunked: case LoopKind::Mul_Thread: - op->start.accept(this); - stream2 << std::endl; - op->start.accept(this); - stream2 << std::endl; - op->start.accept(this); - stream2 << std::endl; - op->start.accept(this); - stream2 << std::endl; - op->end.accept(this); - stream2 << std::endl; - op->end.accept(this); - stream2 << std::endl; - op->end.accept(this); - stream2 << std::endl; + // op->start.accept(this); + // stream2 << std::endl; + // op->start.accept(this); + // stream2 << std::endl; + // op->start.accept(this); + // stream2 << std::endl; + // op->start.accept(this); + // stream2 << std::endl; + // op->end.accept(this); + // stream2 << std::endl; + // op->end.accept(this); + // stream2 << std::endl; + // op->end.accept(this); + // stream2 << std::endl; default: break; } diff --git a/src/codegen/codegen_ispc.h b/src/codegen/codegen_ispc.h index 279d0db7a..08e73b252 100644 --- a/src/codegen/codegen_ispc.h +++ b/src/codegen/codegen_ispc.h @@ -5,7 +5,7 @@ #include "taco/ir/ir.h" #include "taco/ir/ir_printer.h" -#include "codegen.h" +#include "codegen_c.h" namespace taco { namespace ir { @@ -46,7 +46,7 @@ class CodeGen_ISPC : public CodeGen { Stmt simplifyFunctionBodies(Stmt stmt); std::string printCallISPCFunc(const Function *func, std::map varMap, std::vector &sortedProps); - std::string printISPCFunc(const Function *func, std::map varMap, + void printISPCFunc(const Function *func, std::map varMap, std::vector &sortedProps); std::map varMap; diff --git a/src/codegen/module.cpp b/src/codegen/module.cpp index d9cbe2edc..82b736a13 100644 --- a/src/codegen/module.cpp +++ b/src/codegen/module.cpp @@ -43,6 +43,7 @@ void Module::addFunction(Stmt func) { void Module::compileToSource(string path, string prefix) { if (!moduleFromUserSource) { + std::cout << "module not from user source\n"; // create a codegen instance and add all the funcs bool didGenRuntime = false; @@ -51,11 +52,13 @@ void Module::compileToSource(string path, string prefix) { 
header.clear(); source.str(""); source.clear(); + additional_source.str(""); + additional_source.clear(); taco_tassert(target.arch == Target::C99) << "Only C99 codegen supported currently"; std::shared_ptr sourcegen = - CodeGen::init_default(source, CodeGen::ImplementationGen); + CodeGen::init_default(source, additional_source, CodeGen::ImplementationGen); std::shared_ptr headergen = CodeGen::init_default(header, CodeGen::HeaderGen); @@ -69,8 +72,17 @@ void Module::compileToSource(string path, string prefix) { ofstream source_file; string file_ending = should_use_CUDA_codegen() ? ".cu" : ".c"; source_file.open(path+prefix+file_ending); + if (should_use_ISPC_codegen()) { + source_file << "#include \"" << path+prefix+"_ispc.h\"\n"; + } source_file << source.str(); source_file.close(); + + ofstream additional_source_file; + string file_ending2 = ".ispc"; + additional_source_file.open(path+prefix+file_ending2); + additional_source_file << additional_source.str(); + additional_source_file.close(); ofstream header_file; header_file.open(path+prefix+".h"); @@ -90,9 +102,9 @@ void writeShims(vector funcs, string path, string prefix) { if (should_use_CUDA_codegen()) { CodeGen_CUDA::generateShim(func, shims); } - else if (should_use_ISPC_codegen()) { - CodeGen_ISPC::generateShim(func, shims); - } + // else if (should_use_ISPC_codegen()) { + // CodeGen_ISPC::generateShim(func, shims); + // } else { CodeGen_C::generateShim(func, shims); } @@ -102,9 +114,9 @@ void writeShims(vector funcs, string path, string prefix) { if (should_use_CUDA_codegen()) { shims_file.open(path+prefix+"_shims.cpp"); } - else if (should_use_ISPC_codegen()) { - shims_file.open(path+prefix+".ispc", ios::app); - } + // else if (should_use_ISPC_codegen()) { + // shims_file.open(path+prefix+".c", ios::app); + // } else { shims_file.open(path+prefix+".c", ios::app); } @@ -131,12 +143,13 @@ string Module::compile() { file_ending = ".cu"; shims_file = prefix + "_shims.cpp"; } - else if 
(should_use_ISPC_codegen()) { - cc = util::getFromEnv(target.compiler_env, target.compiler); - cflags = util::getFromEnv("TACO_CFLAGS", - "-O3 -ffast-math -std=c99") + " -shared -fPIC"; - - } + // else if (should_use_ISPC_codegen()) { + // cc = util::getFromEnv("TACO_ISPC", "ispc"); + // cflags = util::getFromEnv("TACO_ISPC_FLAGS", + // " --target=sse2-i32x4,sse4-i32x8,avx1-i32x8,avx2-i32x8,avx512knl-i32x16,avx512skx-i32x16 --pic -O3 --addressing=64 --arch=x86-64" + // ) + " "; + + // } else { cc = util::getFromEnv(target.compiler_env, target.compiler); cflags = util::getFromEnv("TACO_CFLAGS", @@ -151,9 +164,15 @@ string Module::compile() { string cmd = cc + " " + cflags + " " + prefix + file_ending + " " + shims_file + " " + "-o " + fullpath + " -lm"; + std::cout << "--------------------------------------------------------------------------------tmpdir: " << tmpdir << std::endl; + std::cout << "--------------------------------------------------------------------------------libname: " << libname << std::endl; + std::cout << "--------------------------------------------------------------------------------prefix: " << prefix << std::endl; + std::cout << "--------------------------------------------------------------------------------fullpath: " << fullpath << std::endl; + std::cout << "--------------------------------------------------------------------------------cmd: " << cmd << std::endl; // open the output file & write out the source compileToSource(tmpdir, libname); + // write out the shims writeShims(funcs, tmpdir, libname); @@ -164,10 +183,36 @@ string Module::compile() { } std::cout << tmpdir << std::endl << libname << std::endl; - // now compile it - int err = system(cmd.data()); - taco_uassert(err == 0) << "Compilation command failed:\n" << cmd - << "\nreturned " << err; + if (should_use_ISPC_codegen()) { + string ispc = util::getFromEnv("TACO_ISPC", "ispc"); + string ispcflags = util::getFromEnv("TACO_ISPC_FLAGS", + " 
--target=sse2-i32x4,sse4-i32x8,avx1-i32x8,avx2-i32x8,avx512knl-i32x16,avx512skx-i32x16 --pic -O3 --addressing=64 --arch=x86-64" + ) + " "; + string cmd = ispc + " " + ispcflags + " -o " + prefix + ".ispc.o " + " --emit-obj " + prefix + ".ispc " + "-h " + prefix + "_ispc.h"; + + // now compile the ispc file to generate the object file and the ispc header file + std::cout << "--------------------------------------------------------------------------------cmd: " << cmd << std::endl; + int err = system(cmd.data()); + taco_uassert(err == 0) << "Compilation command failed:\n" << cmd + << "\nreturned " << err; + + string ispc_object_file = " " + prefix + ".ispc.o "; + string ispc_object_files_for_diff_targets = " " + prefix + ".ispc_* "; + cmd = cc + " " + cflags + " " + + prefix + file_ending + " " + ispc_object_file + ispc_object_files_for_diff_targets + shims_file + " " + + "-o " + fullpath + " -lm -lrt "; + + // now compile the c file linking the ispc object file. ispc header is added to the top of the c file + std::cout << "--------------------------------------------------------------------------------cmd: " << cmd << std::endl; + err = system(cmd.data()); + taco_uassert(err == 0) << "Compilation command failed:\n" << cmd + << "\nreturned " << err; + } else { + // now compile it + int err = system(cmd.data()); + taco_uassert(err == 0) << "Compilation command failed:\n" << cmd + << "\nreturned " << err; + } // use dlsym() to open the compiled library if (lib_handle) { diff --git a/src/tensor.cpp b/src/tensor.cpp index dac2c3fd2..5e02d2660 100644 --- a/src/tensor.cpp +++ b/src/tensor.cpp @@ -808,9 +808,9 @@ void TensorBase::assemble() { void TensorBase::compute() { taco_uassert(!needsCompile()) << error::compute_without_compile; - if (!needsCompute()) { - return; - } + // if (!needsCompute()) { + // return; + // } setNeedsCompute(false); // Sync operand tensors if needed. 
auto operands = getTensors(getAssignment().getRhs()); diff --git a/taco-uml.wsd b/taco-uml.wsd new file mode 100644 index 000000000..4b8e39802 --- /dev/null +++ b/taco-uml.wsd @@ -0,0 +1,411 @@ +@startuml taco +scale 1 + + +class IntrusivePtr { + +T *ptr +} +class Uncopyable {} + +class IRNode { + +virtual void accept(IRVisitorStrict *v) const = 0 + +virtual IRNodeType type_info() const = 0; +} + +class BaseStmtNode {} +class BaseExprNode { + +Datatype type +} + +class StmtNode { + +void accept(IRVisitorStrict *v) const +} +class ExprNode { + +void accept(IRVisitorStrict *v) const +} + +Uncopyable <|-- IRNode +IRNode <|-- BaseStmtNode +IRNode <|-- BaseExprNode +BaseStmtNode <|-- StmtNode +BaseExprNode <|-- ExprNode + +class IRHandle { + +void accept(IRVisitorStrict *v) const +} +class Expr {} +class Stmt {} + +IntrusivePtr <|-- IRHandle +IRHandle <|-- Expr +IRHandle <|-- Stmt + +IRHandle "1" *-- "1" IRNode : contains + + + +' this class is abstract but plantuml version does not support interface keyword +interface IRVisitorStrict { + +virtual void visit(const IRNode*) const = 0 +} + +/' +IRVisitor is not an interface or abstract because it +has not pure virtual methods +'/ +class IRVisitor { + +virtual void visit(const IRNode*) +} + +class IRRewriter { + ' protected fields and methods + #Expr expr + #Stmt stmt + + #virtual void visit(const ExprNode* op) + #virtual void visit(const StmtNode* op) + + ' public fields and methods + +Expr rewrite(Expr) + +Stmt rewrite(Stmt) +} +class IRPrinter { + #std::ostream &stream + #std::ostream &stream2 + #int indent + #bool color + #bool simplify + #enum Precedence + #Precedence parentPrecedence = BOTTOM + #NameGenerator varNameGenerator + #scopedMap varNames + + #void doIndent() + #void printBinOp(Expr a, Expr b, std::string op, Precedence precedence) + #void fewMoreMethods() + + #virtual void visit(const ExprNode*) + #virtual void visit(const StmtNode*) + + +setColor(bool color) + +print(Stmt) +} +class IRVerifier {} + 
+IRVisitorStrict <|-- IRVisitor +IRVisitorStrict <|-- IRPrinter +IRVisitorStrict <|-- IRRewriter +IRVisitor <|-- IRVerifier + +' Inheritance from IRRewriter +' simplifier for ir::Expr +class ExpressionSimplifier {} +IRRewriter <|-- ExpressionSimplifier + +' simplifiers for ir::Stmt +class RemoveRedundantStatements {} +class RemoveRedundantLoops {} +class RemoveDuplicateBody {} + +IRRewriter <|-- RemoveRedundantStatements +IRRewriter <|-- RemoveRedundantLoops +IRRewriter <|-- RemoveDuplicateBody + + +' Inheritance from IRPrinter +class CodeGen {} +class CodeGen_C {} +class CodeGen_CUDA {} +class CodeGen_ISPC { + -class FindVars +} + +class FindVars {} + +IRPrinter <|-- CodeGen +CodeGen <|-- CodeGen_C +CodeGen <|-- CodeGen_ISPC +CodeGen <|-- CodeGen_CUDA + +IRVisitor <|-- FindVars +CodeGen_ISPC +-- FindVars + +class Manageable {} +class IndexStmtNode { + -virtual void accept(IndexStmtVisitorStrict*) const = 0 +} +class IndexExprNode { + -virtual void accept(IndexStmtVisitorStrict*) const = 0 +} + + +Manageable <|-- IndexStmtNode +Uncopyable <|-- IndexStmtNode +Manageable <|-- IndexExprNode +Uncopyable <|-- IndexExprNode + +class IndexStmt {} +class IndexExpr {} + +IntrusivePtr <|-- IndexStmt +IndexStmt "1" *-- "1" IndexStmtNode +IntrusivePtr <|-- IndexExpr +IndexExpr "1" *-- "1" IndexExprNode + + +abstract class IndexExprVisitorStrict { + +void visit(const IndexStmt&) + +virtual void visit(const AccessNode*) = 0 + +virtual void visit(const LiteralNode*) = 0 + +virtual void visit(const NegNode*) = 0 + +virtual void visit(const AddNode*) = 0 + +virtual void visit(const SubNode*) = 0 + +virtual void visit(const MulNode*) = 0 + +virtual void visit(const DivNode*) = 0 + +virtual void visit(const SqrtNode*) = 0 + +virtual void visit(const CastNode*) = 0 + +virtual void visit(const CallIntrinsicNode*) = 0 + +virtual void visit(const ReductionNode*) = 0 +} +abstract class IndexStmtVisitorStrict { + +void visit(const IndexStmt&) + +virtual void visit(const AssignmentNode*) = 
0 + +virtual void visit(const YieldNode*) = 0 + +virtual void visit(const ForallNode*) = 0 + +virtual void visit(const WhereNode*) = 0 + +virtual void visit(const SequenceNode*) = 0 + +virtual void visit(const AssembleNode*) = 0 + +virtual void visit(const MultiNode*) = 0 + +virtual void visit(const SuchThatNode*) = 0 +} + +abstract class IndexNotationVisitorStrict {} +class IndexNotationPrinter { + +void print(const IndexExpr& expr) + +void print(const IndexStmt& expr) + + ' Index Expressions visit() + +void visit(const AccessNode* node) + +void visit(const LiteralNode* node) + + void visit(const NegNode* node) + + void visit(const AddNode* node) + + void visit(const SubNode* node) + + void visit(const MulNode* node) + + void visit(const DivNode* node) + + void visit(const SqrtNode* node) + + void visit(const CastNode* node) + + void visit(const CallIntrinsicNode* node) + + void visit(const UnaryExprNode* node) + + void visit(const BinaryExprNode* node) + + void visit(const ReductionNode* node) + + ' Index Statement visit() + + void visit(const AssignmentNode* node) + + void visit(const YieldNode* node) + + void visit(const ForallNode* node) + + void visit(const WhereNode* node) + + void visit(const SequenceNode* node) + + void visit(const AssembleNode* node) + + void visit(const MultiNode* node) + + void visit(const SuchThatNode* node) +} +class IndexNotationVisitor { + ' Index Expressions visit() + +virtual void visit(const AccessNode* node) + +virtual void visit(const LiteralNode* node) + +virtual void visit(const NegNode* node) + +virtual void visit(const AddNode* node) + +virtual void visit(const SubNode* node) + +virtual void visit(const MulNode* node) + +virtual void visit(const DivNode* node) + +virtual void visit(const SqrtNode* node) + +virtual void visit(const CastNode* node) + +virtual void visit(const CallIntrinsicNode* node) + +virtual void visit(const UnaryExprNode* node) + +virtual void visit(const BinaryExprNode* node) + +virtual void visit(const 
ReductionNode* node) + + ' Index Statement visit() + +virtual void visit(const AssignmentNode* node) + +virtual void visit(const YieldNode* node) + +virtual void visit(const ForallNode* node) + +virtual void visit(const WhereNode* node) + +virtual void visit(const SequenceNode* node) + +virtual void visit(const AssembleNode* node) + +virtual void visit(const MultiNode* node) + +virtual void visit(const SuchThatNode* node) +} +class Matcher { + +} + +abstract class IndexExprRewriterStrict { + +IndexExpr rewrite(IndexExpr) + + #IndexExpr expr + + #virtual void visit(const AccessNode* op) = 0 + #virtual void visit(const LiteralNode* op) = 0 + #virtual void visit(const NegNode* op) = 0 + #virtual void visit(const SqrtNode* op) = 0 + #virtual void visit(const AddNode* op) = 0 + #virtual void visit(const SubNode* op) = 0 + #virtual void visit(const MulNode* op) = 0 + #virtual void visit(const DivNode* op) = 0 + #virtual void visit(const CastNode* op) = 0 + #virtual void visit(const CallIntrinsicNode* op) = 0 + #virtual void visit(const ReductionNode* op) = 0 +} +abstract class IndexStmtRewriterStrict { + +IndexStmt rewrite(IndexStmt) + + #IndexStmt stmt + + #virtual void visit(const AssignmentNode* op) = 0 + #virtual void visit(const YieldNode* op) = 0 + #virtual void visit(const ForallNode* op) = 0 + #virtual void visit(const WhereNode* op) = 0 + #virtual void visit(const SequenceNode* op) = 0 + #virtual void visit(const AssembleNode* op) = 0 + #virtual void visit(const MultiNode* op) = 0 + #virtual void visit(const SuchThatNode* op) = 0 +} +abstract class IndexNotationRewriterStrict {} +class IndexNotationRewriter { + ' Index Expressions visit() + +virtual void visit(const AccessNode* node) + +virtual void visit(const LiteralNode* node) + +virtual void visit(const NegNode* node) + +virtual void visit(const AddNode* node) + +virtual void visit(const SubNode* node) + +virtual void visit(const MulNode* node) + +virtual void visit(const DivNode* node) + +virtual void 
visit(const SqrtNode* node) + +virtual void visit(const CastNode* node) + +virtual void visit(const CallIntrinsicNode* node) + +virtual void visit(const UnaryExprNode* node) + +virtual void visit(const BinaryExprNode* node) + +virtual void visit(const ReductionNode* node) + + ' Index Statement visit() + +virtual void visit(const AssignmentNode* node) + +virtual void visit(const YieldNode* node) + +virtual void visit(const ForallNode* node) + +virtual void visit(const WhereNode* node) + +virtual void visit(const SequenceNode* node) + +virtual void visit(const AssembleNode* node) + +virtual void visit(const MultiNode* node) + +virtual void visit(const SuchThatNode* node) +} + + +IndexExprVisitorStrict <|-- IndexNotationVisitorStrict +IndexStmtVisitorStrict <|-- IndexNotationVisitorStrict +IndexNotationVisitorStrict <|-- IndexNotationVisitor +IndexNotationVisitorStrict <|-- IndexNotationPrinter +IndexNotationVisitor <|-- Matcher + +IndexExprVisitorStrict <|-- IndexExprRewriterStrict +IndexStmtVisitorStrict <|-- IndexStmtRewriterStrict +IndexExprRewriterStrict <|-- IndexNotationRewriterStrict +IndexStmtRewriterStrict <|-- IndexNotationRewriterStrict + +IndexNotationRewriterStrict <|-- IndexNotationRewriter + +' - private +' # protected +' ~ package private +' + public + +' {static} +' {abstract} virtual methods + +' lowering part -- convertion from IndexExpr and IndexStmt to ir::Expr and ir::Stmt +class Lowerer { + +std::shared_ptr impl; +} +abstract class LowererImpl { + ' protected fields and methods + #class Visitor; + #friend class Visitor; + #std::shared_ptr visitor; + + #virtual ir::Stmt lower(IndexStmt stmt); + #virtual ir::Expr lower(IndexExpr expr); + + #virtual ir::Expr lowerExpr(IndexExpr expr) = 0; + #virtual ir::Stmt lowerStmt(IndexStmt stmt) = 0; + + ' public fields and methods + +virtual ir::Stmt lower(IndexStmt stmt, std::string name, + bool assemble, bool compute, bool pack, bool unpack) = 0; +} + +class LowererImplImperative { + ' private fields and 
methods + -class Visitor + -fiend class Visitor + -std::shared_ptr visitor + -bool assemble + -bool compute + -vars a_bunch_of_other_fields + + ' protected fields and methods + #virtual ir::Stmt lowerExpr(IndexExpr expr); + #virtual ir::Stmt lowerStmt(IndexStmt stmt); + + ' public fields and methods + +ir::Stmt lower(IndexStmt stmt, std::string name, + bool assemble, bool compute, bool pack, bool unpack) + +} +note bottom of LowererImplImperative : Stmt LowererImplImperative::lower(IndexStmt stmt) {\n return visitor->lower(stmt);\n} + +Uncopyable <|-- LowererImpl +Lowerer "1" *-- "1" LowererImpl : contains + + +' visitor that does the lowering +class Visitor { + ' private fields and methods + -LowererImpl* impl + -Expr expr + -Stmt stmt + + -void visit(const AssignmentNode* node) + -void visit(const YieldNode* node) + -void visit(const ForallNode* node) + -void visit(const WhereNode* node) + -void visit(const MultiNode* node) + -void visit(const SuchThatNode* node) + -void visit(const SequenceNode* node) + -void visit(const AssembleNode* node) + -void visit(const AccessNode* node) + -void visit(const LiteralNode* node) + -void visit(const NegNode* node) + -void visit(const AddNode* node) + -void visit(const SubNode* node) + -void visit(const MulNode* node) + -void visit(const DivNode* node) + -void visit(const SqrtNode* node) + -void visit(const CastNode* node) + -void visit(const CallIntrinsicNode* node) + -void visit(const ReductionNode* node) + + ' public fields and methods + +Visitor(LowererImplImperative* impl) + +Stmt lower(IndexStmt stmt) + +Expr lower(IndexExpr expr) +} + +note bottom of Visitor: Stmt lower(IndexStmt stmt) {\n this->stmt = Stmt();\n impl->accessibleIterators.scope();\n IndexStmtVisitorStrict::visit(stmt);\n impl->accessibleIterators.unscope();\n return this->stmt;\n} + +IndexNotationVisitorStrict <|-- Visitor +LowererImpl "1" +-- "1" Visitor : contains +Visitor "1" *-- "1" LowererImpl : contains + +LowererImpl <|-- LowererImplImperative 
+LowererImplImperative "1" +-- "1" Visitor : contains +Visitor "1" *-- "1" LowererImplImperative : contains + +@enduml \ No newline at end of file diff --git a/test/test.cpp b/test/test.cpp index a49f10ff7..851493b7f 100644 --- a/test/test.cpp +++ b/test/test.cpp @@ -38,6 +38,20 @@ void ASSERT_TENSOR_EQ(TensorBase expected, TensorBase actual) { ASSERT_TRUE(equals(expected, actual)); } +// void ASSERT_TENSOR_VAL(TensorBase expected, TensorBase actual) { +// std::cout << "order: " << expected.getOrder(); +// std::vector modes{}; +// for (int mode = 0; mode < expected.getOrder(); mode++) { +// if (expected.getDimension(mode) != actual.getDimension(mode)) { +// ASSERT_TRUE(false); +// } + +// for (int i=0; i expected, void ASSERT_STORAGE_EQ(TensorStorage expected, TensorStorage actual); void ASSERT_TENSOR_EQ(TensorBase expected, TensorBase actual); +// void ASSERT_TENSOR_VAL(TensorBase expected, TensorBase actual); template void ASSERT_COMPONENTS_EQUALS(vector>> expectedIndices, diff --git a/test/tests-scheduling-eval.cpp b/test/tests-scheduling-eval.cpp index 93ba7b01e..4957418e0 100644 --- a/test/tests-scheduling-eval.cpp +++ b/test/tests-scheduling-eval.cpp @@ -12,6 +12,23 @@ #include "taco/index_notation/transformations.h" #include "codegen/codegen.h" #include "taco/lower/lower.h" +#include "taco/util/timers.h" + + +#define TOOL_BENCHMARK_TIMER(CODE,NAME,TIMER) { \ + if (time) { \ + taco::util::Timer timer; \ + timer.start(); \ + CODE; \ + timer.stop(); \ + taco::util::TimeResults result = timer.getResult(); \ + cout << NAME << " " << result << " ms" << endl; \ + TIMER=result; \ + } \ + else { \ + CODE; \ + } \ +} using namespace taco; const IndexVar i("i"), j("j"), k("k"), l("l"), m("m"), n("n"); @@ -52,7 +69,7 @@ IndexStmt scheduleSpMVISPC(IndexStmt stmt, int CHUNK_SIZE=16) { // return stmt; return stmt.split(i, i0, i1, CHUNK_SIZE) .reorder({i0, i1, j}) - .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces); + .parallelize(i0, 
ParallelUnit::CPUSimd, OutputRaceStrategy::NoRaces); } IndexStmt scheduleSpMMCPU(IndexStmt stmt, Tensor A, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { @@ -71,16 +88,42 @@ IndexStmt scheduleSpMMISPC1(IndexStmt stmt, Tensor A, int CHUNK_SIZE=16, .pos(j, jpos, A(i,j)) .split(jpos, jpos0, jpos1, UNROLL_FACTOR) .reorder({i0, i1, jpos0, k, jpos1}) - .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces) + // .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces) .parallelize(k, ParallelUnit::CPUSimd, OutputRaceStrategy::IgnoreRaces); } +IndexStmt scheduleSpMMISPC1_2(IndexStmt stmt, Tensor A, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { + IndexVar i0("i0"), i1("i1"), kbounded("kbounded"), k0("k0"), k1("k1"), jpos("jpos"), jpos0("jpos0"), jpos1("jpos1"); + return stmt.split(i, i0, i1, CHUNK_SIZE) + .pos(j, jpos, A(i,j)) + .split(jpos, jpos0, jpos1, UNROLL_FACTOR) + .reorder({i0, i1, jpos0, k, jpos1}) + // .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces) + .parallelize(i0, ParallelUnit::CPUSimd, OutputRaceStrategy::IgnoreRaces); +} + +IndexStmt scheduleSpMMISPC1_3(IndexStmt stmt, Tensor A, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { + IndexVar i0("i0"), i1("i1"), kbounded("kbounded"), k0("k0"), k1("k1"), jpos("jpos"), jpos0("jpos0"), jpos1("jpos1"); + return stmt.split(i, i0, i1, CHUNK_SIZE) + .pos(j, jpos, A(i,j)) + .split(jpos, jpos0, jpos1, UNROLL_FACTOR) + .reorder({i0, i1, jpos0, k, jpos1}) + // .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces) + .parallelize(i1, ParallelUnit::CPUSimd, OutputRaceStrategy::IgnoreRaces); +} + IndexStmt scheduleSpMMISPC2(IndexStmt stmt, Tensor A, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { IndexVar i0("i0"), i1("i1"), kbounded("kbounded"), k0("k0"), k1("k1"), jpos("jpos"), jpos0("jpos0"), jpos1("jpos1"); return stmt .parallelize(k, ParallelUnit::CPUSimd, OutputRaceStrategy::IgnoreRaces); } +IndexStmt scheduleSpMMISPC2_2(IndexStmt stmt, Tensor A, int CHUNK_SIZE=16, int 
UNROLL_FACTOR=8) { + IndexVar i0("i0"), i1("i1"), kbounded("kbounded"), k0("k0"), k1("k1"), jpos("jpos"), jpos0("jpos0"), jpos1("jpos1"); + return stmt + .parallelize(i, ParallelUnit::CPUSimd, OutputRaceStrategy::IgnoreRaces); +} + IndexStmt scheduleSpMMISPC3(IndexStmt stmt, Tensor A, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { IndexVar i0("i0"), i1("i1"), kbounded("kbounded"), k0("k0"), k1("k1"), jpos("jpos"), jpos0("jpos0"), jpos1("jpos1"); return stmt @@ -88,10 +131,21 @@ IndexStmt scheduleSpMMISPC3(IndexStmt stmt, Tensor A, int CHUNK_SIZE=16, // .pos(j, jpos, A(i,j)) // .split(jpos, jpos0, jpos1, UNROLL_FACTOR) .reorder({j, k}) - .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces) + // .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces) .parallelize(k, ParallelUnit::CPUSimd, OutputRaceStrategy::IgnoreRaces); } +IndexStmt scheduleSpMMISPC3_2(IndexStmt stmt, Tensor A, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { + IndexVar i0("i0"), i1("i1"), kbounded("kbounded"), k0("k0"), k1("k1"), jpos("jpos"), jpos0("jpos0"), jpos1("jpos1"); + return stmt + // .split(i, i0, i1, CHUNK_SIZE) + // .pos(j, jpos, A(i,j)) + // .split(jpos, jpos0, jpos1, UNROLL_FACTOR) + .reorder({j, k}) + // .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces) + .parallelize(i, ParallelUnit::CPUSimd, OutputRaceStrategy::IgnoreRaces); +} + IndexStmt scheduleSpGEMMCPU(IndexStmt stmt, bool doPrecompute) { Assignment assign = stmt.as().getStmt().as().getStmt() .as().getStmt().as(); @@ -145,6 +199,16 @@ IndexStmt scheduleSDDMMCPU(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, .parallelize(kpos1, ParallelUnit::CPUVector, OutputRaceStrategy::ParallelReduction); } +IndexStmt scheduleSDDMMISPC(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { + IndexVar i0("i0"), i1("i1"), kpos("kpos"), kpos0("kpos0"), kpos1("kpos1"); + return stmt.split(i, i0, i1, CHUNK_SIZE) + .pos(k, kpos, B(i,k)) + .split(kpos, kpos0, kpos1, UNROLL_FACTOR) + 
.reorder({i0, i1, kpos0, j, kpos1}) + // .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces); + .parallelize(kpos1, ParallelUnit::CPUSimd, OutputRaceStrategy::ParallelReduction); +} + IndexStmt scheduleSDDMMISPC1(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { IndexVar i0("i0"), i1("i1"), kpos("kpos"), kpos0("kpos0"), kpos1("kpos1"); return stmt.split(i, i0, i1, CHUNK_SIZE) @@ -175,6 +239,16 @@ IndexStmt scheduleTTVCPU(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16) { .parallelize(chunk, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces); } +IndexStmt scheduleTTVISPC(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16) { + IndexVar f("f"), fpos("fpos"), chunk("chunk"), fpos2("fpos2"); + return stmt; + // return stmt.fuse(i, j, f) + // .pos(f, fpos, B(i,j,k)) + // .split(fpos, chunk, fpos2, CHUNK_SIZE) + // .reorder({chunk, fpos2, k}) + // .parallelize(chunk, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces); +} + IndexStmt scheduleTTVCPUCSR(IndexStmt stmt) { TensorVar result = stmt.as().getStmt().as().getStmt() .as().getStmt().as().getLhs() @@ -635,6 +709,92 @@ TEST(scheduling_eval, spmmCPU) { ASSERT_TENSOR_EQ(expected, C); } +TEST(scheduling_eval, spmmISPC) { + taco::util::TimeResults timevalue; + bool time = true; + + set_ISPC_codegen_enabled(false); + set_CUDA_codegen_enabled(false); + + int NUM_I = 1021/10; + int NUM_J = 1039/10; + int NUM_K = 128; + float SPARSITY = .1; + Tensor A("A", {NUM_I, NUM_J}, CSR); + Tensor B("B", {NUM_J, NUM_K}, {Dense, Dense}); + Tensor C("C", {NUM_I, NUM_K}, {Dense, Dense}); + + srand(75883); + for (int i = 0; i < NUM_I; i++) { + for (int j = 0; j < NUM_J; j++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + if (rand_float < SPARSITY) { + A.insert({i, j}, (double) ((int) (rand_float*3/SPARSITY))); + } + } + } + + for (int j = 0; j < NUM_J; j++) { + for (int k = 0; k < NUM_K; k++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + B.insert({j, k}, (double) ((int) 
(rand_float*3/SPARSITY))); + } + } + + A.pack(); + B.pack(); + + set_ISPC_codegen_enabled(true); + C(i, k) = A(i, j) * B(j, k); + + IndexStmt stmt = C.getAssignment().concretize(); + // stmt = scheduleSpMMISPC1(stmt, A); + // stmt = scheduleSpMMISPC1_2(stmt, A); + stmt = scheduleSpMMISPC1_3(stmt, A); + + // stmt = scheduleSpMMISPC2(stmt, A); + // stmt = scheduleSpMMISPC2_2(stmt, A); + + // stmt = scheduleSpMMISPC3(stmt, A); + // stmt = scheduleSpMMISPC3_2(stmt, A); + + //printToFile("spmm_cpu", stmt); + + C.compile(stmt); + C.assemble(); + C.compute(); + + set_ISPC_codegen_enabled(false); + Tensor expected("expected", {NUM_I, NUM_K}, {Dense, Dense}); + expected(i, k) = A(i, j) * B(j, k); + IndexStmt stmt_taco = expected.getAssignment().concretize(); + stmt_taco = scheduleSpMMCPU(stmt_taco, A); + + expected.compile(stmt_taco); + expected.assemble(); + expected.compute(); + ASSERT_TENSOR_EQ(expected, C); + + float ERROR_MARGIN = 0.01; + // ASSERT_TENSOR_VAL(expected, y); + for (int i = 0; i < NUM_I; i++) { + for (int k = 0; k < NUM_K; k++) { + if (expected(i,k) <= C(i,k) + ERROR_MARGIN && expected(i,k) >= C(i,k) - ERROR_MARGIN) { + // std::cout << "matched values: expected -> " << expected(j) << " == " << y(j) << " <- actual\n"; + } + else { + std::cout << "unmatched values: expected -> " << expected(i,k) << " != " << C(i,k) << " <- actual\n"; + ASSERT_TRUE(false); + }; + } + } + + for (int i=0; i<10; i++) { + TOOL_BENCHMARK_TIMER(C.compute(), "Compute ISPC: ", timevalue); + TOOL_BENCHMARK_TIMER(expected.compute(), "Compute TACO: ", timevalue); + } +} + struct spgemm : public TestWithParam> {}; TEST_P(spgemm, scheduling_eval) { @@ -878,6 +1038,96 @@ TEST(scheduling_eval, sddmmCPU) { ASSERT_TENSOR_EQ(expected, A); } +// bin/taco-test --gtest_filter=scheduling_eval.sddmmISPC +TEST(scheduling_eval, sddmmISPC) { + + taco::util::TimeResults timevalue; + bool time = true; + + set_CUDA_codegen_enabled(false); + set_ISPC_codegen_enabled(false); + + int NUM_I = 1021/10; + int 
NUM_J = 1039/10; + int NUM_K = 1057/10; + float SPARSITY = .3; + Tensor A("A", {NUM_I, NUM_K}, {Dense, Dense}); + Tensor B("B", {NUM_I, NUM_K}, CSR); + Tensor C("C", {NUM_I, NUM_J}, {Dense, Dense}); + Tensor D("D", {NUM_J, NUM_K}, {Dense, Dense}); + + srand(268238); + for (int i = 0; i < NUM_I; i++) { + for (int j = 0; j < NUM_J; j++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + C.insert({i, j}, (double) ((int) (rand_float*3/SPARSITY))); + } + } + + for (int i = 0; i < NUM_I; i++) { + for (int k = 0; k < NUM_K; k++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + if (rand_float < SPARSITY) { + B.insert({i, k}, (double) ((int) (rand_float*3/SPARSITY))); + } + } + } + + for (int j = 0; j < NUM_J; j++) { + for (int k = 0; k < NUM_K; k++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + D.insert({j, k}, (double) ((int) (rand_float*3/SPARSITY))); + } + } + + B.pack(); + C.pack(); + D.pack(); + + set_ISPC_codegen_enabled(true); + A(i,k) = B(i,k) * C(i,j) * D(j,k); + + IndexStmt stmt = A.getAssignment().concretize(); + stmt = scheduleSDDMMISPC(stmt, B); + + //printToFile("sddmm_cpu", stmt); + + A.compile(stmt); + A.assemble(); + // A.compute(); + + set_ISPC_codegen_enabled(false); + Tensor expected("expected", {NUM_I, NUM_K}, {Dense, Dense}); + expected(i,k) = B(i,k) * C(i,j) * D(j,k); + IndexStmt stmt_taco = A.getAssignment().concretize(); + stmt_taco = scheduleSDDMMCPU(stmt_taco, B); + expected.compile(stmt_taco); + expected.assemble(); + // expected.compute(); + + TOOL_BENCHMARK_TIMER(A.compute(), "Compute ISPC: ", timevalue); + TOOL_BENCHMARK_TIMER(expected.compute(), "Compute TACO: ", timevalue); + + ASSERT_TENSOR_EQ(expected, A); + + + float ERROR_MARGIN = 0.01; + // ASSERT_TENSOR_VAL(expected, y); + for (int i = 0; i < NUM_I; i++) { + for (int k = 0; k < NUM_K; k++) { + if (expected(i,k) <= A(i,k) + ERROR_MARGIN && expected(i,k) >= A(i,k) - ERROR_MARGIN) { + // std::cout << "matched values: expected -> " << expected(j) << " == " << 
y(j) << " <- actual\n"; + } + else { + std::cout << "unmatched values: expected -> " << expected(i,k) << " != " << A(i,k) << " <- actual\n"; + ASSERT_TRUE(false); + }; + } + } + std::cout << "test scheduling_eval.sddmmISPC passed\n"; + +} + TEST(scheduling_eval, spmvCPU) { if (should_use_CUDA_codegen()) { return; @@ -926,6 +1176,100 @@ TEST(scheduling_eval, spmvCPU) { ASSERT_TENSOR_EQ(expected, y); } + +TEST(scheduling_eval, spmvISPC) { + + taco::util::TimeResults timevalue; + bool time = true; + + set_ISPC_codegen_enabled(false); + set_CUDA_codegen_enabled(false); + + int NUM_I = 200021/10; + int NUM_J = 200039/10; + float SPARSITY = .2; + Tensor A("A", {NUM_I, NUM_J}, CSR); + Tensor x("x", {NUM_J}, Format({Dense})); + Tensor y("y", {NUM_I}, Format({Dense})); + + srand(120); + for (int i = 0; i < NUM_I; i++) { + for (int j = 0; j < NUM_J; j++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + if (rand_float < SPARSITY) { + A.insert({i, j}, (double) ((int) (rand_float * 3 / SPARSITY))); + } + } + } + + for (int j = 0; j < NUM_J; j++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + x.insert({j}, (double) ((int) (rand_float*3/SPARSITY))); + } + + x.pack(); + A.pack(); + + set_ISPC_codegen_enabled(true); + + y(i) = A(i, j) * x(j); + + IndexStmt stmt = y.getAssignment().concretize(); + stmt = scheduleSpMVISPC(stmt); + + //printToFile("spmv_cpu", stmt); + + y.compile(stmt); + y.assemble(); + // y.compile(); + + set_ISPC_codegen_enabled(false); + + // Tensor expected("expected", {NUM_I}, Format({Dense})); + // expected(i) = A(i, j) * x(j); + // expected.compile(); + // expected.assemble(); + // expected.compute(); + + + Tensor expected("expected", {NUM_I}, Format({Dense})); + expected(i) = A(i, j) * x(j); + IndexStmt stmt_taco = expected.getAssignment().concretize(); + stmt_taco = scheduleSpMVCPU(stmt_taco); + + expected.compile(stmt_taco); + expected.assemble(); + // expected.compile(); + + + TOOL_BENCHMARK_TIMER(y.compute(), "Compute ISPC: ", 
timevalue); + TOOL_BENCHMARK_TIMER(expected.compute(), "Compute TACO: ", timevalue); + + + ASSERT_TENSOR_EQ(expected, y); + + float ERROR_MARGIN = 0.01; + // ASSERT_TENSOR_VAL(expected, y); + for (int j = 0; j < NUM_J; j++) { + if (expected(j) <= y(j) + ERROR_MARGIN && expected(j) >= y(j) - ERROR_MARGIN) { + // std::cout << "matched values: expected -> " << expected(j) << " == " << y(j) << " <- actual\n"; + } + else { + std::cout << "unmatched values: expected -> " << expected(j) << " != " << y(j) << " <- actual\n"; + ASSERT_TRUE(false); + }; + } + + std::cout << "test scheduling_eval.spmvISPC passed\n"; + + for (int i=0; i<10; i++) { + TOOL_BENCHMARK_TIMER(y.compute(), "Compute ISPC: ", timevalue); + TOOL_BENCHMARK_TIMER(expected.compute(), "Compute TACO: ", timevalue); + } + + +} + TEST(scheduling_eval, ttvCPU) { if (should_use_CUDA_codegen()) { return; @@ -977,6 +1321,65 @@ TEST(scheduling_eval, ttvCPU) { ASSERT_TENSOR_EQ(expected, A); } + +TEST(scheduling_eval, ttvISPC) { + if (should_use_CUDA_codegen()) { + return; + } + set_CUDA_codegen_enabled(false); + set_ISPC_codegen_enabled(false); + int NUM_I = 1021/10; + int NUM_J = 1039/10; + int NUM_K = 1057/10; + float SPARSITY = .3; + Tensor A("A", {NUM_I, NUM_J}, {Dense, Dense}); // TODO: change to sparse outputs + Tensor B("B", {NUM_I, NUM_J, NUM_K}, {Sparse, Sparse, Sparse}); + Tensor c("c", {NUM_K}, Format({Dense})); + + srand(9536); + for (int i = 0; i < NUM_I; i++) { + for (int j = 0; j < NUM_J; j++) { + for (int k = 0; k < NUM_K; k++) { + float rand_float = (float) rand() / (float) (RAND_MAX); + if (rand_float < SPARSITY) { + B.insert({i, j, k}, (double) ((int) (rand_float * 3 / SPARSITY))); + } + } + } + } + + for (int k = 0; k < NUM_K; k++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + c.insert({k}, (double) ((int) (rand_float*3))); + } + + B.pack(); + c.pack(); + + set_ISPC_codegen_enabled(true); + A(i,j) = B(i,j,k) * c(k); + + IndexStmt stmt = A.getAssignment().concretize(); + stmt = 
scheduleTTVISPC(stmt, B); + + //printToFile("ttv_cpu", stmt); + + A.compile(stmt); + A.assemble(); + A.compute(); + + set_ISPC_codegen_enabled(false); + Tensor expected("expected", {NUM_I, NUM_J}, {Dense, Dense}); + expected(i,j) = B(i,j,k) * c(k); + IndexStmt stmt_taco = expected.getAssignment().concretize(); + stmt_taco = scheduleTTVCPU(stmt_taco, B); + expected.compile(); + expected.assemble(); + expected.compute(); + ASSERT_TENSOR_EQ(expected, A); +} + + TEST(scheduling_eval, ttvCPU_CSR) { if (should_use_CUDA_codegen()) { return; @@ -1081,6 +1484,60 @@ TEST(scheduling_eval, ttmCPU) { ASSERT_TENSOR_EQ(expected, A); } +TEST(scheduling_eval, ttmISPC) { + if (should_use_CUDA_codegen()) { + return; + } + int NUM_I = 1021/40; + int NUM_J = 1039/40; + int NUM_K = 1057/40; + int NUM_L = 1232/40; + float SPARSITY = .1; + Tensor A("A", {NUM_I, NUM_J, NUM_L}, {Dense, Dense, Dense}); // TODO: change to sparse outputs + Tensor B("B", {NUM_I, NUM_J, NUM_K}, {Sparse, Sparse, Sparse}); + Tensor C("C", {NUM_K, NUM_L}, {Dense, Dense}); + + srand(935); + for (int i = 0; i < NUM_I; i++) { + for (int j = 0; j < NUM_J; j++) { + for (int k = 0; k < NUM_K; k++) { + float rand_float = (float) rand() / (float) (RAND_MAX); + if (rand_float < SPARSITY) { + B.insert({i, j, k}, (double) ((int) (rand_float * 3 / SPARSITY))); + } + } + } + } + + for (int k = 0; k < NUM_K; k++) { + for (int l = 0; l < NUM_L; l++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + C.insert({k, l}, (double) ((int) (rand_float*3))); + } + } + + B.pack(); + C.pack(); + + A(i,j,l) = B(i,j,k) * C(k,l); + + IndexStmt stmt = A.getAssignment().concretize(); + stmt = scheduleTTMCPU(stmt, B); + + //printToFile("ttm_cpu", stmt); + + A.compile(stmt); + A.assemble(); + A.compute(); + + Tensor expected("expected", {NUM_I, NUM_J, NUM_L}, {Dense, Dense, Dense}); + expected(i,j,l) = B(i,j,k) * C(k,l); + expected.compile(); + expected.assemble(); + expected.compute(); + ASSERT_TENSOR_EQ(expected, A); +} + 
TEST(scheduling_eval, mttkrpCPU) { if (should_use_CUDA_codegen()) { return; @@ -1143,6 +1600,69 @@ TEST(scheduling_eval, mttkrpCPU) { ASSERT_TENSOR_EQ(expected, A); } + +TEST(scheduling_eval, mttkrpISPC) { + if (should_use_CUDA_codegen()) { + return; + } + int NUM_I = 1021/20; + int NUM_J = 1039/20; + int NUM_K = 1057/20; + int NUM_L = 1232/20; + float SPARSITY = .1; + Tensor A("A", {NUM_I, NUM_J}, {Dense, Dense}); + Tensor B("B", {NUM_I, NUM_K, NUM_L}, {Dense, Sparse, Sparse}); + Tensor C("C", {NUM_K, NUM_J}, {Dense, Dense}); + Tensor D("D", {NUM_L, NUM_J}, {Dense, Dense}); + + srand(549694); + for (int i = 0; i < NUM_I; i++) { + for (int k = 0; k < NUM_K; k++) { + for (int l = 0; l < NUM_L; l++) { + float rand_float = (float) rand() / (float) (RAND_MAX); + if (rand_float < SPARSITY) { + B.insert({i, k, l}, (double) ((int) (rand_float * 3 / SPARSITY))); + } + } + } + } + + for (int k = 0; k < NUM_K; k++) { + for (int j = 0; j < NUM_J; j++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + C.insert({k, j}, (double) ((int) (rand_float*3))); + } + } + + for (int l = 0; l < NUM_L; l++) { + for (int j = 0; j < NUM_J; j++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + D.insert({l, j}, (double) ((int) (rand_float*3))); + } + } + + B.pack(); + C.pack(); + D.pack(); + + A(i,j) = B(i,k,l) * C(k,j) * D(l,j); + + IndexStmt stmt = A.getAssignment().concretize(); + stmt = scheduleMTTKRPCPU(stmt, B); + //printToFile("mttkrp_cpu", stmt); + + A.compile(stmt); + A.assemble(); + A.compute(); + + Tensor expected("expected", {NUM_I, NUM_J}, {Dense, Dense}); + expected(i,j) = B(i,k,l) * C(k,j) * D(l,j); + expected.compile(); + expected.assemble(); + expected.compute(); + ASSERT_TENSOR_EQ(expected, A); +} + TEST(scheduling_eval, spmvGPU) { if (!should_use_CUDA_codegen()) { return; @@ -2079,6 +2599,55 @@ TEST(generate_evaluation_files, cpu) { } } +TEST(generate_evaluation_files_spmv, ispc) { + set_CUDA_codegen_enabled(false); + set_ISPC_codegen_enabled(true); + + 
std::cout << "executing generate_evaluation_file.ispc\n"; + + int NUM_I = 100; + int NUM_J = 100; + + vector> spmv_parameters = {}; // {NNZ_PER_THREAD, BLOCK_SIZE} + for (int i = 3; i <= 20; i++) { + spmv_parameters.push_back({i, 512}); + } + + string file_ending_c = ".c"; + string file_ending_ispc = ".ispc"; + string file_path = "eval_prepared_ispc/spmv/"; + mkdir(file_path.c_str(), 0777); + + // spmv + { + stringstream source1; + stringstream source2; + std::shared_ptr codegen = ir::CodeGen::init_default(source1, source2, ir::CodeGen::ImplementationGen); + Tensor A("A", {NUM_I, NUM_J}, CSR); + Tensor x("x", {NUM_J}, Format({Dense})); + Tensor y("y", {NUM_I}, Format({Dense})); + IndexExpr precomputed = A(i, j) * x(j); + y(i) = precomputed; + IndexStmt stmt = y.getAssignment().concretize(); + bool isFirst = true; + for (auto paramSet : spmv_parameters) { + IndexStmt scheduled = scheduleSpMVCPU(stmt); + ir::Stmt compute = lower(scheduled, string("compute_") + util::join(paramSet, "_"), false, true); + codegen->compile(compute, isFirst); + isFirst = false; + } + ofstream source_file1; + source_file1.open(file_path + "spmv_ispc" + file_ending_c); + source_file1 << source1.str(); + source_file1.close(); + + ofstream source_file2; + source_file2.open(file_path + "__spmv_ispc" + file_ending_ispc); + source_file2 << source2.str(); + source_file2.close(); + } +} + TEST(generate_evaluation_files, gpu) { // if (!should_use_CUDA_codegen()) { // return; From a5c3a8cea4c8c736d7bf0c4cf976095cbed11401 Mon Sep 17 00:00:00 2001 From: Adhhitha Dias Date: Wed, 8 Sep 2021 10:26:47 -0400 Subject: [PATCH 06/10] add class diagram --- .gitignore | 1 + out/taco-uml/._taco.svg | Bin 0 -> 4096 bytes out/taco-uml/taco.svg | 878 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 879 insertions(+) create mode 100755 out/taco-uml/._taco.svg create mode 100644 out/taco-uml/taco.svg diff --git a/.gitignore b/.gitignore index 9abc3adc7..215b56e9a 100644 --- a/.gitignore +++ b/.gitignore @@ 
-14,4 +14,5 @@ doc apps/tensor_times_vector/tensor_times_vector .cache +.vscode compile_commands.json diff --git a/out/taco-uml/._taco.svg b/out/taco-uml/._taco.svg new file mode 100755 index 0000000000000000000000000000000000000000..e88dbd51b684b39e4ea0b0f4425ef9bc02f5d445 GIT binary patch literal 4096 zcmZQz6=P>$Vqox1Ojhs@R)|o50+1L3ClDJkFz{^v(m+1nBL)UWIUt(=a103vVwlIZ z9-@O`0Z_RBnifVNA1W@DoS&IntrusivePtrT *ptrUncopyableIRNodevirtual void accept(IRVisitorStrict *v) const = 0virtual IRNodeType type_info() const = 0;BaseStmtNodeBaseExprNodeDatatype typeStmtNodevoid accept(IRVisitorStrict *v) constExprNodevoid accept(IRVisitorStrict *v) constIRHandlevoid accept(IRVisitorStrict *v) constExprStmtIRVisitorStrictvirtual void visit(const IRNode*) const = 0IRVisitorvirtual void visit(const IRNode*)IRRewriterExpr exprStmt stmtvirtual void visit(const ExprNode* op)virtual void visit(const StmtNode* op)Expr rewrite(Expr)Stmt rewrite(Stmt)IRPrinterstd::ostream &streamstd::ostream &stream2int indentbool colorbool simplifyenum PrecedencePrecedence parentPrecedence = BOTTOMNameGenerator varNameGeneratorscopedMap<Expr, std::String> varNamesvoid doIndent()void printBinOp(Expr a, Expr b, std::string op, Precedence precedence)void fewMoreMethods()virtual void visit(const ExprNode*)virtual void visit(const StmtNode*)setColor(bool color)print(Stmt)IRVerifierExpressionSimplifierRemoveRedundantStatementsRemoveRedundantLoopsRemoveDuplicateBodyCodeGenCodeGen_CCodeGen_CUDACodeGen_ISPCManageableIndexStmtNodevirtual void accept(IndexStmtVisitorStrict*) const = 0IndexExprNodevirtual void accept(IndexStmtVisitorStrict*) const = 0IndexStmtIndexExprIndexExprVisitorStrictvoid visit(const IndexStmt&)virtual void visit(const AccessNode*) = 0virtual void visit(const LiteralNode*) = 0virtual void visit(const NegNode*) = 0virtual void visit(const AddNode*) = 0virtual void visit(const SubNode*) = 0virtual void visit(const MulNode*) = 0virtual void visit(const DivNode*) = 0virtual void visit(const 
SqrtNode*) = 0virtual void visit(const CastNode*) = 0virtual void visit(const CallIntrinsicNode*) = 0virtual void visit(const ReductionNode*) = 0IndexStmtVisitorStrictvoid visit(const IndexStmt&)virtual void visit(const AssignmentNode*) = 0virtual void visit(const YieldNode*) = 0virtual void visit(const ForallNode*) = 0virtual void visit(const WhereNode*) = 0virtual void visit(const SequenceNode*) = 0virtual void visit(const AssembleNode*) = 0virtual void visit(const MultiNode*) = 0virtual void visit(const SuchThatNode*) = 0IndexNotationVisitorStrictIndexNotationPrintervoid print(const IndexExpr& expr)void print(const IndexStmt& expr)void visit(const AccessNode* node)void visit(const LiteralNode* node)void visit(const NegNode* node)void visit(const AddNode* node)void visit(const SubNode* node)void visit(const MulNode* node)void visit(const DivNode* node)void visit(const SqrtNode* node)void visit(const CastNode* node)void visit(const CallIntrinsicNode* node)void visit(const UnaryExprNode* node)void visit(const BinaryExprNode* node)void visit(const ReductionNode* node)void visit(const AssignmentNode* node)void visit(const YieldNode* node)void visit(const ForallNode* node)void visit(const WhereNode* node)void visit(const SequenceNode* node)void visit(const AssembleNode* node)void visit(const MultiNode* node)void visit(const SuchThatNode* node)IndexNotationVisitorvirtual void visit(const AccessNode* node)virtual void visit(const LiteralNode* node)virtual void visit(const NegNode* node)virtual void visit(const AddNode* node)virtual void visit(const SubNode* node)virtual void visit(const MulNode* node)virtual void visit(const DivNode* node)virtual void visit(const SqrtNode* node)virtual void visit(const CastNode* node)virtual void visit(const CallIntrinsicNode* node)virtual void visit(const UnaryExprNode* node)virtual void visit(const BinaryExprNode* node)virtual void visit(const ReductionNode* node)virtual void visit(const AssignmentNode* node)virtual void visit(const 
YieldNode* node)virtual void visit(const ForallNode* node)virtual void visit(const WhereNode* node)virtual void visit(const SequenceNode* node)virtual void visit(const AssembleNode* node)virtual void visit(const MultiNode* node)virtual void visit(const SuchThatNode* node)MatcherIndexExprRewriterStrictIndexExpr exprIndexExpr rewrite(IndexExpr)virtual void visit(const AccessNode* op) = 0virtual void visit(const LiteralNode* op) = 0virtual void visit(const NegNode* op) = 0virtual void visit(const SqrtNode* op) = 0virtual void visit(const AddNode* op) = 0virtual void visit(const SubNode* op) = 0virtual void visit(const MulNode* op) = 0virtual void visit(const DivNode* op) = 0virtual void visit(const CastNode* op) = 0virtual void visit(const CallIntrinsicNode* op) = 0virtual void visit(const ReductionNode* op) = 0IndexStmtRewriterStrictIndexStmt stmtIndexStmt rewrite(IndexStmt)virtual void visit(const AssignmentNode* op) = 0virtual void visit(const YieldNode* op) = 0virtual void visit(const ForallNode* op) = 0virtual void visit(const WhereNode* op) = 0virtual void visit(const SequenceNode* op) = 0virtual void visit(const AssembleNode* op) = 0virtual void visit(const MultiNode* op) = 0virtual void visit(const SuchThatNode* op) = 0IndexNotationRewriterStrictIndexNotationRewritervirtual void visit(const AccessNode* node)virtual void visit(const LiteralNode* node)virtual void visit(const NegNode* node)virtual void visit(const AddNode* node)virtual void visit(const SubNode* node)virtual void visit(const MulNode* node)virtual void visit(const DivNode* node)virtual void visit(const SqrtNode* node)virtual void visit(const CastNode* node)virtual void visit(const CallIntrinsicNode* node)virtual void visit(const UnaryExprNode* node)virtual void visit(const BinaryExprNode* node)virtual void visit(const ReductionNode* node)virtual void visit(const AssignmentNode* node)virtual void visit(const YieldNode* node)virtual void visit(const ForallNode* node)virtual void visit(const 
WhereNode* node)virtual void visit(const SequenceNode* node)virtual void visit(const AssembleNode* node)virtual void visit(const MultiNode* node)virtual void visit(const SuchThatNode* node)Lowererstd::shared_ptr<LowererImpl> impl;LowererImplclass Visitor;friend class Visitor;std::shared_ptr<Visitor> visitor;virtual ir::Stmt lower(IndexStmt stmt);virtual ir::Expr lower(IndexExpr expr);virtual ir::Expr lowerExpr(IndexExpr expr) = 0;virtual ir::Stmt lowerStmt(IndexStmt stmt) = 0;virtual ir::Stmt lower(IndexStmt stmt, std::string name,bool assemble, bool compute, bool pack, bool unpack) = 0;LowererImplImperativeclass Visitorfiend class Visitorstd::shared_ptr<Visitor> visitorbool assemblebool computevars a_bunch_of_other_fieldsvirtual ir::Stmt lowerExpr(IndexExpr expr);virtual ir::Stmt lowerStmt(IndexStmt stmt);ir::Stmt lower(IndexStmt stmt, std::string name,bool assemble, bool compute, bool pack, bool unpack)Stmt LowererImplImperative::lower(IndexStmt stmt) {return visitor->lower(stmt);}VisitorLowererImpl* implExpr exprStmt stmtvoid visit(const AssignmentNode* node)void visit(const YieldNode* node)void visit(const ForallNode* node)void visit(const WhereNode* node)void visit(const MultiNode* node)void visit(const SuchThatNode* node)void visit(const SequenceNode* node)void visit(const AssembleNode* node)void visit(const AccessNode* node)void visit(const LiteralNode* node)void visit(const NegNode* node)void visit(const AddNode* node)void visit(const SubNode* node)void visit(const MulNode* node)void visit(const DivNode* node)void visit(const SqrtNode* node)void visit(const CastNode* node)void visit(const CallIntrinsicNode* node)void visit(const ReductionNode* node)Visitor(LowererImplImperative* impl)Stmt lower(IndexStmt stmt)Expr lower(IndexExpr expr)Stmt lower(IndexStmt stmt) {this->stmt = Stmt();impl->accessibleIterators.scope();IndexStmtVisitorStrict::visit(stmt);impl->accessibleIterators.unscope();return 
this->stmt;}contains111111contains11contains11contains11contains11contains11 \ No newline at end of file From 4a4a569f83b7acf5656eff290fd004c62bdc38b9 Mon Sep 17 00:00:00 2001 From: Adhhitha Dias Date: Wed, 8 Sep 2021 10:35:14 -0400 Subject: [PATCH 07/10] add ispc headers for binary search and fix compile errors --- include/taco/ir/ir.h | 2 +- src/codegen/codegen_ispc.cpp | 397 +++++++++++++++++++++-------------- src/codegen/codegen_ispc.h | 8 +- src/ir/ir_printer.cpp | 40 +++- 4 files changed, 277 insertions(+), 170 deletions(-) diff --git a/include/taco/ir/ir.h b/include/taco/ir/ir.h index 651faff4e..96dc7d034 100644 --- a/include/taco/ir/ir.h +++ b/include/taco/ir/ir.h @@ -591,7 +591,7 @@ struct Switch : public StmtNode { static const IRNodeType _type_info = IRNodeType::Switch; }; -enum class LoopKind {Serial, Static, Dynamic, Runtime, Vectorized, Static_Chunked, Foreach, Mul_Thread}; +enum class LoopKind {Serial, Static, Dynamic, Runtime, Vectorized, Static_Chunked, Foreach, Mul_Thread, Init}; /** A for loop from start to end by increment. * A vectorized loop will require the increment to be 1 and the diff --git a/src/codegen/codegen_ispc.cpp b/src/codegen/codegen_ispc.cpp index 237bc822d..d35af1748 100644 --- a/src/codegen/codegen_ispc.cpp +++ b/src/codegen/codegen_ispc.cpp @@ -145,8 +145,61 @@ const string cHeaders = " free(t);\n" "}\n" "#endif\n"; + +const string ispcHeaders = + "#define __TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b))\n" + "#define __TACO_MAX(_a,_b) ((_a) > (_b) ? 
(_a) : (_b))\n" + "#define __TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a)\n" + "int __cmp(const void *a, const void *b) {\n" + " return *((const int*)a) - *((const int*)b);\n" + "}\n" + "int __taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) {\n" + " if (array[arrayStart] >= target) {\n" + " return arrayStart;\n" + " }\n" + " int lowerBound = arrayStart; // always < target\n" + " int upperBound = arrayEnd; // always >= target\n" + " while (upperBound - lowerBound > 1) {\n" + " int mid = (upperBound + lowerBound) / 2;\n" + " int midValue = array[mid];\n" + " if (midValue < target) {\n" + " lowerBound = mid;\n" + " }\n" + " else if (midValue > target) {\n" + " upperBound = mid;\n" + " }\n" + " else {\n" + " return mid;\n" + " }\n" + " }\n" + " return upperBound;\n" + "}\n" + "int __taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) {\n" + " if (array[arrayEnd] <= target) {\n" + " return arrayEnd;\n" + " }\n" + " int lowerBound = arrayStart; // always <= target\n" + " int upperBound = arrayEnd; // always > target\n" + " while (upperBound - lowerBound > 1) {\n" + " int mid = (upperBound + lowerBound) / 2;\n" + " int midValue = array[mid];\n" + " if (midValue < target) {\n" + " lowerBound = mid;\n" + " }\n" + " else if (midValue > target) {\n" + " upperBound = mid;\n" + " }\n" + " else {\n" + " return mid;\n" + " }\n" + " }\n" + " return lowerBound;\n" + "}\n\n\n"; + } // anonymous namespace + + // find variables for generating declarations // generates a single var for each GetProperty class CodeGen_ISPC::FindVars : public IRVisitor { @@ -249,11 +302,10 @@ class CodeGen_ISPC::FindVars : public IRVisitor { // Finds all for loops tagged with accelerator and adds statements to deviceFunctions // Also tracks scope of when device function is called and // tracks which variables must be passed to function. 
-class CodeGen_ISPC::DeviceFunctionCollector : public IRVisitor { +class CodeGen_ISPC::FunctionCollector : public IRVisitor { public: - vector blockFors; vector threadFors; // contents is device function - vector warpFors; + vector initFors; // for loops to initialize statements map scopeMap; // the variables to pass to each device function @@ -271,7 +323,7 @@ class CodeGen_ISPC::DeviceFunctionCollector : public IRVisitor { CodeGen_ISPC *codeGen; // copy inputs and outputs into the map - DeviceFunctionCollector(vector inputs, vector outputs, CodeGen_ISPC *codeGen) : codeGen(codeGen) { + FunctionCollector(vector inputs, vector outputs, CodeGen_ISPC *codeGen) : codeGen(codeGen) { inDeviceFunction = false; for (auto v: inputs) { auto var = v.as(); @@ -310,7 +362,11 @@ class CodeGen_ISPC::DeviceFunctionCollector : public IRVisitor { } else if (op->parallel_unit == ParallelUnit::CPUSimd) { - + std::cout << "************************************************************************** CPUSimd For node\n"; + } + else if (op->kind == LoopKind::Init) { + std::cout << "************************************************************************* Init loop kind found\n"; + initFors.push_back(op); } else{ op->var.accept(this); @@ -376,6 +432,10 @@ void CodeGen_ISPC::compile(Stmt stmt, bool isFirst) { if (isFirst) { // output the headers out << cHeaders; + + if (&out != &out2) { + out2 << ispcHeaders; + } } out << endl; // generate code for the Stmt @@ -385,13 +445,13 @@ void CodeGen_ISPC::compile(Stmt stmt, bool isFirst) { -string CodeGen_ISPC::printCallISPCFunc(const Function *func, map varMap, +string CodeGen_ISPC::printCallISPCFunc(const std::string& funcName, map varMap, vector &sortedProps) { std::stringstream ret; ret << " "; unordered_set propsAlreadyGenerated; - ret << "__" << func->name << "("; + ret << "__" << funcName << "("; for (unsigned long i=0; i < sortedProps.size(); i++) { @@ -410,118 +470,71 @@ string CodeGen_ISPC::printCallISPCFunc(const Function *func, map 
varMap, vector &sortedProps) { - DeviceFunctionCollector deviceFunctionCollector(func->inputs, func->outputs, this); - func->body.accept(&deviceFunctionCollector); + FunctionCollector functionCollector(func->inputs, func->outputs, this); + func->body.accept(&functionCollector); - std::stringstream variables; vector inputs = func->inputs; vector outputs = func->outputs; unordered_set propsAlreadyGenerated; - for (unsigned long i=0; i < sortedProps.size(); i++) { - auto prop = sortedProps[i]; - bool isOutputProp = (find(outputs.begin(), outputs.end(), - prop->tensor) != outputs.end()); - - auto var = prop->tensor.as(); - if (var->is_parameter) { - if (isOutputProp) { - variables << " " << printTensorProperty(varMap[prop], prop, false) << ";" << endl; - } else { - break; - } + for (unsigned long i=0; i < sortedProps.size(); i++) { + auto prop = sortedProps[i]; + bool isOutputProp = (find(outputs.begin(), outputs.end(), + prop->tensor) != outputs.end()); + + auto var = prop->tensor.as(); + if (var->is_parameter) { + if (isOutputProp) { + funcVariables << " " << printTensorProperty(varMap[prop], prop, false) << ";" << endl; } else { - variables << getUnpackedTensorArgument(varMap[prop], prop, isOutputProp); + break; } - propsAlreadyGenerated.insert(varMap[prop]); + } else { + funcVariables << getUnpackedTensorArgument(varMap[prop], prop, isOutputProp); + } + propsAlreadyGenerated.insert(varMap[prop]); - if (i!=sortedProps.size()-1) { - variables << ", "; - } - if (i%2==0) { - variables << "\n\t"; - } + if (i!=sortedProps.size()-1) { + funcVariables << ", "; + } + if (i%2==0) { + funcVariables << "\n\t"; } + } resetUniqueNameCounters(); - for (size_t i = 0; i < deviceFunctionCollector.threadFors.size(); i++) { - const For *threadloop = to(deviceFunctionCollector.threadFors[i]); + // threadFors code generation + for (size_t i = 0; i < functionCollector.threadFors.size(); i++) { + + const For *threadloop = to(functionCollector.threadFors[i]); 
taco_iassert(threadloop->parallel_unit == ParallelUnit::CPUSpmd); Stmt function = threadloop->contents; std::cout << "threadloop function: " << function << std::endl; - out2 << "static task void __" << func->name << "__ ("; - out2 << variables.str(); + out2 << "\nstatic task void __" << func->name << "__ ("; + out2 << funcVariables.str(); out2 << "\n) {\n\n"; indent++; - doIndent(); - // output body + // output body of the threadloop + taskCode = true; print(threadloop); indent--; - out2 << "}\n"; - - out2 << "export void __" << func->name << "("; - out2 << variables.str(); - out2 << "\n) {\n\n"; - indent++; - doIndent(); - out2 << "launch[4] " << printCallISPCFunc(func, varMap, sortedProps) << "\n"; - indent--; - out2 << "}\n"; - - } - - if (deviceFunctionCollector.threadFors.size()==0) { - out2 << "export void __" << func->name << " ("; - out2 << variables.str(); - out2 << "\n) {\n\n"; + out2 << "}\n\n"; - indent++; - doIndent(); - // output body - print(func->body); - indent--; - out2 << "}\n"; } - // out2 << "export void "; - - // out2 << "__" << func->name << "("; + taskCode = false; + out2 << "export void __" << func->name << " ("; + out2 << funcVariables.str(); + out2 << "\n) {\n\n"; - // for (unsigned long i=0; i < sortedProps.size(); i++) { - // auto prop = sortedProps[i]; - // bool isOutputProp = (find(outputs.begin(), outputs.end(), - // prop->tensor) != outputs.end()); - - // auto var = prop->tensor.as(); - // if (var->is_parameter) { - // if (isOutputProp) { - // out2 << " " << printTensorProperty(varMap[prop], prop, false) << ";" << endl; - // } else { - // break; - // } - // } else { - // out2 << getUnpackedTensorArgument(varMap[prop], prop, isOutputProp); - // } - // propsAlreadyGenerated.insert(varMap[prop]); - - // if (i!=sortedProps.size()-1) { - // out2 << ", "; - // } - // if (i%2==0) { - // out2 << "\n\t"; - // } - // } - // out2 << "\n) {\n\n"; - - // indent++; - // doIndent(); - // // output body - // print(func->body); - // indent--; - // 
out2 << "}\n"; + indent++; + // output body + print(func->body); + indent--; + out2 << "}\n"; } @@ -535,6 +548,8 @@ void CodeGen_ISPC::sendToStream(std::stringstream &stream) { } void CodeGen_ISPC::visit(const Function* func) { + set_ISPC_code_stream_enabled(false); + // if generating a header, protect the function declaration with a guard if (func->name == "assemble") { if (outputKind == HeaderGen) { @@ -646,11 +661,11 @@ void CodeGen_ISPC::visit(const Function* func) { // Print variable declarations out << printDecls(varFinder.varDecls, func->inputs, func->outputs) << endl; - vector sortedProps; + sortedProps = {}; vector inputs = func->inputs; vector outputs = func->outputs; getSortedProps(varFinder.varDecls, sortedProps, inputs, outputs); - out << printCallISPCFunc(func, varFinder.varDecls, sortedProps); + out << printCallISPCFunc(func->name, varFinder.varDecls, sortedProps); if (emittingCoroutine) { out << printContextDeclAndInit(varMap, localVars, numYields, func->name) @@ -788,51 +803,84 @@ static string getAtomicPragma() { // Docs for vectorization pragmas: // http://clang.llvm.org/docs/LanguageExtensions.html#extensions-for-loop-hint-optimizations void CodeGen_ISPC::visit(const For* op) { - switch (op->kind) { - // TODO - add ISPC based multi threaded execution handling - case LoopKind::Vectorized: - case LoopKind::Static: - case LoopKind::Dynamic: - case LoopKind::Runtime: - case LoopKind::Static_Chunked: - case LoopKind::Mul_Thread: - // op->start.accept(this); - // stream2 << std::endl; - // op->start.accept(this); - // stream2 << std::endl; - // op->start.accept(this); - // stream2 << std::endl; - // op->start.accept(this); - // stream2 << std::endl; - // op->end.accept(this); - // stream2 << std::endl; - // op->end.accept(this); - // stream2 << std::endl; - // op->end.accept(this); - // stream2 << std::endl; - default: - break; + if (!is_ISPC_code_stream_enabled()) { + CodeGen::visit(op); + return; } - doIndent(); - if (op->kind == LoopKind::Foreach) 
{ - stream2 << keywordString("foreach") << " ("; - // if (!emittingCoroutine) { - // if (op->var.type() == Int32) { - // stream << "int32 "; - // } - // else if (op->var.type() == Int64) { - // stream << "int64 "; - // } + if (op->kind == LoopKind::Mul_Thread) { + if (!taskCode) { + out2 << "launch[4] " << printCallISPCFunc(funcName+"__", varMap, sortedProps) << "\n"; + return; + } + stream2 << "uniform unsigned int chunk_size = ("; + op->end.accept(this); + stream2 << " - "; + op->start.accept(this); + stream2 << ") / taskCount;\n"; + stream2 << " uniform unsigned int modulo = ("; + op->end.accept(this); + stream2 << " - "; + op->start.accept(this); + stream2 << ") % taskCount;\n"; + + stream2 << " uniform unsigned int start = "; + op->start.accept(this); + stream2 << " + chunk_size * taskIndex;\n"; + + stream2 << " if (taskIndex != 0) {\n"; + stream2 << " start += modulo;\n"; + stream2 << " }\n"; + + stream2 << " uniform unsigned int end = start + chunk_size;\n"; + stream2 << " if (taskIndex == 0) {\n"; + stream2 << " end += modulo;\n"; + stream2 << " }\n\n"; + + stream2 << keywordString(" for") << " ("; + if (!emittingCoroutine) { + if (op->var.type() == Int32) { + stream2 << "int32 "; + } + else if (op->var.type() == Int64) { + stream2 << "int64 "; + } - // } + } + op->var.accept(this); + stream2 << " = "; + stream2 << "start"; + // op->start.accept(this); + stream2 << keywordString("; "); + op->var.accept(this); + stream2 << " < "; + parentPrecedence = BOTTOM; + stream2 << "end"; + // op->end.accept(this); + stream2 << keywordString("; "); + op->var.accept(this); + + auto lit = op->increment.as(); + if (lit != nullptr && ((lit->type.isInt() && lit->equalsScalar(1)) || + (lit->type.isUInt() && lit->equalsScalar(1)))) { + stream2 << "++"; + } + else { + stream2 << " += "; + op->increment.accept(this); + } + + } + + else if (op->kind == LoopKind::Foreach) { + stream2 << keywordString("foreach") << " ("; + op->var.accept(this); stream2 << " = "; 
op->start.accept(this); stream2 << keywordString(" ... "); op->end.accept(this); - stream2 << ") {\n"; } else { stream2 << keywordString("for") << " ("; @@ -865,9 +913,10 @@ void CodeGen_ISPC::visit(const For* op) { stream2 << " += "; op->increment.accept(this); } - stream2 << ") {\n"; + } + stream2 << ") {\n"; op->contents.accept(this); doIndent(); stream2 << "}"; @@ -934,33 +983,69 @@ void CodeGen_ISPC::visit(const Max* op) { void CodeGen_ISPC::visit(const Allocate* op) { string elementType = printCType(op->var.type(), false); - doIndent(); - op->var.accept(this); - stream << " = ("; - stream << elementType << "*"; - stream << ")"; - if (op->is_realloc) { - stream << "realloc("; + + if (is_ISPC_code_stream_enabled()) { + op->var.accept(this); - stream << ", "; - } - else { - // If the allocation was requested to clear the allocated memory, - // use calloc instead of malloc. - if (op->clear) { - stream << "calloc(1, "; - } else { - stream << "malloc("; + stream2 << " = "; + // stream2 << " = ("; + // stream2 << elementType << "*"; + // stream2 << ")"; + if (op->is_realloc) { + stream2 << "realloc("; + op->var.accept(this); + stream2 << ", "; } - } - stream << "sizeof(" << elementType << ")"; - stream << " * "; - parentPrecedence = MUL; - op->num_elements.accept(this); - parentPrecedence = TOP; - stream << ");"; + else { + // If the allocation was requested to clear the allocated memory, + // use calloc instead of malloc. 
+ if (op->clear) { + stream2 << "calloc(1, "; + } else { + stream2 << "new "; + } + } + stream2 << elementType << "["; + parentPrecedence = MUL; + op->num_elements.accept(this); + parentPrecedence = TOP; + stream2 << "];"; + stream2 << endl; + + + } else { + + op->var.accept(this); + stream << " = ("; + stream << elementType << "*"; + stream << ")"; + if (op->is_realloc) { + stream << "realloc("; + op->var.accept(this); + stream << ", "; + } + else { + // If the allocation was requested to clear the allocated memory, + // use calloc instead of malloc. + if (op->clear) { + stream << "calloc(1, "; + } else { + stream << "malloc("; + } + } + stream << "sizeof(" << elementType << ")"; + stream << " * "; + parentPrecedence = MUL; + op->num_elements.accept(this); + parentPrecedence = TOP; + stream << ");"; stream << endl; + + + } + + } void CodeGen_ISPC::visit(const Sqrt* op) { diff --git a/src/codegen/codegen_ispc.h b/src/codegen/codegen_ispc.h index 08e73b252..2e440abc0 100644 --- a/src/codegen/codegen_ispc.h +++ b/src/codegen/codegen_ispc.h @@ -2,6 +2,7 @@ #define TACO_BACKEND_ISPC_H #include #include +#include #include "taco/ir/ir.h" #include "taco/ir/ir_printer.h" @@ -44,24 +45,27 @@ class CodeGen_ISPC : public CodeGen { void visit(const Assign*); Stmt simplifyFunctionBodies(Stmt stmt); - std::string printCallISPCFunc(const Function *func, std::map varMap, + std::string printCallISPCFunc(const std::string& funcName, std::map varMap, std::vector &sortedProps); void printISPCFunc(const Function *func, std::map varMap, std::vector &sortedProps); std::map varMap; std::vector localVars; + bool taskCode = false; std::ostream &out; std::ostream &out2; OutputKind outputKind; std::string funcName; + std::stringstream funcVariables; + std::vector sortedProps; int labelCount; bool emittingCoroutine; class FindVars; - class DeviceFunctionCollector; + class FunctionCollector; private: virtual std::string restrictKeyword() const { return "restrict"; } diff --git 
a/src/ir/ir_printer.cpp b/src/ir/ir_printer.cpp index ba2bc894b..fa224bde4 100644 --- a/src/ir/ir_printer.cpp +++ b/src/ir/ir_printer.cpp @@ -333,10 +333,18 @@ void IRPrinter::visit(const Cast* op) { } void IRPrinter::visit(const Call* op) { - stream << op->func << "("; - parentPrecedence = Precedence::CALL; - acceptJoin(this, stream, op->args, ", "); - stream << ")"; + if (!is_ISPC_code_stream_enabled()) { + stream << op->func << "("; + parentPrecedence = Precedence::CALL; + acceptJoin(this, stream, op->args, ", "); + stream << ")"; + } else { + // statically added function to the ispc file has __ in the front + stream2 << "__" << op->func << "("; + parentPrecedence = Precedence::CALL; + acceptJoin(this, stream2, op->args, ", "); + stream2 << ")"; + } } void IRPrinter::visit(const IfThenElse* op) { @@ -716,7 +724,7 @@ void IRPrinter::visit(const VarDecl* op) { } taco_iassert(isa(op->var)); if (to(op->var)->is_ptr) { - stream2 << "* restrict"; + stream2 << "* "; // removed restrict keyword from here } stream2 << " "; string varName = varNameGenerator.getUniqueName(util::toString(op->var)); @@ -829,12 +837,22 @@ void IRPrinter::visit(const Allocate* op) { } void IRPrinter::visit(const Free* op) { - doIndent(); - stream << "free("; - parentPrecedence = Precedence::TOP; - op->var.accept(this); - stream << ");"; - stream << endl; + if (is_ISPC_code_stream_enabled()) { + doIndent(); + stream2 << "delete[] "; + parentPrecedence = Precedence::TOP; + op->var.accept(this); + stream2 << ";"; + stream2 << endl; + } + else { + doIndent(); + stream << "free("; + parentPrecedence = Precedence::TOP; + op->var.accept(this); + stream << ");"; + stream << endl; + } } void IRPrinter::visit(const Comment* op) { From 8a42b2f226cece4a8da21f06e548fe46bfc2e124 Mon Sep 17 00:00:00 2001 From: Adhhitha Dias Date: Wed, 8 Sep 2021 10:37:00 -0400 Subject: [PATCH 08/10] add test kernels sddmm, mttkrp, ttv, etc.. 
--- test/tests-scheduling-eval.cpp | 727 +++++++++++++++++++++++++++++++-- 1 file changed, 695 insertions(+), 32 deletions(-) diff --git a/test/tests-scheduling-eval.cpp b/test/tests-scheduling-eval.cpp index 4957418e0..59debc88e 100644 --- a/test/tests-scheduling-eval.cpp +++ b/test/tests-scheduling-eval.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include "taco/cuda.h" #include "test.h" #include "test_tensors.h" @@ -57,6 +58,31 @@ void printToFile(string filename, IndexStmt stmt) { source_file.close(); } +void printToFile(string filename, string additional_filename, IndexStmt stmt) { + stringstream source1; + stringstream source2; + + string file_path = "eval_generated/"; + mkdir(file_path.c_str(), 0777); + + std::shared_ptr codegen = ir::CodeGen::init_default(source1, source2, ir::CodeGen::ImplementationGen); + ir::Stmt compute = lower(stmt, "compute", false, true); + codegen->compile(compute, true); + + ofstream source_file; + string file_ending = should_use_CUDA_codegen() ? 
".cu" : ".c"; + source_file.open(file_path+filename+file_ending); + source_file << source1.str(); + source_file.close(); + + ofstream additional_source_file; + string additional_file_ending = ".ispc"; + additional_source_file.open(file_path+additional_filename+additional_file_ending); + additional_source_file << source2.str(); + additional_source_file.close(); + +} + IndexStmt scheduleSpMVCPU(IndexStmt stmt, int CHUNK_SIZE=16) { IndexVar i0("i0"), i1("i1"), kpos("kpos"), kpos0("kpos0"), kpos1("kpos1"); return stmt.split(i, i0, i1, CHUNK_SIZE) @@ -92,6 +118,16 @@ IndexStmt scheduleSpMMISPC1(IndexStmt stmt, Tensor A, int CHUNK_SIZE=16, .parallelize(k, ParallelUnit::CPUSimd, OutputRaceStrategy::IgnoreRaces); } +IndexStmt scheduleSpMMISPCOMP1(IndexStmt stmt, Tensor A, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { + IndexVar i0("i0"), i1("i1"), kbounded("kbounded"), k0("k0"), k1("k1"), jpos("jpos"), jpos0("jpos0"), jpos1("jpos1"); + return stmt.split(i, i0, i1, CHUNK_SIZE) + .pos(j, jpos, A(i,j)) + .split(jpos, jpos0, jpos1, UNROLL_FACTOR) + .reorder({i0, i1, jpos0, k, jpos1}) + .parallelize(i0, ParallelUnit::CPUSpmd, OutputRaceStrategy::NoRaces) + .parallelize(k, ParallelUnit::CPUSimd, OutputRaceStrategy::IgnoreRaces); +} + IndexStmt scheduleSpMMISPC1_2(IndexStmt stmt, Tensor A, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { IndexVar i0("i0"), i1("i1"), kbounded("kbounded"), k0("k0"), k1("k1"), jpos("jpos"), jpos0("jpos0"), jpos1("jpos1"); return stmt.split(i, i0, i1, CHUNK_SIZE) @@ -199,6 +235,27 @@ IndexStmt scheduleSDDMMCPU(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, .parallelize(kpos1, ParallelUnit::CPUVector, OutputRaceStrategy::ParallelReduction); } +IndexStmt scheduleSDDMMCSRCPU(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { + IndexVar i0("i0"), i1("i1"), kpos("kpos"), kpos0("kpos0"), kpos1("kpos1"); + return stmt; + // return stmt.split(i, i0, i1, CHUNK_SIZE) + // .pos(k, kpos, B(i,k)) + // .split(kpos, kpos0, kpos1, UNROLL_FACTOR) + // 
.reorder({i0, i1, kpos0, j, kpos1}); + // .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces); + // .parallelize(k, ParallelUnit::CPUVector, OutputRaceStrategy::IgnoreRaces); +} + +IndexStmt scheduleSDDMM2CPU(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { + IndexVar i0("i0"), i1("i1"), jpos("jpos"), jpos0("jpos0"), jpos1("jpos1"); + return stmt.split(i, i0, i1, CHUNK_SIZE) + .pos(j, jpos, B(i,j)) + .split(jpos, jpos0, jpos1, UNROLL_FACTOR) + .reorder({i0, i1, jpos0, k, jpos1}) + .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces) + .parallelize(jpos1, ParallelUnit::CPUVector, OutputRaceStrategy::ParallelReduction); +} + IndexStmt scheduleSDDMMISPC(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { IndexVar i0("i0"), i1("i1"), kpos("kpos"), kpos0("kpos0"), kpos1("kpos1"); return stmt.split(i, i0, i1, CHUNK_SIZE) @@ -209,6 +266,16 @@ IndexStmt scheduleSDDMMISPC(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, .parallelize(kpos1, ParallelUnit::CPUSimd, OutputRaceStrategy::ParallelReduction); } +IndexStmt scheduleSDDMM2ISPC(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { + IndexVar i0("i0"), i1("i1"), jpos("jpos"), jpos0("jpos0"), jpos1("jpos1"); + return stmt.split(i, i0, i1, CHUNK_SIZE) + .pos(j, jpos, B(i,j)) + .split(jpos, jpos0, jpos1, UNROLL_FACTOR) + .reorder({i0, i1, jpos0, k, jpos1}) + // .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces); + .parallelize(jpos1, ParallelUnit::CPUSimd, OutputRaceStrategy::ParallelReduction); +} + IndexStmt scheduleSDDMMISPC1(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { IndexVar i0("i0"), i1("i1"), kpos("kpos"), kpos0("kpos0"), kpos1("kpos1"); return stmt.split(i, i0, i1, CHUNK_SIZE) @@ -241,12 +308,12 @@ IndexStmt scheduleTTVCPU(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16) { IndexStmt scheduleTTVISPC(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16) { IndexVar f("f"), fpos("fpos"), chunk("chunk"), 
fpos2("fpos2"); - return stmt; - // return stmt.fuse(i, j, f) - // .pos(f, fpos, B(i,j,k)) - // .split(fpos, chunk, fpos2, CHUNK_SIZE) - // .reorder({chunk, fpos2, k}) - // .parallelize(chunk, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces); + // return stmt; + return stmt.fuse(i, j, f) + .pos(f, fpos, B(i,j,k)) + .split(fpos, chunk, fpos2, CHUNK_SIZE) + .reorder({chunk, fpos2, k}) + .parallelize(chunk, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces); } IndexStmt scheduleTTVCPUCSR(IndexStmt stmt) { @@ -258,6 +325,25 @@ IndexStmt scheduleTTVCPUCSR(IndexStmt stmt) { OutputRaceStrategy::NoRaces); } +IndexStmt scheduleTTVCPUCSR_ST(IndexStmt stmt) { + TensorVar result = stmt.as().getStmt().as().getStmt() + .as().getStmt().as().getLhs() + .getTensorVar(); + return stmt.assemble(result, AssembleStrategy::Insert); +} + +IndexStmt scheduleTTVISPCCSR(IndexStmt stmt) { + TensorVar result = stmt.as().getStmt().as().getStmt() + .as().getStmt().as().getLhs() + .getTensorVar(); + return stmt.assemble(result, AssembleStrategy::Insert) + .parallelize(i, ParallelUnit::CPUSimd, OutputRaceStrategy::NoRaces); +} + +IndexStmt scheduleTTVISPCCSR2(IndexStmt stmt) { + return stmt; +} + IndexStmt scheduleTTMCPU(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { IndexVar f("f"), fpos("fpos"), chunk("chunk"), fpos2("fpos2"), kpos("kpos"), kpos1("kpos1"), kpos2("kpos2"); return stmt.fuse(i, j, f) @@ -282,12 +368,47 @@ IndexStmt scheduleMTTKRPCPU(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, .parallelize(i1, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces); } +IndexStmt scheduleMTTKRPCPU_ST(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { + IndexVar i1("i1"), i2("i2"); + IndexExpr precomputeExpr = stmt.as().getStmt().as().getStmt() + .as().getStmt().as().getStmt() + .as().getRhs().as().getA(); + TensorVar w("w", Type(Float64, {Dimension(j)}), taco::dense); + return stmt.split(i, i1, i2, CHUNK_SIZE) + .reorder({i1, i2, k, l, j}) + 
.precompute(precomputeExpr, j, j, w); + // .parallelize(j, ParallelUnit::CPUVector, OutputRaceStrategy::Atomics); // gives error when lowering for IgnoreRaces, NoRaces and Atomics + // .parallelize(i1, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces); +} + +IndexStmt scheduleMTTKRPISPC(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { + IndexVar i1("i1"), i2("i2"); + IndexExpr precomputeExpr = stmt.as().getStmt().as().getStmt() + .as().getStmt().as().getStmt() + .as().getRhs().as().getA(); + TensorVar w("w", Type(Float64, {Dimension(j)}), taco::dense); + return stmt.split(i, i1, i2, CHUNK_SIZE) + .reorder({i1, i2, k, l, j}) + .precompute(precomputeExpr, j, j, w) + .parallelize(j, ParallelUnit::CPUSimd, OutputRaceStrategy::NoRaces); +} + IndexStmt scheduleMTTKRPPrecomputedCPU(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { IndexVar i1("i1"), i2("i2"), j_pre("j_pre"); return stmt.split(i, i1, i2, CHUNK_SIZE) .parallelize(i1, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces); } +IndexStmt scheduleMTTKRPPrecomputedCPU_ST(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { + IndexVar i1("i1"), i2("i2"), j_pre("j_pre"); + return stmt.split(i, i1, i2, CHUNK_SIZE); +} + +IndexStmt scheduleMTTKRPPrecomputedISPC_ST(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { + IndexVar i1("i1"), i2("i2"), j_pre("j_pre"); + return stmt.parallelize(j, ParallelUnit::CPUSimd, OutputRaceStrategy::NoRaces); +} + IndexStmt scheduleMTTKRP4CPU(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { IndexVar i1("i1"), i2("i2"); return stmt.split(i, i1, i2, CHUNK_SIZE) @@ -295,6 +416,19 @@ IndexStmt scheduleMTTKRP4CPU(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16 .parallelize(i1, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces); } +IndexStmt scheduleMTTKRP4CPU_ST(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { + IndexVar i1("i1"), i2("i2"); + return stmt.split(i, i1, i2, CHUNK_SIZE) 
+ .reorder({i1, i2, k, l, m, j}); +} + +IndexStmt scheduleMTTKRP4ISPC_ST(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { + IndexVar i1("i1"), i2("i2"); + return stmt.split(i, i1, i2, CHUNK_SIZE) + .reorder({i1, i2, k, l, m, j}) + .parallelize(j, ParallelUnit::CPUSimd, OutputRaceStrategy::NoRaces); +} + IndexStmt scheduleMTTKRP5CPU(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { IndexVar i1("i1"), i2("i2"); return stmt.split(i, i1, i2, CHUNK_SIZE) @@ -1024,7 +1158,7 @@ TEST(scheduling_eval, sddmmCPU) { IndexStmt stmt = A.getAssignment().concretize(); stmt = scheduleSDDMMCPU(stmt, B); - //printToFile("sddmm_cpu", stmt); + printToFile("sddmm_cpu_ryan2", stmt); A.compile(stmt); A.assemble(); @@ -1038,6 +1172,126 @@ TEST(scheduling_eval, sddmmCPU) { ASSERT_TENSOR_EQ(expected, A); } + +TEST(scheduling_eval, sddmmcsrCPU) { + if (should_use_CUDA_codegen()) { + return; + } + int NUM_I = 1021/10; + int NUM_J = 1039/10; + int NUM_K = 1057/10; + float SPARSITY = .3; + Tensor A("A", {NUM_I, NUM_K}, CSR); + Tensor B("B", {NUM_I, NUM_K}, CSR); + Tensor C("C", {NUM_I, NUM_J}, {Dense, Dense}); + Tensor D("D", {NUM_J, NUM_K}, {Dense, Dense}); + + srand(268238); + for (int i = 0; i < NUM_I; i++) { + for (int j = 0; j < NUM_J; j++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + C.insert({i, j}, (double) ((int) (rand_float*3/SPARSITY))); + } + } + + for (int i = 0; i < NUM_I; i++) { + for (int k = 0; k < NUM_K; k++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + if (rand_float < SPARSITY) { + B.insert({i, k}, (double) ((int) (rand_float*3/SPARSITY))); + } + } + } + + for (int j = 0; j < NUM_J; j++) { + for (int k = 0; k < NUM_K; k++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + D.insert({j, k}, (double) ((int) (rand_float*3/SPARSITY))); + } + } + + B.pack(); + C.pack(); + D.pack(); + + A(i,k) = B(i,k) * C(i,j) * D(j,k); + + IndexStmt stmt = A.getAssignment().concretize(); + stmt = scheduleSDDMMCSRCPU(stmt, B); + 
+ printToFile("sddmm_cpu", stmt); + + A.compile(stmt); + A.assemble(); + A.compute(); + + Tensor expected("expected", {NUM_I, NUM_K}, CSR); + expected(i,k) = B(i,k) * C(i,j) * D(j,k); + + IndexStmt stmt_ref = expected.getAssignment().concretize(); + printToFile("sddmm_cpu_ref", stmt_ref); + + expected.compile(stmt_ref); + expected.assemble(); + expected.compute(); + ASSERT_TENSOR_EQ(expected, A); +} + + +TEST(scheduling_eval, sddmm2CPU) { + if (should_use_CUDA_codegen()) { + return; + } + int NUM_I = 1021/10; + int NUM_J = 1021/10; + int NUM_K = 18; + float SPARSITY = .3; + Tensor Y("Y", {NUM_I, NUM_J}, CSR); + Tensor A("A", {NUM_I, NUM_J}, CSR); + Tensor X("X", {NUM_I, NUM_K}, {Dense, Dense}); + + srand(268238); + + for (int i = 0; i < NUM_I; i++) { + for (int j = 0; j < NUM_J; j++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + if (rand_float < SPARSITY) { + A.insert({i, j}, (double) ((int) (rand_float*3/SPARSITY))); + } + } + } + + for (int i = 0; i < NUM_J; i++) { + for (int k = 0; k < NUM_K; k++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + X.insert({i, k}, (double) ((int) (rand_float*3/SPARSITY))); + } + } + + A.pack(); + X.pack(); + + Y(i,j) = A(i,j) * X(i,k) * X(j,k); + + IndexStmt stmt = A.getAssignment().concretize(); + // stmt = scheduleSDDMMCPU(stmt, B); + + //printToFile("sddmm_cpu", stmt); + + A.compile(stmt); + A.assemble(); + A.compute(); + + Tensor expected("expected", {NUM_I, NUM_J}, {Dense, Dense}); + expected(i,j) = A(i,j) * X(i,k) * X(j,k); + expected.compile(); + expected.assemble(); + expected.compute(); + ASSERT_TENSOR_EQ(expected, A); +} + + + // bin/taco-test --gtest_filter=scheduling_eval.sddmmISPC TEST(scheduling_eval, sddmmISPC) { @@ -1128,6 +1382,89 @@ TEST(scheduling_eval, sddmmISPC) { } + +// bin/taco-test --gtest_filter=scheduling_eval.sddmmISPC +TEST(scheduling_eval, sddmm2ISPC) { + + taco::util::TimeResults timevalue; + bool time = true; + + set_CUDA_codegen_enabled(false); + 
set_ISPC_codegen_enabled(false); + + int NUM_I = 1021/10; + int NUM_K = 1039/10; + int NUM_J = 1021/10; + float SPARSITY = .3; + Tensor A("A", {NUM_I, NUM_J}, {Dense, Dense}); + Tensor B("B", {NUM_I, NUM_J}, CSR); + Tensor C("C", {NUM_I, NUM_K}, {Dense, Dense}); + + srand(268238); + for (int i = 0; i < NUM_I; i++) { + for (int k = 0; k < NUM_K; k++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + C.insert({i, k}, (double) ((int) (rand_float*3/SPARSITY))); + } + } + + for (int i = 0; i < NUM_I; i++) { + for (int j = 0; j < NUM_J; j++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + if (rand_float < SPARSITY) { + B.insert({i, j}, (double) ((int) (rand_float*3/SPARSITY))); + } + } + } + + B.pack(); + C.pack(); + + set_ISPC_codegen_enabled(true); + A(i,j) = B(i,j) * C(i,k) * C(j,k); + + IndexStmt stmt = A.getAssignment().concretize(); + stmt = scheduleSDDMM2ISPC(stmt, B); + + //printToFile("sddmm_cpu", stmt); + + A.compile(stmt); + A.assemble(); + // A.compute(); + + set_ISPC_codegen_enabled(false); + Tensor expected("expected", {NUM_I, NUM_J}, {Dense, Dense}); + expected(i,j) = B(i,j) * C(i,k) * C(j,k); + IndexStmt stmt_taco = A.getAssignment().concretize(); + stmt_taco = scheduleSDDMM2CPU(stmt_taco, B); + expected.compile(stmt_taco); + expected.assemble(); + // expected.compute(); + + TOOL_BENCHMARK_TIMER(A.compute(), "Compute ISPC: ", timevalue); + TOOL_BENCHMARK_TIMER(expected.compute(), "Compute TACO: ", timevalue); + + ASSERT_TENSOR_EQ(expected, A); + + + float ERROR_MARGIN = 0.01; + // ASSERT_TENSOR_VAL(expected, y); + for (int i = 0; i < NUM_I; i++) { + for (int j = 0; j < NUM_J; j++) { + if (expected(i,j) <= A(i,j) + ERROR_MARGIN && expected(i,j) >= A(i,j) - ERROR_MARGIN) { + // std::cout << "matched values: expected -> " << expected(j) << " == " << y(j) << " <- actual\n"; + } + else { + std::cout << "unmatched values: expected -> " << expected(i,j) << " != " << A(i,j) << " <- actual\n"; + ASSERT_TRUE(false); + }; + } + } + std::cout << 
"test scheduling_eval.sddmmISPC passed\n"; + +} + + TEST(scheduling_eval, spmvCPU) { if (should_use_CUDA_codegen()) { return; @@ -1215,9 +1552,9 @@ TEST(scheduling_eval, spmvISPC) { y(i) = A(i, j) * x(j); IndexStmt stmt = y.getAssignment().concretize(); - stmt = scheduleSpMVISPC(stmt); + // stmt = scheduleSpMVISPC(stmt); - //printToFile("spmv_cpu", stmt); + printToFile("spmv_cpu", stmt); y.compile(stmt); y.assemble(); @@ -1307,7 +1644,7 @@ TEST(scheduling_eval, ttvCPU) { IndexStmt stmt = A.getAssignment().concretize(); stmt = scheduleTTVCPU(stmt, B); - //printToFile("ttv_cpu", stmt); + printToFile("ttv_cpu", stmt); A.compile(stmt); A.assemble(); @@ -1362,7 +1699,7 @@ TEST(scheduling_eval, ttvISPC) { IndexStmt stmt = A.getAssignment().concretize(); stmt = scheduleTTVISPC(stmt, B); - //printToFile("ttv_cpu", stmt); + printToFile("ttv_ispc", "__ttv_ispc", stmt); A.compile(stmt); A.assemble(); @@ -1390,7 +1727,7 @@ TEST(scheduling_eval, ttvCPU_CSR) { int NUM_K = 1057/10; float SPARSITY = .3; Tensor A("A", {NUM_I, NUM_J}, {Dense, Sparse}); - Tensor B("B", {NUM_I, NUM_J, NUM_K}, {Sparse, Sparse, Sparse}); + Tensor B("B", {NUM_I, NUM_J, NUM_K}, {Dense, Sparse, Sparse}); Tensor c("c", {NUM_K}, Format({Dense})); srand(9536); @@ -1418,11 +1755,13 @@ TEST(scheduling_eval, ttvCPU_CSR) { IndexStmt stmt = A.getAssignment().concretize(); stmt = scheduleTTVCPUCSR(stmt); + printToFile("ttv_cpu_csr", stmt); + A.compile(stmt); A.assemble(); A.compute(); - Tensor expected("expected", {NUM_I, NUM_J}, {Dense, Dense}); + Tensor expected("expected", {NUM_I, NUM_J}, {Dense, Sparse}); expected(i,j) = B(i,j,k) * c(k); expected.compile(); expected.assemble(); @@ -1430,6 +1769,82 @@ TEST(scheduling_eval, ttvCPU_CSR) { ASSERT_TENSOR_EQ(expected, A); } +TEST(scheduling_eval, ttvISPC_CSR) { + if (should_use_CUDA_codegen()) { + return; + } + + int NUM_I = 10000; + int NUM_J = 1039/10; + int NUM_K = 128; + float SPARSITY = .3; + Tensor A("A", {NUM_I, NUM_J}, {Dense, Sparse}); + Tensor B("B", 
{NUM_I, NUM_J, NUM_K}, {Dense, Sparse, Sparse}); + Tensor c("c", {NUM_K}, Format({Dense})); + + srand(9536); + for (int i = 0; i < NUM_I; i++) { + for (int j = 0; j < NUM_J; j++) { + for (int k = 0; k < NUM_K; k++) { + float rand_float = (float) rand() / (float) (RAND_MAX); + if (rand_float < SPARSITY) { + B.insert({i, j, k}, (double) ((int) (rand_float * 3 / SPARSITY))); + } + } + } + } + + for (int k = 0; k < NUM_K; k++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + c.insert({k}, (double) ((int) (rand_float*3))); + } + + B.pack(); + c.pack(); + + set_ISPC_codegen_enabled(true); + A(i,j) = B(i,j,k) * c(k); + + IndexStmt stmt = A.getAssignment().concretize(); + stmt = scheduleTTVISPCCSR(stmt); + printToFile("ttv_ispc_csr", "__ttv_ispc_csr", stmt); + + A.compile(stmt); + A.assemble(); + A.compute(); + + set_ISPC_codegen_enabled(false); + Tensor expected("expected", {NUM_I, NUM_J}, {Dense, Sparse}); + expected(i,j) = B(i,j,k) * c(k); + IndexStmt taco_stmt = expected.getAssignment().concretize(); + taco_stmt = scheduleTTVCPUCSR_ST(taco_stmt); + expected.compile(taco_stmt); + expected.assemble(); + expected.compute(); + ASSERT_TENSOR_EQ(expected, A); + + Tensor A2("A2", {NUM_I, NUM_J}, {Dense, Sparse}); + set_ISPC_codegen_enabled(true); + A2(i,j) = B(i,j,k) * c(k); + + IndexStmt stmt2 = A2.getAssignment().concretize(); + + A2.compile(stmt2); + A2.assemble(); + A2.compute(); + + taco::util::TimeResults timevalue; + bool time = true; + + for (int i=0; i<3; i++) { + TOOL_BENCHMARK_TIMER(expected.compute(), "Compute TACO1: ", timevalue); + TOOL_BENCHMARK_TIMER(A.compute(), "Compute ISPC1: ", timevalue); + TOOL_BENCHMARK_TIMER(A2.compute(), "Compute ISPC2: ", timevalue); + } + + +} + TEST(scheduling_eval, ttmCPU) { if (should_use_CUDA_codegen()) { return; @@ -1605,12 +2020,13 @@ TEST(scheduling_eval, mttkrpISPC) { if (should_use_CUDA_codegen()) { return; } - int NUM_I = 1021/20; - int NUM_J = 1039/20; + set_ISPC_codegen_enabled(false); + 
set_CUDA_codegen_enabled(false); + int NUM_I = 10000; // 1021/20; + int NUM_J = 256; int NUM_K = 1057/20; int NUM_L = 1232/20; float SPARSITY = .1; - Tensor A("A", {NUM_I, NUM_J}, {Dense, Dense}); Tensor B("B", {NUM_I, NUM_K, NUM_L}, {Dense, Sparse, Sparse}); Tensor C("C", {NUM_K, NUM_J}, {Dense, Dense}); Tensor D("D", {NUM_L, NUM_J}, {Dense, Dense}); @@ -1645,24 +2061,183 @@ TEST(scheduling_eval, mttkrpISPC) { C.pack(); D.pack(); - A(i,j) = B(i,k,l) * C(k,j) * D(l,j); + set_ISPC_codegen_enabled(true); - IndexStmt stmt = A.getAssignment().concretize(); - stmt = scheduleMTTKRPCPU(stmt, B); - //printToFile("mttkrp_cpu", stmt); + Tensor A1("A1", {NUM_I, NUM_J}, {Dense, Dense}); + A1(i,j) = B(i,k,l) * C(k,j) * D(l,j); + IndexStmt stmt1 = A1.getAssignment().concretize(); + stmt1 = scheduleMTTKRPISPC(stmt1, B); + // printToFile("mttkrp1_cpu_ispc", stmt1); + A1.compile(stmt1); + A1.assemble(); + A1.compute(); - A.compile(stmt); - A.assemble(); - A.compute(); + set_ISPC_codegen_enabled(false); + Tensor expected1("expected1", {NUM_I, NUM_J}, {Dense, Dense}); + expected1(i,j) = B(i,k,l) * C(k,j) * D(l,j); + IndexStmt taco_stmt1 = expected1.getAssignment().concretize(); + taco_stmt1 = scheduleMTTKRPCPU(taco_stmt1, B); + expected1.compile(taco_stmt1); + expected1.assemble(); + expected1.compute(); + ASSERT_TENSOR_EQ(expected1, A1); - Tensor expected("expected", {NUM_I, NUM_J}, {Dense, Dense}); - expected(i,j) = B(i,k,l) * C(k,j) * D(l,j); - expected.compile(); - expected.assemble(); - expected.compute(); - ASSERT_TENSOR_EQ(expected, A); + set_ISPC_codegen_enabled(true); + Tensor A2("A2", {NUM_I, NUM_J}, {Dense, Dense}); + A2(i,j) = B(i,k,l) * C(k,j) * D(l,j); + IndexStmt stmt2 = A1.getAssignment().concretize(); + stmt2 = scheduleMTTKRPPrecomputedISPC_ST(stmt2, B); + // printToFile("mttkrp_cpu_ispc", stmt); + A2.compile(stmt2); + A2.assemble(); + A2.compute(); + ASSERT_TENSOR_EQ(expected1, A2); + + set_ISPC_codegen_enabled(false); + Tensor expected2("expected2", {NUM_I, NUM_J}, 
{Dense, Dense}); + expected2(i,j) = B(i,k,l) * C(k,j) * D(l,j); + IndexStmt taco_stmt2 = expected2.getAssignment().concretize(); + taco_stmt2 = scheduleMTTKRPPrecomputedCPU_ST(taco_stmt2, B); + expected2.compile(taco_stmt2); + expected2.assemble(); + expected2.compute(); + ASSERT_TENSOR_EQ(expected1, expected2); + + taco::util::TimeResults timevalue; + bool time = true; + + for (int i=0; i<3; i++) { + TOOL_BENCHMARK_TIMER(expected1.compute(), "Compute TACO1: ", timevalue); + TOOL_BENCHMARK_TIMER(A1.compute(), "Compute ISPC1: ", timevalue); + TOOL_BENCHMARK_TIMER(expected2.compute(), "Compute TACO2: ", timevalue); + TOOL_BENCHMARK_TIMER(A2.compute(), "Compute ISPC2: ", timevalue); + } } + +TEST(scheduling_eval, mttkrp4ISPC) { + if (should_use_CUDA_codegen()) { + return; + } + set_ISPC_codegen_enabled(false); + set_CUDA_codegen_enabled(false); + int NUM_I = 1000; // 1021/20; + int NUM_J = 16; + int NUM_K = 1057/20; + int NUM_L = 1232/20; + int NUM_M = 1124/20; + float SPARSITY = .1; + Tensor B("B", {NUM_I, NUM_K, NUM_L, NUM_M}, {Dense, Sparse, Sparse, Sparse}); + Tensor C("C", {NUM_K, NUM_J}, {Dense, Dense}); + Tensor D("D", {NUM_L, NUM_J}, {Dense, Dense}); + Tensor E("E", {NUM_M, NUM_J}, {Dense, Dense}); + + srand(549694); + for (int i = 0; i < NUM_I; i++) { + for (int k = 0; k < NUM_K; k++) { + for (int l = 0; l < NUM_L; l++) { + for (int m = 0; m < NUM_M; m++) { + float rand_float = (float) rand() / (float) (RAND_MAX); + if (rand_float < SPARSITY) { + B.insert({i, k, l, m}, (double) ((int) (rand_float * 3 / SPARSITY))); + } + } + } + } + } + + for (int k = 0; k < NUM_K; k++) { + for (int j = 0; j < NUM_J; j++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + C.insert({k, j}, (double) ((int) (rand_float*3))); + } + } + + for (int l = 0; l < NUM_L; l++) { + for (int j = 0; j < NUM_J; j++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + D.insert({l, j}, (double) ((int) (rand_float*3))); + } + } + + for (int m = 0; m < NUM_M; m++) { + for (int j = 
0; j < NUM_J; j++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + E.insert({m, j}, (double) ((int) (rand_float*3))); + } + } + + B.pack(); + C.pack(); + D.pack(); + E.pack(); + + set_ISPC_codegen_enabled(true); + Tensor A1("A1", {NUM_I, NUM_J}, {Dense, Dense}); + A1(i,j) = B(i,k,l,m) * C(k,j) * D(l,j) * E(m,j); + IndexStmt stmt1 = A1.getAssignment().concretize(); + stmt1 = scheduleMTTKRP4ISPC_ST(stmt1, B); + // printToFile("mttkrp1_cpu_ispc", stmt1); + A1.compile(stmt1); + A1.assemble(); + A1.compute(); + + set_ISPC_codegen_enabled(false); + Tensor expected1("expected1", {NUM_I, NUM_J}, {Dense, Dense}); + expected1(i,j) = B(i,k,l,m) * C(k,j) * D(l,j) * E(m,j); + IndexStmt taco_stmt1 = expected1.getAssignment().concretize(); + taco_stmt1 = scheduleMTTKRP4CPU_ST(taco_stmt1, B); + expected1.compile(taco_stmt1); + expected1.assemble(); + expected1.compute(); + ASSERT_TENSOR_EQ(expected1, A1); + + // set_ISPC_codegen_enabled(true); + // Tensor A2("A2", {NUM_I, NUM_J}, {Dense, Dense}); + // A2(i,j) = B(i,k,l) * C(k,j) * D(l,j); + // IndexStmt stmt2 = A1.getAssignment().concretize(); + // stmt2 = scheduleMTTKRPPrecomputedISPC_ST(stmt2, B); + // // printToFile("mttkrp_cpu_ispc", stmt); + // A2.compile(stmt2); + // A2.assemble(); + // A2.compute(); + // ASSERT_TENSOR_EQ(expected1, A2); + + set_ISPC_codegen_enabled(false); + Tensor expected2("expected2", {NUM_I, NUM_J}, {Dense, Dense}); + expected2(i,j) = B(i,k,l,m) * C(k,j) * D(l,j) * E(m,j); + + IndexExpr BE = B(i,k,l,m) * E(m,j); + IndexExpr BDE = BE * D(l, j); + expected2(i,j) = BDE * C(k,j); + IndexStmt taco_stmt2 = expected2.getAssignment().concretize(); + TensorVar BE_workspace("BE_workspace", Type(Float64, {Dimension(j)}), taco::dense); + TensorVar BDE_workspace("BDE_workspace", Type(Float64, {Dimension(j)}), taco::dense); + + IndexStmt precomputed_stmt = forall(i, forall(k, + where(forall(j, expected2(i,j) += BDE_workspace(j) * C(k,j)), + forall(l, where(forall(j, BDE_workspace(j) += BE_workspace(j) * 
D(l,j)), + forall(m, forall(j, BE_workspace(j) += B(i,k,l,m) * E(m,j)))))))); + + // IndexStmt scheduled2 = scheduleMTTKRPPrecomputedCPU(precomputed_stmt, B, 64); + // expected2.compile(scheduled2); + // expected2.assemble(); + // expected2.compute(); + // ASSERT_TENSOR_EQ(expected1, expected2); + + taco::util::TimeResults timevalue; + bool time = true; + + for (int i=0; i<3; i++) { + TOOL_BENCHMARK_TIMER(expected1.compute(), "Compute TACO1: ", timevalue); + TOOL_BENCHMARK_TIMER(A1.compute(), "Compute ISPC1: ", timevalue); + // TOOL_BENCHMARK_TIMER(expected2.compute(), "Compute TACO2: ", timevalue); + // TOOL_BENCHMARK_TIMER(A2.compute(), "Compute ISPC2: ", timevalue); + } +} + + + TEST(scheduling_eval, spmvGPU) { if (!should_use_CUDA_codegen()) { return; @@ -2042,7 +2617,7 @@ TEST(scheduling_eval, mttkrpGPU) { ASSERT_TENSOR_EQ(expected, A); } -TEST(generate_ispc_evaluation_files, ispc) { +TEST(generate_evaluation_files, ispc) { std::cout << "Hi Adhitha!\n" << std::endl ; set_CUDA_codegen_enabled(false); set_ISPC_codegen_enabled(true); @@ -2063,6 +2638,7 @@ TEST(generate_ispc_evaluation_files, ispc) { int NUM_I = 100; int NUM_J = 100; int NUM_K = 100; + int NUM_L = 100; string c_file_ending = ".h"; string file_ending = ".ispc"; @@ -2130,7 +2706,35 @@ TEST(generate_ispc_evaluation_files, ispc) { ispc_source_file.close(); } - // spmm + // spmm omp + { + stringstream source1; + stringstream source2; + std::shared_ptr codegen = ir::CodeGen::init_default(source1, source2, ir::CodeGen::ImplementationGen); + Tensor A("A", {NUM_I, NUM_J}, CSR); + Tensor X("X", {NUM_J, NUM_K}, {Dense, Dense}); + Tensor Y("Y", {NUM_I, NUM_K}, {Dense, Dense}); + Y(i, k) = A(i, j) * X(j, k); + IndexStmt stmt = Y.getAssignment().concretize(); + bool isFirst = true; + for (auto paramSet : spmm_parameters) { + IndexStmt scheduled = scheduleSpMMISPCOMP1(stmt, A, paramSet[0], paramSet[1]); + ir::Stmt compute = lower(scheduled, string("compute1_") + util::join(paramSet, "_"), false, true); + 
codegen->compile(compute, isFirst); + isFirst = false; + } + ofstream source_file; + source_file.open(file_path + "spmm_omp_ispc_taco1" + c_file_ending); + source_file << source1.str(); + source_file.close(); + + ofstream ispc_source_file; + ispc_source_file.open(file_path + "__spmm_omp_ispc_taco1" + file_ending); + ispc_source_file << source2.str(); + ispc_source_file.close(); + } + + // spmm2 { stringstream source1; stringstream source2; @@ -2186,6 +2790,64 @@ TEST(generate_ispc_evaluation_files, ispc) { ispc_source_file.close(); } + // ttv + { + stringstream source; + stringstream source2; + std::shared_ptr codegen = ir::CodeGen::init_default(source, source2, ir::CodeGen::ImplementationGen); + Tensor A("A", {NUM_I, NUM_J}, {Dense, Dense}); // TODO: change to sparse outputs + Tensor B("B", {NUM_I, NUM_J, NUM_K}, {Sparse, Sparse, Sparse}); + Tensor c("c", {NUM_K}, Format({Dense})); + A(i,j) = B(i,j,k) * c(k); + IndexStmt stmt = A.getAssignment().concretize(); + bool isFirst = true; + for (auto paramSet : ttv_parameters) { + IndexStmt scheduled = scheduleTTVCPU(stmt, B, paramSet[0]); + ir::Stmt compute = lower(scheduled, string("compute_") + util::join(paramSet, "_"), false, true); + codegen->compile(compute, isFirst); + isFirst = false; + } + ofstream source_file; + source_file.open(file_path + "ttv_cpu" + c_file_ending); + source_file << source.str(); + source_file.close(); + + ofstream ispc_source_file; + ispc_source_file.open(file_path + "__ttv_cpu" + file_ending); + ispc_source_file << source2.str(); + ispc_source_file.close(); + } + + + // mttkrp3 + { + stringstream source; + stringstream source2; + std::shared_ptr codegen = ir::CodeGen::init_default(source, source2, ir::CodeGen::ImplementationGen); + Tensor A("A", {NUM_I, NUM_J}, {Dense, Dense}); + Tensor B("B", {NUM_I, NUM_K, NUM_L}, {Dense, Sparse, Sparse}); + Tensor C("C", {NUM_K, NUM_J}, {Dense, Dense}); + Tensor D("D", {NUM_L, NUM_J}, {Dense, Dense}); + A(i,j) = B(i,k,l) * C(k,j) * D(l,j); + IndexStmt 
stmt = A.getAssignment().concretize(); + bool isFirst = true; + for (auto paramSet : mttkrp_parameters) { + IndexStmt scheduled = scheduleMTTKRPCPU(stmt, B, paramSet[0], paramSet[1]); + ir::Stmt compute = lower(scheduled, string("compute_") + util::join(paramSet, "_"), false, true); + codegen->compile(compute, isFirst); + isFirst = false; + } + ofstream source_file; + source_file.open(file_path + "mttkrp3_cpu" + c_file_ending); + source_file << source.str(); + source_file.close(); + + ofstream ispc_source_file; + ispc_source_file.open(file_path + "__mttkrp3_cpu" + file_ending); + ispc_source_file << source2.str(); + ispc_source_file.close(); + } + return; } @@ -2283,6 +2945,7 @@ TEST(generate_ispc_sddmm_evaluation_files, ispc) { + TEST(generate_evaluation_files, cpu) { if (should_use_CUDA_codegen()) { return; @@ -2599,7 +3262,7 @@ TEST(generate_evaluation_files, cpu) { } } -TEST(generate_evaluation_files_spmv, ispc) { +TEST(generate_evaluation_files, spmv_ispc) { set_CUDA_codegen_enabled(false); set_ISPC_codegen_enabled(true); From 09864add784e06ca0b6eee6728ea0e11923f2540 Mon Sep 17 00:00:00 2001 From: Adhhitha Dias Date: Thu, 3 Mar 2022 14:08:23 -0500 Subject: [PATCH 09/10] fuse kernel implementation --- CMakeLists.txt | 41 +- include/taco/codegen/module.h | 12 +- include/taco/index_notation/transformations.h | 3 +- include/taco/taco_tensor_t.h | 1 + include/taco/tensor.h | 2 + src/codegen/codegen_c.cpp | 22 +- src/codegen/codegen_c.h | 29 +- src/codegen/codegen_ispc.cpp | 112 +- src/codegen/codegen_ispc.h | 13 +- src/codegen/module.cpp | 60 +- src/index_notation/index_notation.cpp | 19 +- src/index_notation/transformations.cpp | 684 ++- src/ir/ir_printer.cpp | 2 +- src/ir/ir_rewriter.cpp | 2 +- src/lower/iteration_graph.cpp | 10 + src/lower/iterator.cpp | 3 + src/lower/lowerer_impl_imperative.cpp | 176 +- src/lower/tensor_path.h | 4 +- src/tensor.cpp | 62 +- test/CMakeLists.txt | 1 + test/kernels/mttkrp_gemm/mttkrp_ryan.c | 177 + 
test/kernels/mttkrp_gemm/mttkrp_ryan.h | 125 + test/kernels/mttkrp_gemm/taco_default.c | 183 + test/kernels/mttkrp_gemm/taco_default.h | 125 + .../sddmm_spmm/csr_dense_dense_sddmm.c | 199 + .../sddmm_spmm/csr_dense_dense_sddmm.h | 125 + .../sddmm_spmm/csr_dense_dense_sddmm.so | Bin 0 -> 14360 bytes test/kernels/sddmm_spmm/csr_dense_spmm.c | 190 + test/kernels/sddmm_spmm/csr_dense_spmm.h | 125 + test/kernels/sddmm_spmm/csr_dense_spmm.so | Bin 0 -> 14520 bytes test/kernels/sddmm_spmm/fused_kernel.c | 183 + test/kernels/sddmm_spmm/fused_kernel.h | 125 + test/kernels/sddmm_spmm/fused_kernel.so | Bin 0 -> 14512 bytes test/kernels/sddmm_spmm/sddmm_ryan.c | 210 + test/kernels/sddmm_spmm/sddmm_ryan.h | 125 + test/kernels/sddmm_spmm/sddmm_ryan.so | Bin 0 -> 14352 bytes test/kernels/sddmm_spmm/taco_original.c | 166 + test/kernels/sddmm_spmm/taco_original.h | 125 + test/kernels/sddmm_spmm/taco_original.so | Bin 0 -> 14304 bytes test/kernels/spmm_gemm/gemm_default.c | 160 + test/kernels/spmm_gemm/gemm_default.h | 125 + test/kernels/spmm_gemm/gemm_default.so | Bin 0 -> 14296 bytes test/kernels/spmm_gemm/gemm_template.c | 183 + test/kernels/spmm_gemm/gemm_template.h | 125 + test/kernels/spmm_gemm/gemm_template.so | Bin 0 -> 14512 bytes test/kernels/spmv_spmv/spmv_fused.c | 178 + test/kernels/spmv_spmv/spmv_fused.h | 125 + test/kernels/spmv_spmv/spmv_fused.so | Bin 0 -> 14152 bytes test/kernels/spmv_spmv/spmv_spmv_default.c | 157 + test/kernels/spmv_spmv/spmv_spmv_default.h | 125 + test/kernels/ttm_ttm/fused copy.c | 248 + test/kernels/ttm_ttm/fused.c | 242 + test/kernels/ttm_ttm/fused.h | 125 + test/kernels/ttm_ttm/fused.so | Bin 0 -> 14560 bytes test/kernels/ttm_ttm/gemm.c | 181 + test/kernels/ttm_ttm/gemm.h | 125 + test/kernels/ttm_ttm/ttm1_1.c | 219 + test/kernels/ttm_ttm/ttm1_1.h | 125 + test/kernels/ttm_ttm/ttm1_1.so | Bin 0 -> 14400 bytes test/kernels/ttm_ttm/ttm1_2.c | 219 + test/kernels/ttm_ttm/ttm1_2.h | 125 + test/kernels/ttm_ttm/ttm1_2.so | Bin 0 -> 14400 bytes 
test/kernels/ttm_ttm/ttm2.c | 218 + test/kernels/ttm_ttm/ttm2.h | 125 + test/kernels/ttm_ttm/ttm2.so | Bin 0 -> 14400 bytes test/kernels/ttm_ttm/ttm_original copy 2.c | 242 + test/kernels/ttm_ttm/ttm_original copy.c | 225 + test/kernels/ttm_ttm/ttm_original.c | 226 + test/kernels/ttm_ttm/ttm_original.h | 125 + test/kernels/ttm_ttm/ttm_original.so | Bin 0 -> 14408 bytes test/kernels/ttm_ttm/ttm_original2.c | 229 + test/kernels/ttm_ttm/ttm_original2.h | 125 + test/kernels/ttm_ttm/ttm_original2.so | Bin 0 -> 14568 bytes test/stats/hadamard-gemm.txt | 749 +++ test/stats/mttkrp-spmm.txt | 1090 ++++ test/stats/sddmm-spmm-gemm.txt | 1153 ++++ test/stats/sddmm-spmm.txt | 5174 +++++++++++++++++ test/stats/spmm-spmm.txt | 3432 +++++++++++ test/stats/spmv-spmv.txt | 81 + test/stats/ttm-ttm.txt | 2924 ++++++++++ test/tests-indexstmt.cpp | 194 +- test/tests-scheduling-eval.cpp | 241 +- test/tests-scheduling-fuse.cpp | 2872 +++++++++ test/tests-scheduling-ispc-eval.cpp | 2 + test/tests-transformation.cpp | 2 + test/util.h | 113 + tools/CMakeLists.txt | 1 + tools/taco.cpp | 44 +- 88 files changed, 25490 insertions(+), 325 deletions(-) create mode 100644 test/kernels/mttkrp_gemm/mttkrp_ryan.c create mode 100644 test/kernels/mttkrp_gemm/mttkrp_ryan.h create mode 100644 test/kernels/mttkrp_gemm/taco_default.c create mode 100644 test/kernels/mttkrp_gemm/taco_default.h create mode 100644 test/kernels/sddmm_spmm/csr_dense_dense_sddmm.c create mode 100644 test/kernels/sddmm_spmm/csr_dense_dense_sddmm.h create mode 100755 test/kernels/sddmm_spmm/csr_dense_dense_sddmm.so create mode 100644 test/kernels/sddmm_spmm/csr_dense_spmm.c create mode 100644 test/kernels/sddmm_spmm/csr_dense_spmm.h create mode 100755 test/kernels/sddmm_spmm/csr_dense_spmm.so create mode 100644 test/kernels/sddmm_spmm/fused_kernel.c create mode 100644 test/kernels/sddmm_spmm/fused_kernel.h create mode 100755 test/kernels/sddmm_spmm/fused_kernel.so create mode 100644 test/kernels/sddmm_spmm/sddmm_ryan.c create mode 
100644 test/kernels/sddmm_spmm/sddmm_ryan.h create mode 100755 test/kernels/sddmm_spmm/sddmm_ryan.so create mode 100644 test/kernels/sddmm_spmm/taco_original.c create mode 100644 test/kernels/sddmm_spmm/taco_original.h create mode 100755 test/kernels/sddmm_spmm/taco_original.so create mode 100644 test/kernels/spmm_gemm/gemm_default.c create mode 100644 test/kernels/spmm_gemm/gemm_default.h create mode 100755 test/kernels/spmm_gemm/gemm_default.so create mode 100644 test/kernels/spmm_gemm/gemm_template.c create mode 100644 test/kernels/spmm_gemm/gemm_template.h create mode 100755 test/kernels/spmm_gemm/gemm_template.so create mode 100644 test/kernels/spmv_spmv/spmv_fused.c create mode 100644 test/kernels/spmv_spmv/spmv_fused.h create mode 100755 test/kernels/spmv_spmv/spmv_fused.so create mode 100644 test/kernels/spmv_spmv/spmv_spmv_default.c create mode 100644 test/kernels/spmv_spmv/spmv_spmv_default.h create mode 100644 test/kernels/ttm_ttm/fused copy.c create mode 100644 test/kernels/ttm_ttm/fused.c create mode 100644 test/kernels/ttm_ttm/fused.h create mode 100755 test/kernels/ttm_ttm/fused.so create mode 100644 test/kernels/ttm_ttm/gemm.c create mode 100644 test/kernels/ttm_ttm/gemm.h create mode 100644 test/kernels/ttm_ttm/ttm1_1.c create mode 100644 test/kernels/ttm_ttm/ttm1_1.h create mode 100755 test/kernels/ttm_ttm/ttm1_1.so create mode 100644 test/kernels/ttm_ttm/ttm1_2.c create mode 100644 test/kernels/ttm_ttm/ttm1_2.h create mode 100755 test/kernels/ttm_ttm/ttm1_2.so create mode 100644 test/kernels/ttm_ttm/ttm2.c create mode 100644 test/kernels/ttm_ttm/ttm2.h create mode 100755 test/kernels/ttm_ttm/ttm2.so create mode 100644 test/kernels/ttm_ttm/ttm_original copy 2.c create mode 100644 test/kernels/ttm_ttm/ttm_original copy.c create mode 100644 test/kernels/ttm_ttm/ttm_original.c create mode 100644 test/kernels/ttm_ttm/ttm_original.h create mode 100755 test/kernels/ttm_ttm/ttm_original.so create mode 100644 test/kernels/ttm_ttm/ttm_original2.c create 
mode 100644 test/kernels/ttm_ttm/ttm_original2.h create mode 100755 test/kernels/ttm_ttm/ttm_original2.so create mode 100644 test/stats/hadamard-gemm.txt create mode 100644 test/stats/mttkrp-spmm.txt create mode 100644 test/stats/sddmm-spmm-gemm.txt create mode 100644 test/stats/sddmm-spmm.txt create mode 100644 test/stats/spmm-spmm.txt create mode 100644 test/stats/spmv-spmv.txt create mode 100644 test/stats/ttm-ttm.txt create mode 100644 test/tests-scheduling-fuse.cpp create mode 100644 test/tests-scheduling-ispc-eval.cpp create mode 100644 test/util.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 7e9359e01..aff905db5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -12,7 +12,7 @@ project(taco option(CUDA "Build for NVIDIA GPU (CUDA must be preinstalled)" OFF) option(ISPC "Build for Intel ISPC Compiler (ISPC Compiler must be preinstalled)" OFF) option(PYTHON "Build TACO for python environment" OFF) -option(OPENMP "Build with OpenMP execution support" OFF) +option(OPENMP "Build with OpenMP execution support" ON) option(COVERAGE "Build with code coverage analysis" OFF) set(TACO_FEATURE_CUDA 0) set(TACO_FEATURE_ISPC 0) @@ -95,6 +95,39 @@ if(OPENMP) set(C_CXX_FLAGS "-fopenmp ${C_CXX_FLAGS}") endif(OPENMP) +set(PAPI_DIR "/home/min/a/kadhitha/workspace/my_taco/papi/src/install/") + +find_path(PAPI_DIR + NAMES include/papi.h +) + +find_library(PAPI_LIBRARIES + # Pick the static library first for easier run-time linking. 
+ NAMES libpapi.a papi + HINTS ${PAPI_DIR}/lib ${HILTIDEPS}/lib +) + +find_path(PAPI_INCLUDE_DIRS + NAMES papi.h + HINTS ${PAPI_DIR}/include ${HILTIDEPS}/include +) + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(PAPI DEFAULT_MSG + PAPI_LIBRARIES + PAPI_INCLUDE_DIRS +) + +mark_as_advanced( + PAPI_PREFIX_DIRS + PAPI_LIBRARIES + PAPI_INCLUDE_DIRS +) + +include_directories(${PAPI_INCLUDE_DIRS}) + +# project (ValgrindExample) + if(COVERAGE) find_program(PATH_TO_GCOVR gcovr REQUIRED) # add coverage tooling to build flags @@ -104,7 +137,8 @@ if(COVERAGE) message("-- Code coverage analysis (gcovr) enabled") endif(COVERAGE) -set(C_CXX_FLAGS "${C_CXX_FLAGS}") +set(C_CXX_FLAGS "${C_CXX_FLAGS} -I/${PAPI_DIR}/include -L/${PAPI_DIR}/lib") +# set(C_CXX_FLAGS "${C_CXX_FLAGS}") set(CMAKE_C_FLAGS "${C_CXX_FLAGS}") set(CMAKE_CXX_FLAGS "${C_CXX_FLAGS} -std=c++14") @@ -117,6 +151,9 @@ set(TACO_INCLUDE_DIR ${TACO_PROJECT_DIR}/include) enable_testing() include_directories(${TACO_INCLUDE_DIR}) +# include_directories("/home/min/a/kadhitha/workspace/my_taco/valgrind") +# project (ValgrindExample) +# include (CTest) set(TACO_LIBRARY_DIR ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}) diff --git a/include/taco/codegen/module.h b/include/taco/codegen/module.h index 3df7c8e0f..4db5fcdaf 100644 --- a/include/taco/codegen/module.h +++ b/include/taco/codegen/module.h @@ -17,7 +17,7 @@ class Module { public: /// Create a module for some target Module(Target target=getTargetFromEnvironment()) - : lib_handle(nullptr), moduleFromUserSource(false), target(target) { + : lib_handle(nullptr), so_lib_handle(nullptr), moduleFromUserSource(false), target(target) { setJITLibname(); setJITTmpdir(); } @@ -44,11 +44,16 @@ class Module { /// before calling. If there's no function of this name then a nullptr is /// returned. 
void* getFuncPtr(std::string name); + void* getFuncPtr(std::string& sofile, std::string name); /// Call a raw function in this module and return the result + int callFuncPackedRaw(std::string name, std::string& sofile, void** args); int callFuncPackedRaw(std::string name, void** args); /// Call a raw function in this module and return the result + int callFuncPackedRaw(std::string name, std::string& sofile, std::vector args) { + return callFuncPackedRaw(name, sofile, args.data()); + } int callFuncPackedRaw(std::string name, std::vector args) { return callFuncPackedRaw(name, args.data()); } @@ -57,6 +62,10 @@ class Module { int callFuncPacked(std::string name, void** args) { return callFuncPackedRaw("_shim_"+name, args); } + + int callFuncPacked(std::string name, std::string& sofile, void** args) { + return callFuncPackedRaw("_shim_"+name, sofile,args); + } /// Call a function using the taco_tensor_t interface and return the result int callFuncPacked(std::string name, std::vector args) { @@ -73,6 +82,7 @@ class Module { std::string libname; std::string tmpdir; void* lib_handle; + void* so_lib_handle; std::vector funcs; // true iff the module was created from user-provided source diff --git a/include/taco/index_notation/transformations.h b/include/taco/index_notation/transformations.h index 6bf277d5c..4d6ec6830 100644 --- a/include/taco/index_notation/transformations.h +++ b/include/taco/index_notation/transformations.h @@ -223,7 +223,8 @@ IndexStmt parallelizeOuterLoop(IndexStmt stmt); */ IndexStmt reorderLoopsTopologically(IndexStmt stmt); -IndexStmt justTraverseThroughTheIndexStmt(IndexStmt stmt); +IndexStmt loopFusionOverFission(IndexStmt stmt, Assignment assignment, + std::string side, int iters); /** * Performs scalar promotion so that reductions are done by accumulating into diff --git a/include/taco/taco_tensor_t.h b/include/taco/taco_tensor_t.h index 20d78bb51..f27acd9c7 100644 --- a/include/taco/taco_tensor_t.h +++ b/include/taco/taco_tensor_t.h @@ -6,6 
+6,7 @@ #ifndef TACO_TENSOR_T_DEFINED #define TACO_TENSOR_T_DEFINED +#include #include typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; diff --git a/include/taco/tensor.h b/include/taco/tensor.h index b91782256..883718fb6 100644 --- a/include/taco/tensor.h +++ b/include/taco/tensor.h @@ -413,6 +413,8 @@ class TensorBase { /// Compile the tensor expression. void compile(); + void compute(std::ofstream& statfile); + void compute(std::ofstream& statfile, std::string& sofile); void compile(IndexStmt stmt, bool assembleWhileCompute=false); diff --git a/src/codegen/codegen_c.cpp b/src/codegen/codegen_c.cpp index 2ade9d7f6..83da7aaab 100644 --- a/src/codegen/codegen_c.cpp +++ b/src/codegen/codegen_c.cpp @@ -34,6 +34,7 @@ const string cHeaders = "#include \n" "#include \n" "#include \n" + "#include \n" "#if _OPENMP\n" "#include \n" "#endif\n" @@ -240,7 +241,10 @@ class CodeGen_C::FindVars : public IRVisitor { }; CodeGen_C::CodeGen_C(std::ostream &dest, OutputKind outputKind, bool simplify) - : CodeGen(dest, false, simplify, C), out(dest), outputKind(outputKind) {} + : CodeGen(dest, false, simplify, C), out(dest), out2(dest), outputKind(outputKind) {} + +CodeGen_C::CodeGen_C(std::ostream &dest, std::ostream &dest2, OutputKind outputKind, bool simplify) + : CodeGen(dest, dest2, false, simplify, C), out(dest), out2(dest2), outputKind(outputKind) {} CodeGen_C::~CodeGen_C() {} @@ -299,14 +303,18 @@ void CodeGen_C::visit(const Function* func) { // Print variable declarations out << printDecls(varFinder.varDecls, func->inputs, func->outputs) << endl; + // out << "printf(\"declarations added\\n\");" << std::endl; if (emittingCoroutine) { out << printContextDeclAndInit(varMap, localVars, numYields, func->name) << endl; } + // out << "printf(\"declarations added2\\n\");" << std::endl; // output body print(func->body); + // out << "printf(\"function body added " << count++ << "\\n\"); // " << std::endl; + // output repack only if we allocated memory if 
(checkForAlloc(func)) @@ -403,6 +411,9 @@ static string getAtomicPragma() { // Docs for vectorization pragmas: // http://clang.llvm.org/docs/LanguageExtensions.html#extensions-for-loop-hint-optimizations void CodeGen_C::visit(const For* op) { + + // out << " printf(\"adding for loop " << count++ << "\\n\"); //" << std::endl; + switch (op->kind) { case LoopKind::Vectorized: doIndent(); @@ -452,6 +463,14 @@ void CodeGen_C::visit(const For* op) { } stream << ") {\n"; + // out << " printf(\"loop " << count++ << " : %d , dim: %d, %d\\n\","; + // op->var.accept(this); + // out << ", "; + // op->start.accept(this); + // out << ", "; + // op->end.accept(this); + // out << "); // " << count++ << std::endl; + op->contents.accept(this); doIndent(); stream << "}"; @@ -472,6 +491,7 @@ void CodeGen_C::visit(const While* op) { } void CodeGen_C::visit(const GetProperty* op) { + // std::cout << "GetProperty* " << op << std::endl; taco_iassert(varMap.count(op) > 0) << "Property " << Expr(op) << " of " << op->tensor << " not found in varMap"; out << varMap[op]; diff --git a/src/codegen/codegen_c.h b/src/codegen/codegen_c.h index 55c9d01a8..c8505a3bb 100644 --- a/src/codegen/codegen_c.h +++ b/src/codegen/codegen_c.h @@ -16,6 +16,7 @@ class CodeGen_C : public CodeGen { /// Initialize a code generator that generates code to an /// output stream. 
CodeGen_C(std::ostream &dest, OutputKind outputKind, bool simplify=true); + CodeGen_C(std::ostream &dest, std::ostream &dest2, OutputKind outputKind, bool simplify=true); ~CodeGen_C(); /// Compile a lowered function @@ -28,23 +29,25 @@ class CodeGen_C : public CodeGen { protected: using IRPrinter::visit; - void visit(const Function*); - void visit(const VarDecl*); - void visit(const Yield*); - void visit(const Var*); - void visit(const For*); - void visit(const While*); - void visit(const GetProperty*); - void visit(const Min*); - void visit(const Max*); - void visit(const Allocate*); - void visit(const Sqrt*); - void visit(const Store*); - void visit(const Assign*); + virtual void visit(const Function*); + virtual void visit(const VarDecl*); + virtual void visit(const Yield*); + virtual void visit(const Var*); + virtual void visit(const For*); + virtual void visit(const While*); + virtual void visit(const GetProperty*); + virtual void visit(const Min*); + virtual void visit(const Max*); + virtual void visit(const Allocate*); + virtual void visit(const Sqrt*); + virtual void visit(const Store*); + virtual void visit(const Assign*); std::map varMap; std::vector localVars; std::ostream &out; + std::ostream &out2; + int count = 0; OutputKind outputKind; diff --git a/src/codegen/codegen_ispc.cpp b/src/codegen/codegen_ispc.cpp index d35af1748..d4f428ccf 100644 --- a/src/codegen/codegen_ispc.cpp +++ b/src/codegen/codegen_ispc.cpp @@ -418,10 +418,10 @@ class CodeGen_ISPC::FunctionCollector : public IRVisitor { CodeGen_ISPC::CodeGen_ISPC(std::ostream &dest, OutputKind outputKind, bool simplify) - : CodeGen(dest, false, simplify, C), out(dest), out2(dest), outputKind(outputKind) {} + : CodeGen_C(dest, dest, outputKind, simplify) {} CodeGen_ISPC::CodeGen_ISPC(std::ostream &dest, std::ostream &dest2, OutputKind outputKind, bool simplify) - : CodeGen(dest, dest2, false, simplify, C), out(dest), out2(dest2), outputKind(outputKind) {} + : CodeGen_C(dest, dest2, outputKind, 
simplify) {} CodeGen_ISPC::~CodeGen_ISPC() {} @@ -543,7 +543,7 @@ void CodeGen_ISPC::sendToStream(std::stringstream &stream) { this->out2 << stream.str(); } else { - this->out << stream.str(); + CodeGen_C::sendToStream(stream); } } @@ -709,17 +709,7 @@ void CodeGen_ISPC::visit(const VarDecl* op) { } } else { - if (emittingCoroutine) { - doIndent(); - op->var.accept(this); - parentPrecedence = Precedence::TOP; - stream << " = "; - op->rhs.accept(this); - stream << ";"; - stream << endl; - } else { - IRPrinter::visit(op); - } + CodeGen_C::visit(op); } // sendToStream(stream); @@ -744,15 +734,7 @@ void CodeGen_ISPC::visit(const Var* op) { } } else { - taco_iassert(varMap.count(op) > 0) << - "Var " << op->name << " not found in varMap"; - if (emittingCoroutine) { - // out << "TACO_DEREF("; - } - out << varMap[op]; - if (emittingCoroutine) { - // out << ")"; - } + CodeGen_C::visit(op); } } @@ -804,7 +786,7 @@ static string getAtomicPragma() { // http://clang.llvm.org/docs/LanguageExtensions.html#extensions-for-loop-hint-optimizations void CodeGen_ISPC::visit(const For* op) { if (!is_ISPC_code_stream_enabled()) { - CodeGen::visit(op); + CodeGen_C::visit(op); return; } doIndent(); @@ -934,7 +916,7 @@ void CodeGen_ISPC::visit(const While* op) { out << "\n"; } - IRPrinter::visit(op); + CodeGen_C::visit(op); } void CodeGen_ISPC::visit(const GetProperty* op) { @@ -982,10 +964,11 @@ void CodeGen_ISPC::visit(const Max* op) { } void CodeGen_ISPC::visit(const Allocate* op) { - string elementType = printCType(op->var.type(), false); - doIndent(); + if (is_ISPC_code_stream_enabled()) { + string elementType = printCType(op->var.type(), false); + doIndent(); op->var.accept(this); stream2 << " = "; @@ -1015,33 +998,7 @@ void CodeGen_ISPC::visit(const Allocate* op) { } else { - - op->var.accept(this); - stream << " = ("; - stream << elementType << "*"; - stream << ")"; - if (op->is_realloc) { - stream << "realloc("; - op->var.accept(this); - stream << ", "; - } - else { - // If the 
allocation was requested to clear the allocated memory, - // use calloc instead of malloc. - if (op->clear) { - stream << "calloc(1, "; - } else { - stream << "malloc("; - } - } - stream << "sizeof(" << elementType << ")"; - stream << " * "; - parentPrecedence = MUL; - op->num_elements.accept(this); - parentPrecedence = TOP; - stream << ");"; - stream << endl; - + CodeGen_C::visit(op); } @@ -1110,15 +1067,14 @@ void CodeGen_ISPC::visit(const Assign* op) { stream2 << ";"; stream2 << endl; + IRPrinter::visit(op); } else { - if (op->use_atomics) { - doIndent(); - stream << getAtomicPragma() << endl; - } + CodeGen_C::visit(op); + } - IRPrinter::visit(op); + } void CodeGen_ISPC::visit(const Store* op) { @@ -1137,43 +1093,5 @@ void CodeGen_ISPC::visit(const Store* op) { IRPrinter::visit(op); } -void CodeGen_ISPC::generateShim(const Stmt& func, stringstream &ret) { - const Function *funcPtr = func.as(); - - ret << "int _shim_" << funcPtr->name << "(void** parameterPack) {\n"; - ret << " return " << funcPtr->name << "("; - - size_t i=0; - string delimiter = ""; - - const auto returnType = funcPtr->getReturnType(); - if (returnType.second != Datatype()) { - ret << "(void**)(parameterPack[0]), "; - ret << "(char*)(parameterPack[1]), "; - ret << "(" << returnType.second << "*)(parameterPack[2]), "; - ret << "(int32_t*)(parameterPack[3])"; - - i = 4; - delimiter = ", "; - } - - for (auto output : funcPtr->outputs) { - auto var = output.as(); - auto cast_type = var->is_tensor ? "taco_tensor_t*" - : printCType(var->type, var->is_ptr); - - ret << delimiter << "(" << cast_type << ")(parameterPack[" << i++ << "])"; - delimiter = ", "; - } - for (auto input : funcPtr->inputs) { - auto var = input.as(); - auto cast_type = var->is_tensor ? 
"taco_tensor_t*" - : printCType(var->type, var->is_ptr); - ret << delimiter << "(" << cast_type << ")(parameterPack[" << i++ << "])"; - delimiter = ", "; - } - ret << ");\n"; - ret << "}\n"; -} } } diff --git a/src/codegen/codegen_ispc.h b/src/codegen/codegen_ispc.h index 2e440abc0..62d2897ca 100644 --- a/src/codegen/codegen_ispc.h +++ b/src/codegen/codegen_ispc.h @@ -12,7 +12,7 @@ namespace taco { namespace ir { -class CodeGen_ISPC : public CodeGen { +class CodeGen_ISPC : public CodeGen_C { public: /// Initialize a code generator that generates code to an /// output stream. @@ -28,7 +28,7 @@ class CodeGen_ISPC : public CodeGen { static void generateShim(const Stmt& func, std::stringstream &stream); protected: - using IRPrinter::visit; + using CodeGen_C::visit; void visit(const Function*); void visit(const VarDecl*); @@ -50,19 +50,10 @@ class CodeGen_ISPC : public CodeGen { void printISPCFunc(const Function *func, std::map varMap, std::vector &sortedProps); - std::map varMap; - std::vector localVars; bool taskCode = false; - std::ostream &out; - std::ostream &out2; - - OutputKind outputKind; - std::string funcName; std::stringstream funcVariables; std::vector sortedProps; - int labelCount; - bool emittingCoroutine; class FindVars; class FunctionCollector; diff --git a/src/codegen/module.cpp b/src/codegen/module.cpp index 82b736a13..6f631d40e 100644 --- a/src/codegen/module.cpp +++ b/src/codegen/module.cpp @@ -4,6 +4,7 @@ #include #include #include +// #include #if USE_OPENMP #include #endif @@ -178,7 +179,7 @@ string Module::compile() { writeShims(funcs, tmpdir, libname); for (auto &statement : funcs) { std::cout << "----- statement --------" << std::endl; - std::cout << statement; + // std::cout << statement; std::cout << std::endl; } std::cout << tmpdir << std::endl << libname << std::endl; @@ -233,10 +234,61 @@ string Module::getSource() { return source.str(); } +void* Module::getFuncPtr(std::string& sofile, std::string name) { + std::cout << "opening shared 
object 1\n"; + if (so_lib_handle) { + dlclose(so_lib_handle); + } + std::cout << "opening shared object 2\n"; + so_lib_handle = dlopen(sofile.data(), RTLD_NOW | RTLD_LOCAL); + std::cout << "opening shared object : " << sofile << std::endl; + return dlsym(so_lib_handle, name.data()); +} + void* Module::getFuncPtr(std::string name) { return dlsym(lib_handle, name.data()); } +int Module::callFuncPackedRaw(std::string name, std::string& sofile, void** args) { + typedef int (*fnptr_t)(void**); + static_assert(sizeof(void*) == sizeof(fnptr_t), + "Unable to cast dlsym() returned void pointer to function pointer"); + void* v_func_ptr = getFuncPtr(sofile, name); + fnptr_t func_ptr; + *reinterpret_cast(&func_ptr) = v_func_ptr; + +#if USE_OPENMP + omp_sched_t existingSched; + ParallelSchedule tacoSched; + int existingChunkSize, tacoChunkSize; + int existingNumThreads = omp_get_max_threads(); + omp_get_schedule(&existingSched, &existingChunkSize); + taco_get_parallel_schedule(&tacoSched, &tacoChunkSize); + switch (tacoSched) { + case ParallelSchedule::Static: + omp_set_schedule(omp_sched_static, tacoChunkSize); + break; + case ParallelSchedule::Dynamic: + omp_set_schedule(omp_sched_dynamic, tacoChunkSize); + break; + default: + break; + } + omp_set_num_threads(taco_get_num_threads()); +#endif + + std::cout << "calling the function\n"; + int ret = func_ptr(args); + std::cout << "function call completed\n"; + +#if USE_OPENMP + omp_set_schedule(existingSched, existingChunkSize); + omp_set_num_threads(existingNumThreads); +#endif + + return ret; +} + int Module::callFuncPackedRaw(std::string name, void** args) { typedef int (*fnptr_t)(void**); static_assert(sizeof(void*) == sizeof(fnptr_t), @@ -265,7 +317,13 @@ int Module::callFuncPackedRaw(std::string name, void** args) { omp_set_num_threads(taco_get_num_threads()); #endif + std::cout << "calling the function\n"; + // CALLGRIND_START_INSTRUMENTATION; + // CALLGRIND_TOGGLE_COLLECT; int ret = func_ptr(args); + // 
CALLGRIND_TOGGLE_COLLECT; + // CALLGRIND_STOP_INSTRUMENTATION; + std::cout << "function call completed\n"; #if USE_OPENMP omp_set_schedule(existingSched, existingChunkSize); diff --git a/src/index_notation/index_notation.cpp b/src/index_notation/index_notation.cpp index 51fb8770c..2e26460c7 100644 --- a/src/index_notation/index_notation.cpp +++ b/src/index_notation/index_notation.cpp @@ -2438,6 +2438,7 @@ bool isConcreteNotation(IndexStmt stmt, std::string* reason) { return isConcrete; } +// make reduction notation Assignment makeReductionNotation(Assignment assignment) { IndexExpr expr = assignment.getRhs(); std::vector free = assignment.getLhs().getIndexVars(); @@ -2513,7 +2514,10 @@ IndexStmt makeReductionNotation(IndexStmt stmt) { return makeReductionNotation(to(stmt)); } +// make concrete notation IndexStmt makeConcreteNotation(IndexStmt stmt) { + // std::cout << "concrete notation original assignment: " << stmt << std::endl; + std::string reason; taco_iassert(isReductionNotation(stmt, &reason)) << "Not reduction notation: " << stmt << std::endl << reason; @@ -2521,6 +2525,7 @@ IndexStmt makeConcreteNotation(IndexStmt stmt) { // Free variables and reductions covering the whole rhs become top level loops vector freeVars = to(stmt).getFreeVars(); + std::cout << "free vars: " << freeVars << std::endl; struct RemoveTopLevelReductions : IndexNotationRewriter { using IndexNotationRewriter::visit; @@ -2535,12 +2540,17 @@ IndexStmt makeConcreteNotation(IndexStmt stmt) { topLevelReductions.push_back(reduction.getVar()); rhs = reduction.getExpr(); } + // std::cout << "top level reductions: " << topLevelReductions << std::endl; if (rhs != node->rhs) { - stmt = Assignment(node->lhs, rhs, Add()); + stmt = Assignment(node->lhs, rhs, Add()); // write with add + int idx = 0; for (auto& i : util::reverse(topLevelReductions)) { + std::cout << idx << ": " << stmt << std::endl; + idx++; stmt = forall(i, stmt); } + std::cout << idx << ": " << stmt << std::endl; } else { stmt = 
node; @@ -2548,11 +2558,18 @@ IndexStmt makeConcreteNotation(IndexStmt stmt) { } }; stmt = RemoveTopLevelReductions().rewrite(stmt); + // std::cout << "after remove top level reductions: " << stmt << std::endl; + // now we form the stmt in reverse order of freeVars + int idx = 0; for (auto& i : util::reverse(freeVars)) { + std::cout << idx << ": " << stmt << std::endl; stmt = forall(i, stmt); + idx++; } + std::cout << idx << ": " << stmt << std::endl; + std::cout << "replacing reductions with whereas statements\n"; // Replace other reductions with where and forall statements struct ReplaceReductionsWithWheres : IndexNotationRewriter { using IndexNotationRewriter::visit; diff --git a/src/index_notation/transformations.cpp b/src/index_notation/transformations.cpp index 011779caf..c1d82a9fd 100644 --- a/src/index_notation/transformations.cpp +++ b/src/index_notation/transformations.cpp @@ -1,11 +1,16 @@ #include "taco/index_notation/transformations.h" +#include "lower/iteration_graph.h" +#include "lower/tensor_path.h" #include "taco/cuda.h" #include "taco/index_notation/index_notation.h" +#include "taco/index_notation/index_notation_nodes_abstract.h" #include "taco/index_notation/index_notation_rewriter.h" #include "taco/index_notation/index_notation_nodes.h" #include "taco/index_notation/index_notation_printer.h" #include "taco/error/error_messages.h" +#include "taco/index_notation/intrinsic.h" +#include "taco/type.h" #include "taco/util/collections.h" #include "taco/lower/iterator.h" #include "taco/lower/merge_lattice.h" @@ -307,6 +312,7 @@ IndexStmt Precompute::apply(IndexStmt stmt, std::string* reason) const { IndexExpr e = precompute.getExpr(); IndexVar iw = precompute.getiw(); + // these lines of code looks interesting when creating the producer consumer relationship IndexStmt consumer = forall(i, replace(s, {{e, ws(i)}})); IndexStmt producer = forall(iw, Assignment(ws(iw), replace(e, {{i,iw}}), assign.getOperator())); @@ -595,7 +601,7 @@ IndexStmt 
Parallelize::apply(IndexStmt stmt, std::string* reason) const { IndexStmt rewriteParallel(IndexStmt stmt) { std::cout << "1 rewriting IndexStmt to support parallelize schedule directive\n--------------------------------------------\n"; - std::cout << stmt << std::endl; + // std::cout << stmt << std::endl; provGraph = ProvenanceGraph(stmt); std::cout << "2 rewriting IndexStmt to support parallelize schedule directive\n--------------------------------------------\n"; @@ -618,7 +624,7 @@ IndexStmt Parallelize::apply(IndexStmt stmt, std::string* reason) const { } std::cout << "4 rewriting IndexStmt to support parallelize schedule directive\n--------------------------------------------\n"; - std::cout << stmt << std::endl; + // std::cout << stmt << std::endl; return rewrite(stmt); } @@ -1306,6 +1312,7 @@ static vector topologicallySort(map> hardDeps, map> softDeps, vector originalOrder) { + std::cout << "originalOrder: " << std::endl; vector sortedVars; unsigned long countVars = originalOrder.size(); while (sortedVars.size() < countVars) { @@ -1327,6 +1334,9 @@ topologicallySort(map> hardDeps, } // No free var found there is a cycle + std::cout << "this is where the assert fails\n"; + std::cout << "freeVarPos: " << freeVarPos << std::endl; + std::cout << "limit: " << std::numeric_limits::max() << std::endl; taco_iassert(freeVarPos != std::numeric_limits::max()) << "Cycles in iteration graphs must be resolved, through transpose, " << "before the expression is passed to the topological sorting " @@ -1352,19 +1362,668 @@ topologicallySort(map> hardDeps, return sortedVars; } -IndexStmt justTraverseThroughTheIndexStmt(IndexStmt stmt) { - struct IndexStatementTraverse : public IndexNotationPrinter { - IndexStatementTraverse(std::ostream& os) : IndexNotationPrinter(os) {}; - using IndexNotationPrinter::visit; +bool checkFromBack(const TensorPath& resultTensorPath, + const vector& tensorPaths, + string& removedAccessNode, + vector& producerVars, + vector& consumerVars, + 
vector& modifiedResultIndexesAccessed, + vector& sortedAllIndexes) { + + std::cout << "check from back function execution\n"; + + const std::vector& resultIndexesVisited = resultTensorPath.getVariables(); + IndexVar lastVisitedIndexVar = resultIndexesVisited.back(); + + std::cout << "last visited index variable: " << lastVisitedIndexVar << std::endl; + + bool onlyLastTensorContainLastIndexOfOutput = true; + bool fissionFromBack = false; + + // check from the back + for (unsigned long i=0; i& indexesVisited = otherIndexPaths.getVariables(); + cout << "index paths: " << otherIndexPaths << endl; + + // if (i < tensorPaths.size()-1) { + // check if other tensors also contain last index of output tensor + for (auto index : indexesVisited) { + cout << "checking " << index << " " << lastVisitedIndexVar << endl; + if (index == lastVisitedIndexVar) { + onlyLastTensorContainLastIndexOfOutput = false; + } + } + // } + } + + if (onlyLastTensorContainLastIndexOfOutput) { // last accessed tensorVariable + const TensorPath& otherIndexPaths = tensorPaths.back(); + const vector& indexesVisited = otherIndexPaths.getVariables(); + cout << "index paths: " << otherIndexPaths << endl; + + cout << "index variable maybe removed from the back\n"; + auto lastTensorLastVisited = indexesVisited.back(); + cout << "last index last visited " << lastTensorLastVisited << endl; + + if (lastTensorLastVisited == lastVisitedIndexVar) { + cout << "we can diffuse from the back\n"; + fissionFromBack = true; + removedAccessNode = otherIndexPaths.getAccess().getTensorVar().getName(); + cout << "removed access node " << removedAccessNode << endl; + + // mark producer accessed index variables + for (auto indexVar : sortedAllIndexes) { + if (indexVar != lastVisitedIndexVar) { // add everything except the last accessed index + std::cout << "producer vars: " << indexVar << std::endl; + producerVars.push_back(indexVar); + } + } + + for (auto indexVar : sortedAllIndexes) { + if (indexVar != lastVisitedIndexVar) { 
+ if ( + find(resultIndexesVisited.begin(), resultIndexesVisited.end(), indexVar) + != resultIndexesVisited.end() || + find(indexesVisited.begin(), indexesVisited.end(), indexVar) + != indexesVisited.end() + ) { + modifiedResultIndexesAccessed.push_back(indexVar); + } + } + } + + // // get modified index for the intermediate calculated tensor expression + // for (unsigned long j=0; j& tensorPaths, + string& removedAccessNode, + vector& producerVars, + vector& consumerVars, + vector& modifiedResultIndexesAccessed, + vector& sortedAllIndexes) { + + std::cout << "check from front function execution\n"; + + const std::vector& resultIndexesVisited = resultTensorPath.getVariables(); + IndexVar firstVisitedIndexVar = resultIndexesVisited.front(); + + std::cout << "first fisited index variable: " << firstVisitedIndexVar << std::endl; + std::cout << "tensor path size: " << tensorPaths.size() << std::endl; + + bool onlyFirstTensorContainFirstIndexOfOutput = true; + bool fissionFromFront = false; + + // check from the front + for (long i=tensorPaths.size()-1; i>0; i--) { // change tensor paths to recursively use the functionality + std::cout << "i: " << i << std::endl; + const TensorPath& otherIndexPaths = tensorPaths.at(i); + const vector& indexesVisited = otherIndexPaths.getVariables(); + cout << "index paths: " << otherIndexPaths << endl; + + if (i != 0) { // check if other tensors also contain last index of output tensor + for (auto index : indexesVisited) { + cout << "checking " << index << " " << firstVisitedIndexVar << endl; + if (index == firstVisitedIndexVar) { + onlyFirstTensorContainFirstIndexOfOutput = false; + } + } + } + } + + + if (onlyFirstTensorContainFirstIndexOfOutput) { // last accessed tensorVariable + const TensorPath& otherIndexPaths = tensorPaths.front(); + const vector& indexesVisited = otherIndexPaths.getVariables(); + cout << "index paths: " << otherIndexPaths << endl; + + cout << "index variable maybe removed from the front\n"; + auto 
firstTensorFirstVisited = indexesVisited.front(); + cout << "first index first visited " << firstTensorFirstVisited << endl; + + if (firstTensorFirstVisited == firstVisitedIndexVar) { + cout << "we can diffuse from the front\n"; + fissionFromFront = true; + removedAccessNode = otherIndexPaths.getAccess().getTensorVar().getName(); + cout << "removed access node " << removedAccessNode << endl; + + // mark producer accessed index variables + for (auto indexVar : sortedAllIndexes) { + if (indexVar != firstVisitedIndexVar) { // add everything except the first accessed index + producerVars.emplace_back(indexVar); + } + } + + for (auto indexVar : sortedAllIndexes) { + if (indexVar != firstVisitedIndexVar) { + if ( + find(resultIndexesVisited.begin(), resultIndexesVisited.end(), indexVar) + != resultIndexesVisited.end() || + find(indexesVisited.begin(), indexesVisited.end(), indexVar) + != indexesVisited.end() + ) { + modifiedResultIndexesAccessed.push_back(indexVar); + } + } + } + + std::cout << "printing modifiedResultIndexesAccessed\n"; + for (auto& idx : modifiedResultIndexesAccessed) { + std::cout << "modifiedResultIndexesAccessed: " << idx << std::endl; + } + std::cout << "printed modifiedResultIndexesAccessed\n"; + + // get modified index for the intermediate calculated tensor expression + // for (unsigned long j=0; j forallParallelUnit; map forallOutputRaceStrategy; + vector sortedIndexes; + Assignment innerBody; + + SortedIndexVars() {}; + + void visit(const ForallNode* node) { + Forall forallNode(node); + IndexVar i = forallNode.getIndexVar(); + std::cout << forallNode << std::endl; + + sortedIndexes.push_back(i); + forallParallelUnit[i] = forallNode.getParallelUnit(); + forallOutputRaceStrategy[i] = forallNode.getOutputRaceStrategy(); + + if (isa(forallNode.getStmt())) { + cout << "assignment node found: " << forallNode.getStmt() << endl;; + innerBody = to(forallNode.getStmt()); + return; // Only reorder first contiguous section of ForAlls + } + + 
IndexNotationVisitor::visit(node); + } }; std::cout << "traversing through the index statement\n"; - IndexNotationPrinter printer(std::cout); + SortedIndexVars sortedIndexVars; + stmt.accept(&sortedIndexVars); std::cout << std::endl; - stmt.accept(&printer); - return stmt; + + struct IndexExprBuilder : public IndexNotationVisitor { + + using IndexNotationVisitor::visit; + vector accessLeftToRight; + map>> indexDimensionsMap; + + void visit(const AccessNode* node) { + Access accessNode(node); + std::cout << "access node: " << accessNode << std::endl; + accessLeftToRight.push_back(accessNode); + + TensorVar tensorVar = accessNode.getTensorVar(); + + for (unsigned long i=0; i < accessNode.getIndexVars().size(); i++) { + auto var = accessNode.getIndexVars()[i]; + + if (indexDimensionsMap.find(var) != indexDimensionsMap.end()) { + indexDimensionsMap[var].emplace_back( + pair(tensorVar.getType().getShape().getDimension(i), + tensorVar.getType())); + } + else { + indexDimensionsMap[var] = { + pair( + tensorVar.getType().getShape().getDimension(i), + tensorVar.getType()) + }; + } + } + + } + + }; + + IndexExpr rhsExpr = assignment.getRhs(); + Access lhsAccess = to(assignment.getLhs()); + std::cout << "right hand side expression: " << rhsExpr << std::endl; + IndexExprBuilder indexExprBuilder; + rhsExpr.accept(&indexExprBuilder); + TensorVar resultVar = lhsAccess.getTensorVar(); + + for (auto item : indexExprBuilder.indexDimensionsMap) { + auto indexVar = item.first; + cout << "var: " << indexVar << " "; + for (auto elem : item.second) { + cout << elem.first << " " << elem.second << " " ; + } + cout << endl; + } + + + // now I have the iteration graph + IterationGraph iterationGraph = IterationGraph::make(assignment); + std::cout << "/*******************************************/\n"; + std::cout << "/********** ITERATION GRAPH ****************/\n"; + std::cout << "/*******************************************/\n"; + std::cout << iterationGraph << std::endl; + + const 
TensorPath& resultTensorPath = iterationGraph.getResultTensorPath(); + const std::vector& tensorPaths = iterationGraph.getTensorPaths(); + + + string removedAccessNode; + vector producerVars; // producer accessed index variables + vector consumerVars; // consumer accessed index variables + vector fusedVars; + vector modifiedResultIndexesAccessed; + bool fissionFromBack = false; + if (side == "b") { + fissionFromBack = true; + } + + if (fissionFromBack) { + fissionFromBack = checkFromBack(resultTensorPath, tensorPaths, + removedAccessNode, producerVars, consumerVars, + modifiedResultIndexesAccessed, sortedIndexVars.sortedIndexes + ); + } + + bool fissionFromFront = false; + if (side == "f") { + fissionFromFront = true; + } + if (fissionFromBack == false && fissionFromFront) { + fissionFromFront = checkFromFront(resultTensorPath, tensorPaths, + removedAccessNode, producerVars, consumerVars, + modifiedResultIndexesAccessed, sortedIndexVars.sortedIndexes + ); + } + + if (!fissionFromBack && !fissionFromFront) { + cout << "fission operation cannot be performed from the back\n"; + return stmt; + } + + vector newAccessDims{}; + for (auto var : modifiedResultIndexesAccessed) { + auto item = indexExprBuilder.indexDimensionsMap[var]; + cout << "shared vars: " << var << endl; + newAccessDims.emplace_back(item[0].first); + } + TensorVar newAccessVar(resultVar.getName() + "_inner", + Type(resultVar.getType().getDataType(), newAccessDims)); + cout << "new inner assignment statement: " << modifiedResultIndexesAccessed << std::endl; + Access newResultAccess(newAccessVar, modifiedResultIndexesAccessed); + cout << "new access variable for iterative apply: " << newResultAccess << std::endl; + + if (fissionFromBack) { + std::cout << "fission from the back is possible\n"; + } + if (fissionFromFront) { + std::cout << "fission from the front is possible\n"; + } + + // // check from the front + // struct IndexExprSeparator : public IndexNotationVisitor { + + // using 
IndexNotationVisitor::visit; + // vector accessLeftToRight; + + // void visit(const MulNode* node) { + // Mul mulNode(node); + // IndexExpr lhs = mulNode.getA(); + // IndexExpr rhs = mulNode.getB(); + // std::cout << "access node: " << accessNode << std::endl; + // accessLeftToRight.push_back(accessNode); + // } + + // }; + + + cout << "\n\nProducer accessed index variables\n"; + auto it = producerVars.begin(); + for (; it != producerVars.end(); it++) { + cout << *it << endl; + } + cout << "\n\nConsumer accessed index variables\n"; + it = consumerVars.begin(); + for (; it != consumerVars.end(); it++) { + cout << *it << endl; + } + cout << endl << endl; + + // check common vars that can be fused + for (auto var : sortedIndexVars.sortedIndexes) { + if (find(producerVars.begin(), producerVars.end(), var) != producerVars.end() && + find(consumerVars.begin(), consumerVars.end(), var) != consumerVars.end()) { + fusedVars.emplace_back(var); + } + else { + break; + } + } + + for (auto& fv : fusedVars) { + std::cout << "fusable vars: " << fv << std::endl; + } + + vector sharedVars; + for (auto var : sortedIndexVars.sortedIndexes) { + if (find(fusedVars.begin(), fusedVars.end(), var) == fusedVars.end() && + find(producerVars.begin(), producerVars.end(), var) != producerVars.end() && + find(consumerVars.begin(), consumerVars.end(), var) != consumerVars.end() + ) { + sharedVars.emplace_back(var); + } + } + + for (auto& sv : sharedVars) { + std::cout << "shared vars: " << sv << std::endl; + } + + vector sharedDims{}; + for (auto var : sharedVars) { + auto item = indexExprBuilder.indexDimensionsMap[var]; + cout << "shared vars: " << var << endl; + sharedDims.emplace_back(item[0].first); + } + + + // get removing tensorvars and workspace dimension + const Type& type = resultTensorPath.getAccess().getTensorVar().getType(); + const Format& format = resultTensorPath.getAccess().getTensorVar().getFormat(); + TensorVar intermediateTensor("ws", type, format); + cout << 
intermediateTensor << endl; + + // TensorVar A("A", Type(), taco::dense); + TensorVar tempVar("t" + resultVar.getName(), + Type(resultVar.getType().getDataType(), sharedDims)); + cout << "tensor order: " << tempVar.getOrder() << endl; + cout << "tensor format: " << tempVar.getFormat() << endl; + cout << "format order: " << tempVar.getFormat().getOrder() << endl; + + // TensorVar* a = new TensorVar("A", Type()); + // TensorVar ws("ws", Type(type(), {jdim}) ); + + // get removing indexExpr and the rest of the indexExpr + Access workspace(tempVar, sharedVars); + std::cout << "workspace access tensor: " << workspace << std::endl; + + + + // construct producer expression right hand side + cout << "generating consumer expression\n"; + IndexExpr producerExpr; + int num_muls = 0; + for (Access accessNode : indexExprBuilder.accessLeftToRight) { + std::cout << "accessNodes: " << accessNode << endl; + if (removedAccessNode != accessNode.getTensorVar().getName()) { + if (producerExpr == NULL) { + std::cout << "index expression is null"; + producerExpr = accessNode; + std::cout << "producerExpr: " << producerExpr << std::endl; + } else { + num_muls++; + producerExpr = producerExpr * accessNode; + std::cout << "producerExpr: " << producerExpr << std::endl; + } + } + } + std::cout << producerExpr << std::endl; + Assignment producerAssignment(newResultAccess, + producerExpr); + std::cout << "new inner assignment statement: " << producerAssignment << std::endl; + Assignment producerInnerBody(workspace, + producerExpr, + sortedIndexVars.innerBody.getOperator() + ); + std::cout << "producerInnerBody: " << producerInnerBody << std::endl; + + // construct consumer expression right hand side + IndexExpr consumerExpr; + if (fissionFromBack) { + consumerExpr = workspace; + } + cout << "generating consumer expression: " << consumerExpr << std::endl; + for (Access accessNode : indexExprBuilder.accessLeftToRight) { + TensorVar tv = accessNode.getTensorVar(); + std::cout << "accessNodes: " << 
accessNode << endl; + if (removedAccessNode == accessNode.getTensorVar().getName()) { + if (consumerExpr == NULL) { + std::cout << "index expression is null"; + consumerExpr = accessNode; + std::cout << "consumerExpr: " << consumerExpr << std::endl; + } else { + consumerExpr = consumerExpr * accessNode; + std::cout << "consumerExpr: " << consumerExpr << std::endl; + } + } + } + if (fissionFromFront) { + consumerExpr = consumerExpr * workspace; + } + Assignment consumerInnerBody(lhsAccess, + consumerExpr, + sortedIndexVars.innerBody.getOperator() + ); + + cout << "Producer inner body: " << producerInnerBody << endl; + cout << "Consumer inner body: " << consumerInnerBody << endl; + + // rewrite indexstmt + // Reorder Foralls use a rewriter in case new nodes introduced outside of Forall + struct ProducerConsumerRewriter : public IndexNotationRewriter { + using IndexNotationRewriter::visit; + + const vector& producerConsumerVars; + const vector& fusedVars; + IndexStmt innerBody; + const map forallParallelUnit; + const map forallOutputRaceStrategy; + + ProducerConsumerRewriter(const vector& producerConsumerVars, + const vector& fusedVars, IndexStmt innerBody, + const map forallParallelUnit, + const map forallOutputRaceStrategy) + : producerConsumerVars(producerConsumerVars), fusedVars(fusedVars), innerBody(innerBody), + forallParallelUnit(forallParallelUnit), forallOutputRaceStrategy(forallOutputRaceStrategy) { + } + + void visit(const ForallNode* node) { + Forall foralli(node); + IndexVar i = foralli.getIndexVar(); + cout << "going through var: " << i << endl; + + // first forall must be in collected variables + // taco_iassert(util::contains(producerVars, i)); + // std::cout << "\ninner body of the statement\n" << innerBody; + // // done in reverse order? 
+ // for (auto it = sortedVars.rbegin(); it != sortedVars.rend(); ++it) { + // stmt = forall(*it, stmt, forallParallelUnit.at(*it), forallOutputRaceStrategy.at(*it), foralli.getUnrollFactor()); + // } + stmt = rewrite(foralli.getStmt()); + cout << "after rewrite statement: " << stmt << endl; + + // omit the index variables in the fusedVar list + if (find(fusedVars.begin(), fusedVars.end(), i) == fusedVars.end() && + find(producerConsumerVars.begin(), producerConsumerVars.end(), i) != producerConsumerVars.end()) { + stmt = forall(i, stmt, forallParallelUnit.at(i), forallOutputRaceStrategy.at(i), foralli.getUnrollFactor()); + } + } + + void visit (const AssignmentNode* node) { + cout << "assignment node: " << node << endl; + stmt = innerBody; + cout << "producerStmt: " << innerBody << endl; + cout << "stmt: " << stmt << endl; + } + + }; + ProducerConsumerRewriter producerRewriter(producerVars, fusedVars, + producerInnerBody, + sortedIndexVars.forallParallelUnit, + sortedIndexVars.forallOutputRaceStrategy); + IndexStmt producerStmt = producerRewriter.rewrite(stmt); + std::cout << "\nAfter Producer rewriter\n"; + std::cout << producerStmt << std::endl; + if (num_muls > 1) { + producerStmt = loopFusionOverFission(producerStmt, producerInnerBody, + side, iters-1); + } + + + ProducerConsumerRewriter consumerRewriter(consumerVars, fusedVars, + consumerInnerBody, + sortedIndexVars.forallParallelUnit, + sortedIndexVars.forallOutputRaceStrategy); + IndexStmt consumerStmt = consumerRewriter.rewrite(stmt); + std::cout << "\nAfter Consumer rewriter\n"; + std::cout << consumerStmt << std::endl; + + + struct CombineProducerConsumerRewriter : public IndexNotationRewriter { + + const vector& fusedVars; + IndexStmt consumerStmt; + IndexStmt producerStmt; + const map forallParallelUnit; + const map forallOutputRaceStrategy; + + CombineProducerConsumerRewriter(const vector& fusedVars, + IndexStmt producerStmt, IndexStmt consumerStmt, + const map forallParallelUnit, + const map 
forallOutputRaceStrategy) + : fusedVars(fusedVars), consumerStmt(consumerStmt), producerStmt(producerStmt), + forallParallelUnit(forallParallelUnit), + forallOutputRaceStrategy(forallOutputRaceStrategy) {} + + using IndexNotationRewriter::visit; + + void visit(const ForallNode* node) { + Forall foralli(node); + IndexVar i = foralli.getIndexVar(); + cout << "going through var: " << i << endl; + + // omit the index variables in the fusedVar list + if (find(fusedVars.begin(), fusedVars.end(), i) != fusedVars.end()) { + cout << "fused var in stmt\n"; + stmt = rewrite(foralli.getStmt()); + cout << "rewritten stmt: " << stmt << endl; + stmt = forall(i, stmt, forallParallelUnit.at(i), forallOutputRaceStrategy.at(i), foralli.getUnrollFactor()); + } + else { + cout << "fused var not in stmt\n"; + cout << "producerStmt: " << producerStmt << endl; + cout << "consumerStmt: " << consumerStmt << endl; + stmt = where(consumerStmt, producerStmt); + cout << "where stmt: " << stmt << endl; + } + + cout << "after rewrite statement: " << stmt << endl; + } + + }; + + CombineProducerConsumerRewriter combineRewriter(fusedVars, + producerStmt, consumerStmt, + sortedIndexVars.forallParallelUnit, + sortedIndexVars.forallOutputRaceStrategy); + IndexStmt combinedStmt = combineRewriter.rewrite(stmt); + std::cout << "\nAfter Combine rewriter\n"; + std::cout << combinedStmt << std::endl; + + + return combinedStmt; } @@ -1431,6 +2090,7 @@ IndexStmt reorderLoopsTopologically(IndexStmt stmt) { }; Iterators iterators(stmt); + std::cout << "DAG builder with iterators" << std::endl; DAGBuilder dagBuilder(iterators); stmt.accept(&dagBuilder); std::cout << "After DAGBuilder\n"; @@ -1442,6 +2102,7 @@ IndexStmt reorderLoopsTopologically(IndexStmt stmt) { tensorVarOrders[tensorLevelVar.first] = varOrderFromTensorLevels(tensorLevelVar.second); } + // hard dependencies const auto hardDeps = depsFromVarOrders(tensorVarOrders); struct CollectSoftDependencies : public IndexNotationVisitor { @@ -1463,14 +2124,17 
@@ IndexStmt reorderLoopsTopologically(IndexStmt stmt) { } } }; + // soft dependencies CollectSoftDependencies collectSoftDeps; stmt.accept(&collectSoftDeps); std::cout << "After CollectSoftDependencies\n"; std::cout << stmt << std::endl; + // topological sort const auto sortedVars = topologicallySort(hardDeps, collectSoftDeps.softDeps, dagBuilder.indexVarOriginalOrder); + // rewrite indexstmt // Reorder Foralls use a rewriter in case new nodes introduced outside of Forall struct TopoReorderRewriter : public IndexNotationRewriter { using IndexNotationRewriter::visit; @@ -1493,7 +2157,9 @@ IndexStmt reorderLoopsTopologically(IndexStmt stmt) { // first forall must be in collected variables taco_iassert(util::contains(sortedVars, i)); + std::cout << "\ninner body of the statement\n" << innerBody; stmt = innerBody; + // done in reverse order? for (auto it = sortedVars.rbegin(); it != sortedVars.rend(); ++it) { stmt = forall(*it, stmt, forallParallelUnit.at(*it), forallOutputRaceStrategy.at(*it), foralli.getUnrollFactor()); } diff --git a/src/ir/ir_printer.cpp b/src/ir/ir_printer.cpp index fa224bde4..eddca3f29 100644 --- a/src/ir/ir_printer.cpp +++ b/src/ir/ir_printer.cpp @@ -550,7 +550,7 @@ void IRPrinter::visit(const Store* op) { } void IRPrinter::visit(const For* op) { - std::cout << "This is IRPrinter::visit For op method\n"; + // std::cout << "This is IRPrinter::visit For op method\n"; if (is_ISPC_code_stream_enabled()) { doIndent(); stream2 << keywordString("for") << " (" diff --git a/src/ir/ir_rewriter.cpp b/src/ir/ir_rewriter.cpp index fdadf530e..2e4827497 100644 --- a/src/ir/ir_rewriter.cpp +++ b/src/ir/ir_rewriter.cpp @@ -292,7 +292,7 @@ void IRRewriter::visit(const Store* op) { } void IRRewriter::visit(const For* op) { - std::cout << "This is IRRewriter::visit(const For* op) method: For: " << op << std::endl; + // std::cout << "This is IRRewriter::visit(const For* op) method: For: " << op << std::endl; Expr var = rewrite(op->var); Expr start = 
rewrite(op->start); Expr end = rewrite(op->end); diff --git a/src/lower/iteration_graph.cpp b/src/lower/iteration_graph.cpp index 77735a8d2..482d84aae 100644 --- a/src/lower/iteration_graph.cpp +++ b/src/lower/iteration_graph.cpp @@ -48,6 +48,8 @@ struct IterationGraph::Content { IterationGraph::IterationGraph() { } +// remember that iteration graph does not have an ordering +// I got the ordering from topologically reorder index Ryan wrote IterationGraph IterationGraph::make(Assignment assignment) { TensorVar tensor = assignment.getLhs().getTensorVar(); IndexExpr expr = assignment.getRhs(); @@ -64,8 +66,16 @@ IterationGraph IterationGraph::make(Assignment assignment) { oldToSplitVar.insert({indexVar, indexVar}); } + // access nodes of right hand side match(expr, function([&](const AccessNode* op) { + std::cout << "access node: " << op->tensorVar << " <- " << IndexExpr(op) << std::endl; + std::cout << "index var: "; + for (auto indexVar : op->indexVars) { + std::cout << indexVar << " "; + } + std::cout << std::endl; + auto type = op->tensorVar.getType(); taco_iassert((size_t)type.getShape().getOrder() == op->indexVars.size()) << "Tensor access " << IndexExpr(op) << " but tensor format only has " diff --git a/src/lower/iterator.cpp b/src/lower/iterator.cpp index 0f0c024c5..eb3d8ac3b 100644 --- a/src/lower/iterator.cpp +++ b/src/lower/iterator.cpp @@ -569,6 +569,9 @@ void Iterators::createAccessIterators(Access access, Format format, Expr tensorI ProvenanceGraph provGraph, const map &tensorVars) { TensorVar tensorConcrete = access.getTensorVar(); + cout << "tensor: " << tensorConcrete << " " ; + cout << "tensorConcrete order: " << tensorConcrete.getOrder(); + cout << ", format order: " << format.getOrder() << endl; taco_iassert(tensorConcrete.getOrder() == format.getOrder()) << tensorConcrete << ", Format" << format; Shape shape = tensorConcrete.getType().getShape(); diff --git a/src/lower/lowerer_impl_imperative.cpp b/src/lower/lowerer_impl_imperative.cpp index 
28bd6c7c2..1355c80a1 100644 --- a/src/lower/lowerer_impl_imperative.cpp +++ b/src/lower/lowerer_impl_imperative.cpp @@ -28,7 +28,7 @@ class LowererImplImperative::Visitor : public IndexNotationVisitorStrict { public: Visitor(LowererImplImperative* impl) : impl(impl) {} Stmt lower(IndexStmt stmt) { - std::cout << "lowering IndexStmt to ir:Stmt - IndexStmt: " << stmt << std::endl; + // std::cout << "lowering IndexStmt to ir:Stmt - IndexStmt: " << stmt << std::endl; this->stmt = Stmt(); impl->accessibleIterators.scope(); IndexStmtVisitorStrict::visit(stmt); @@ -138,6 +138,7 @@ static bool returnsTrue(IndexExpr expr) { } void visit(const CastNode* op) { + std::cout << "visiting cast node\n"; expr = rewrite(op->a); } @@ -418,7 +419,7 @@ LowererImplImperative::lower(IndexStmt stmt, string name, Stmt LowererImplImperative::lowerAssignment(Assignment assignment) { - std::cout << "\n\n converting assignment IndexStmt============================================ Assignment\n"; + // std::cout << "\n\n converting assignment IndexStmt============================================ Assignment\n"; taco_iassert(generateAssembleCode() || generateComputeCode()); Stmt computeStmt; @@ -426,7 +427,7 @@ Stmt LowererImplImperative::lowerAssignment(Assignment assignment) Expr var = getTensorVar(result); const bool needComputeAssign = util::contains(needCompute, result); - std::cout << "does assignment need compute assign: " << needComputeAssign << std::endl; + // std::cout << "does assignment need compute assign: " << needComputeAssign << std::endl; Expr rhs; if (needComputeAssign) { rhs = lower(assignment.getRhs()); @@ -434,26 +435,26 @@ Stmt LowererImplImperative::lowerAssignment(Assignment assignment) // Assignment to scalar variables. 
if (isScalar(result.getType())) { - std::cout << "assignment to scalar variables\n"; + // std::cout << "assignment to scalar variables\n"; if (needComputeAssign) { - std::cout << "compute assign\n"; + // std::cout << "compute assign\n"; if (!assignment.getOperator().defined()) { - std::cout << "assignment operator is not defined\n"; - std::cout << "var: " << var << ", rhs, : " << rhs << std::endl; + // std::cout << "assignment operator is not defined\n"; + // std::cout << "var: " << var << ", rhs, : " << rhs << std::endl; computeStmt = Assign::make(var, rhs); } else { taco_iassert(isa(assignment.getOperator())); - std::cout << "assignment depth -- loopDepth: " << loopDepth << std::endl; - std::cout << "is markAssignsAtomicDepth > 0: " << (markAssignsAtomicDepth > 0) << std::endl; - for (auto &tensors_ : whereTemps) { - std::cout << tensors_ << ", "; - } - std::cout << std::endl; - std::cout << result << std::endl; + // std::cout << "assignment depth -- loopDepth: " << loopDepth << std::endl; + // std::cout << "is markAssignsAtomicDepth > 0: " << (markAssignsAtomicDepth > 0) << std::endl; + // for (auto &tensors_ : whereTemps) { + // // std::cout << tensors_ << ", "; + // } + // std::cout << std::endl; + // std::cout << result << std::endl; int tempVarInitLoopDepth = whereTempsWithLoopDepth.find(result)->second; - std::cout << "tempInitLoopDepth: " << tempVarInitLoopDepth << std::endl; + // std::cout << "tempInitLoopDepth: " << tempVarInitLoopDepth << std::endl; bool reduction = false; std::map::iterator itr; @@ -461,24 +462,24 @@ Stmt LowererImplImperative::lowerAssignment(Assignment assignment) if (itr->first<=loopDepth && itr->first>tempVarInitLoopDepth && itr->second == ParallelUnit::CPUSimd) { reduction = true; } - std::cout << itr->first << "\t" << ParallelUnit_NAMES[(int) itr->second] << std::endl; + // std::cout << itr->first << "\t" << ParallelUnit_NAMES[(int) itr->second] << std::endl; } // less than or equal to loopDepth but greater than temp variable 
initialized loop depth bool useAtomics = markAssignsAtomicDepth > 0 && (!util::contains(whereTemps, result) || reduction); - std::cout << "whereTemps and result: " << !util::contains(whereTemps, result) << std::endl; - std::cout << "assignment to scalar variables useAtomics: " << useAtomics << std::endl; + // std::cout << "whereTemps and result: " << !util::contains(whereTemps, result) << std::endl; + // std::cout << "assignment to scalar variables useAtomics: " << useAtomics << std::endl; computeStmt = compoundAssign(var, rhs, useAtomics, atomicParallelUnit); - std::cout << "computeStatment: " << computeStmt << std::endl; + // std::cout << "computeStatment: " << computeStmt << std::endl; } } else { - std::cout << "not compute assign\n"; + // std::cout << "not compute assign\n"; } } // Assignments to tensor variables (non-scalar). else { - std::cout << "assignment to tensor variables\n"; + // std::cout << "assignment to tensor variables\n"; Expr values = getValuesArray(result); Expr loc = generateValueLocExpr(assignment.getLhs()); @@ -512,7 +513,7 @@ Stmt LowererImplImperative::lowerAssignment(Assignment assignment) } if (needComputeAssign && values.defined()) { - std::cout << "assign compute statement\n"; + // std::cout << "assign compute statement\n"; if (!assignment.getOperator().defined()) { computeStmt = Store::make(values, loc, rhs); } @@ -627,34 +628,35 @@ LowererImplImperative::splitAppenderAndInserters(const vector& results /* * This is the for loop lowering part */ + Stmt LowererImplImperative::lowerForall(Forall forall) { loopDepth++; forUnits.insert(std::pair(loopDepth,forall.getParallelUnit())); - std::cout << "doing lowerForall: " << forall << std::endl; + // std::cout << "doing lowerForall: " << forall << std::endl; bool hasExactBound = provGraph.hasExactBound(forall.getIndexVar()); bool forallNeedsUnderivedGuards = !hasExactBound && emitUnderivedGuards; - std::cout << "printing temporary variables with their atomic depths\n"; + // std::cout << 
"printing temporary variables with their atomic depths\n"; map::iterator itr; for (itr = whereTempsWithLoopDepth.begin(); itr != whereTempsWithLoopDepth.end(); ++itr) { - std::cout << itr->first << "\t" << itr->second << "\n"; + // std::cout << itr->first << "\t" << itr->second << "\n"; } if (!ignoreVectorize && forallNeedsUnderivedGuards && (forall.getParallelUnit() == ParallelUnit::CPUVector || forall.getUnrollFactor() > 0)) { - std::cout << "calling lowerForallCloned(forall)\n"; + // std::cout << "calling lowerForallCloned(forall)\n"; return lowerForallCloned(forall); } - std::cout << "inParallelLoopDepth: " << inParallelLoopDepth << "========================\n"; + // std::cout << "inParallelLoopDepth: " << inParallelLoopDepth << "========================\n"; if (forall.getParallelUnit() != ParallelUnit::NotParallel) { inParallelLoopDepth++; } - std::cout << "inParallelLoopDepth: " << inParallelLoopDepth << "========================\n"; + // std::cout << "inParallelLoopDepth: " << inParallelLoopDepth << "========================\n"; // Recover any available parents that were not recoverable previously vector recoverySteps; @@ -842,23 +844,23 @@ Stmt LowererImplImperative::lowerForall(Forall forall) } if (!isWhereProducer && hasPosDescendant && underivedAncestors.size() > 1 && provGraph.isPosVariable(iterator.getIndexVar()) && posDescendant == forall.getIndexVar()) { - std::cout << "calling lowerForallFusedPosition(forall\n"; + // std::cout << "calling lowerForallFusedPosition(forall\n"; loops = lowerForallFusedPosition(forall, iterator, locators, inserters, appenders, reducedAccesses, recoveryStmt); } else if (canAccelWithSparseIteration) { - std::cout << "calling lowerForallDenseAcceleration(forall\n"; + // std::cout << "calling lowerForallDenseAcceleration(forall\n"; loops = lowerForallDenseAcceleration(forall, locators, inserters, appenders, reducedAccesses, recoveryStmt); } // Emit dimension coordinate iteration loop else if (iterator.isDimensionIterator()) 
{ - std::cout << "calling lowerForallDimension(forall\n"; + // std::cout << "calling lowerForallDimension(forall\n"; loops = lowerForallDimension(forall, point.locators(), inserters, appenders, reducedAccesses, recoveryStmt); } // Emit position iteration loop else if (iterator.hasPosIter()) { - std::cout << "calling lowerForallPosition(forall\n"; + // std::cout << "calling lowerForallPosition(forall\n"; loops = lowerForallPosition(forall, iterator, locators, inserters, appenders, reducedAccesses, recoveryStmt); } @@ -877,9 +879,9 @@ Stmt LowererImplImperative::lowerForall(Forall forall) forall.getStmt(), reducedAccesses); } - std::cout << "printing loops ----------------------------------------------------------------------------------------------\n"; - std::cout << loops << std::endl; - std::cout << "loops printed -----------------------------------------------------------------------------------------------\n"; + // std::cout << "printing loops ----------------------------------------------------------------------------------------------\n"; + // std::cout << loops << std::endl; + // std::cout << "loops printed -----------------------------------------------------------------------------------------------\n"; // taco_iassert(loops.defined()); if (!generateComputeCode() && !hasStores(loops)) { @@ -1203,22 +1205,22 @@ Stmt LowererImplImperative::lowerForallDimension(Forall forall, set reducedAccesses, ir::Stmt recoveryStmt) { - std::cout << "1 Stmt LowererImplImperative::lowerForallDimension\n"; - std::cout << "1 Stmt LowererImplImperative::lowerForallDimension markAssignsAtomicDepth: " << markAssignsAtomicDepth << std::endl; + // std::cout << "1 Stmt LowererImplImperative::lowerForallDimension\n"; + // std::cout << "1 Stmt LowererImplImperative::lowerForallDimension markAssignsAtomicDepth: " << markAssignsAtomicDepth << std::endl; Expr coordinate = getCoordinateVar(forall.getIndexVar()); if (forall.getParallelUnit() != ParallelUnit::NotParallel && 
forall.getOutputRaceStrategy() == OutputRaceStrategy::Atomics) { markAssignsAtomicDepth++; - std::cout << "1 Stmt LowererImplImperative::lowerForallDimension getParallelUnit() is Not NotParallel and outputRaceStrategy is Atomics\n"; - std::cout << "markAssignsAtomicDepth: " << markAssignsAtomicDepth << std::endl; + // std::cout << "1 Stmt LowererImplImperative::lowerForallDimension getParallelUnit() is Not NotParallel and outputRaceStrategy is Atomics\n"; + // std::cout << "markAssignsAtomicDepth: " << markAssignsAtomicDepth << std::endl; atomicParallelUnit = forall.getParallelUnit(); } else { - std::cout << "1 Stmt LowererImplImperative::lowerForallDimension getParallelUnit() is NotParallel or outputRaceStrategy is not Atomics\n"; + // std::cout << "1 Stmt LowererImplImperative::lowerForallDimension getParallelUnit() is NotParallel or outputRaceStrategy is not Atomics\n"; } - std::cout << "original forall : " << forall << std::endl; - std::cout << "inside IndexStmt: " << forall.getStmt() << std::endl; + // std::cout << "original forall : " << forall << std::endl; + // std::cout << "inside IndexStmt: " << forall.getStmt() << std::endl; Stmt body = lowerForallBody(coordinate, forall.getStmt(), locators, inserters, appenders, reducedAccesses); @@ -1235,7 +1237,7 @@ Stmt LowererImplImperative::lowerForallDimension(Forall forall, LoopKind kind = LoopKind::Serial; if (should_use_ISPC_codegen()) { - std::cout << "Foreach compatible loop\n"; + // std::cout << "Foreach compatible loop\n"; if (forall.getParallelUnit() == ParallelUnit::CPUSimd) { kind = LoopKind::Foreach; } @@ -1253,7 +1255,7 @@ Stmt LowererImplImperative::lowerForallDimension(Forall forall, kind = LoopKind::Runtime; } - std::cout << "2 Stmt LowererImplImperative::lowerForallDimension\n"; + // std::cout << "2 Stmt LowererImplImperative::lowerForallDimension\n"; return Block::blanks(For::make(coordinate, bounds[0], bounds[1], 1, body, kind, ignoreVectorize ? 
ParallelUnit::NotParallel : forall.getParallelUnit(), ignoreVectorize ? 0 : forall.getUnrollFactor()), @@ -1267,7 +1269,7 @@ Stmt LowererImplImperative::lowerForallDimension(Forall forall, set reducedAccesses, ir::Stmt recoveryStmt) { - std::cout << "1 Stmt LowererImplImperative::lowerForallDenseAcceleration\n"; + // std::cout << "1 Stmt LowererImplImperative::lowerForallDenseAcceleration\n"; taco_iassert(locators.size() == 1) << "Optimizing a dense workspace is only supported when the consumer is the only RHS tensor"; taco_iassert(provGraph.isFullyDerived(forall.getIndexVar())) << "Sparsely accelerating a dense workspace only works with fully derived index vars"; taco_iassert(forall.getParallelUnit() == ParallelUnit::NotParallel) << "Sparsely accelerating a dense workspace only works within serial loops"; @@ -1293,8 +1295,8 @@ Stmt LowererImplImperative::lowerForallDimension(Forall forall, } Stmt declareVar = VarDecl::make(coordinate, Load::make(indexList, loopVar)); - std::cout << "original forall : " << forall << std::endl; - std::cout << "inside IndexStmt: " << forall.getStmt() << std::endl; + // std::cout << "original forall : " << forall << std::endl; + // std::cout << "inside IndexStmt: " << forall.getStmt() << std::endl; Stmt body = lowerForallBody(coordinate, forall.getStmt(), locators, inserters, appenders, reducedAccesses); Stmt resetGuard = ir::Store::make(bitGuard, coordinate, ir::Literal::make(false), markAssignsAtomicDepth > 0, atomicParallelUnit); @@ -1320,7 +1322,7 @@ Stmt LowererImplImperative::lowerForallDimension(Forall forall, kind = LoopKind::Runtime; } - std::cout << "2 Stmt LowererImplImperative::lowerForallDenseAcceleration\n"; + // std::cout << "2 Stmt LowererImplImperative::lowerForallDenseAcceleration\n"; return Block::blanks(For::make(loopVar, 0, indexListSize, 1, body, kind, ignoreVectorize ? ParallelUnit::NotParallel : forall.getParallelUnit(), ignoreVectorize ? 
0 : forall.getUnrollFactor()), @@ -1344,7 +1346,7 @@ Stmt LowererImplImperative::lowerForallPosition(Forall forall, Iterator iterator set reducedAccesses, ir::Stmt recoveryStmt) { - std::cout << "1 Stmt LowererImplImperative::lowerForallPosition\n" << std::endl; + // std::cout << "1 Stmt LowererImplImperative::lowerForallPosition\n" << std::endl; Expr coordinate = getCoordinateVar(forall.getIndexVar()); Stmt declareCoordinate = Stmt(); @@ -1380,8 +1382,8 @@ Stmt LowererImplImperative::lowerForallPosition(Forall forall, Iterator iterator // see we are inside a forall. ex: forall(i, forall(j, y(i) += A(i,j) * x(j))) // when you call forall.getStmt it returns forall(j, y(i) += A(i,j) * x(j)) which is the // IndexStmt inside the forall IndexStmt - std::cout << "original forall : " << forall << std::endl; - std::cout << "inside IndexStmt: " << forall.getStmt() << std::endl; + // std::cout << "original forall : " << forall << std::endl; + // std::cout << "inside IndexStmt: " << forall.getStmt() << std::endl; Stmt body = lowerForallBody(coordinate, forall.getStmt(), locators, inserters, appenders, reducedAccesses); @@ -1443,7 +1445,7 @@ Stmt LowererImplImperative::lowerForallPosition(Forall forall, Iterator iterator kind = LoopKind::Runtime; } - std::cout << "2 Stmt LowererImplImperative::lowerForallPosition\n" << std::endl; + // std::cout << "2 Stmt LowererImplImperative::lowerForallPosition\n" << std::endl; // Loop with preamble and postamble return Block::blanks( boundsCompute, @@ -1462,7 +1464,7 @@ Stmt LowererImplImperative::lowerForallFusedPosition(Forall forall, Iterator ite set reducedAccesses, ir::Stmt recoveryStmt) { - std::cout << "1 Stmt LowererImplImperative::lowerForallFusedPosition\n" << std::endl; + // std::cout << "1 Stmt LowererImplImperative::lowerForallFusedPosition\n" << std::endl; Expr coordinate = getCoordinateVar(forall.getIndexVar()); Stmt declareCoordinate = Stmt(); if (provGraph.isCoordVariable(forall.getIndexVar())) { @@ -1553,8 +1555,8 @@ Stmt 
LowererImplImperative::lowerForallFusedPosition(Forall forall, Iterator ite markAssignsAtomicDepth++; } - std::cout << "original forall : " << forall << std::endl; - std::cout << "inside IndexStmt: " << forall.getStmt() << std::endl; + // std::cout << "original forall : " << forall << std::endl; + // std::cout << "inside IndexStmt: " << forall.getStmt() << std::endl; Stmt body = lowerForallBody(coordinate, forall.getStmt(), locators, inserters, appenders, reducedAccesses); @@ -1612,7 +1614,7 @@ Stmt LowererImplImperative::lowerForallFusedPosition(Forall forall, Iterator ite kind = LoopKind::Runtime; } - std::cout << "2 Stmt LowererImplImperative::lowerForallFusedPosition\n" << std::endl; + // std::cout << "2 Stmt LowererImplImperative::lowerForallFusedPosition\n" << std::endl; // Loop with preamble and postamble return Block::blanks(boundsCompute, Block::make(Block::make(searchForUnderivedStart), @@ -1713,6 +1715,7 @@ Stmt LowererImplImperative::lowerMergePoint(MergeLattice pointLattice, ir::Assign::make(indexSetIter.getCoordVar(), indexSetIter.getPosVar()) ); // Code to increment both iterator variables. 
+ std::cout << "some casting stuff happening\n"; auto incr = ir::Block::make( compoundAssign(iter.getIteratorVar(), ir::Cast::make(Eq::make(iter.getCoordVar(), setMatch), iter.getIteratorVar().type())), compoundAssign(indexSetIter.getIteratorVar(), ir::Cast::make(Eq::make(indexSetIter.getCoordVar(), setMatch), indexSetIter.getIteratorVar().type())), @@ -1876,7 +1879,7 @@ Stmt LowererImplImperative::lowerForallBody(Expr coordinate, IndexStmt stmt, vector appenders, const set& reducedAccesses) { - std::cout << "lowering a forall body----------------------------------------------------\n"; + // std::cout << "lowering a forall body----------------------------------------------------\n"; Stmt initVals = resizeAndInitValues(appenders, reducedAccesses); @@ -1893,7 +1896,7 @@ Stmt LowererImplImperative::lowerForallBody(Expr coordinate, IndexStmt stmt, // Code of loop body statement Stmt body = lower(stmt); - std::cout << "\nBefore: [" << stmt << "]\nAfter : [" << body << "]\n"; + // std::cout << "\nBefore: [" << stmt << "]\nAfter : [" << body << "]\n"; // Code to append coordinates Stmt appendCoords = appendCoordinate(appenders, coordinate); @@ -1911,10 +1914,12 @@ Expr LowererImplImperative::getTemporarySize(Where where) { TensorVar temporary = where.getTemporary(); Dimension temporarySize = temporary.getType().getShape().getDimension(0); Access temporaryAccess = getResultAccesses(where.getProducer()).first[0]; + std::cout << "temporaryAccess: " << temporaryAccess; std::vector indexVars = temporaryAccess.getIndexVars(); if(util::all(indexVars, [&](const IndexVar& var) { return provGraph.isUnderived(var);})) { // All index vars underived then use tensor properties to get tensor size + std::cout << "All index vars underived then use tensor properties to get tensor size\n"; taco_iassert(util::contains(dimensions, indexVars[0])) << "Missing " << indexVars[0]; ir::Expr size = dimensions.at(indexVars[0]); for(size_t i = 1; i < indexVars.size(); ++i) { @@ -1925,16 +1930,19 @@ 
Expr LowererImplImperative::getTemporarySize(Where where) { } if (temporarySize.isFixed()) { + std::cout << "temporary is fixed\n" ; return ir::Literal::make(temporarySize.getSize()); } if (temporarySize.isIndexVarSized()) { + std::cout << "temporary is index var sized\n"; IndexVar var = temporarySize.getIndexVarSize(); vector bounds = provGraph.deriveIterBounds(var, definedIndexVarsOrdered, underivedBounds, indexVarToExprMap, iterators); return ir::Sub::make(bounds[1], bounds[0]); } + std::cout << "should this be an error\n"; taco_ierror; // TODO return Expr(); } @@ -2003,7 +2011,7 @@ vector LowererImplImperative::codeToInitializeDenseAcceleratorArrays(Where Expr p = Var::make("p" + temporary.getName(), Int()); Stmt guardZeroInit = Store::make(alreadySetArr, p, ir::Literal::zero(bitGuardType)); - std::cout << "vector LowererImplImperative::codeToInitializeDenseAcceleratorArrays\n" << std::endl; + // std::cout << "vector LowererImplImperative::codeToInitializeDenseAcceleratorArrays\n" << std::endl; Stmt zeroInitLoop = For::make(p, 0, bitGuardSize, 1, guardZeroInit, LoopKind::Serial); Stmt inits = Block::make(alreadySetDecl, indexListDecl, allocateAlreadySet, allocateIndexList, zeroInitLoop); return {inits, freeTemps}; @@ -2205,8 +2213,10 @@ vector LowererImplImperative::codeToInitializeTemporaryParallel(Where wher vector LowererImplImperative::codeToInitializeTemporary(Where where) { TensorVar temporary = where.getTemporary(); + cout << "temporary found: " << temporary << std::endl; const bool accelerateDense = canAccelerateDenseTemp(where).first; + cout << "accelerateDense: " << accelerateDense << std::endl; Stmt freeTemporary = Stmt(); Stmt initializeTemporary = Stmt(); @@ -2217,6 +2227,7 @@ vector LowererImplImperative::codeToInitializeTemporary(Where where) { initializeTemporary = Block::make(initializeTemporary, initTempSet); tempToBitGuard[temporary] = tempSet; } else { + cout << "higher order temporary found: " << temporary << std::endl; // TODO: Need to 
support keeping track of initialized elements for // temporaries that don't have sparse accelerator taco_iassert(!util::contains(guardedTemps, temporary) || accelerateDense); @@ -2234,19 +2245,32 @@ vector LowererImplImperative::codeToInitializeTemporary(Where where) { needComputeValues(where, temporary)) { values = ir::Var::make(temporary.getName(), temporary.getType().getDataType(), true, false); - taco_iassert(temporary.getType().getOrder() == 1) - << " Temporary order was " << temporary.getType().getOrder(); // TODO + std::cout << "values: " << values << std::endl; + std::cout << "dataType: " << values.type() << std::endl; + + // taco_iassert(temporary.getType().getOrder() == 1) + // << " Temporary order was " << temporary.getType().getOrder(); // TODO + Expr size = getTemporarySize(where); + std::cout << "temporarySize: " << size << std::endl; + // no decl needed for shared memory Stmt decl = Stmt(); if ((isa(where.getProducer()) && inParallelLoopDepth == 0) || !should_use_CUDA_codegen()) { decl = VarDecl::make(values, ir::Literal::make(0)); + std::cout << "decl statement: " << decl << std::endl; } Stmt allocate = Allocate::make(values, size); + std::cout << "allocate stmt: " << allocate << std::endl; freeTemporary = Block::make(freeTemporary, Free::make(values)); + std::cout << "free temp: " << freeTemporary << std::endl; initializeTemporary = Block::make(decl, initializeTemporary, allocate); + std::cout << "initializeTemporary: " << initializeTemporary << std::endl; + + // taco_iassert(temporary.getType().getOrder() == 1) + // << " Temporary order was " << temporary.getType().getOrder(); // TODO } /// Make a struct object that lowerAssignment and lowerAccess can read @@ -2259,7 +2283,7 @@ vector LowererImplImperative::codeToInitializeTemporary(Where where) { } Stmt LowererImplImperative::lowerWhere(Where where) { - std::cout << "\n--------------------------------------- lowering where statement: " << where << "\n\n\n"; + // std::cout << 
"\n--------------------------------------- lowering where statement: " << where << "\n\n\n"; TensorVar temporary = where.getTemporary(); bool accelerateDenseWorkSpace, sortAccelerator; std::tie(accelerateDenseWorkSpace, sortAccelerator) = @@ -2296,7 +2320,7 @@ Stmt LowererImplImperative::lowerWhere(Where where) { }) ); - std::cout << "\ninitiating lowering of where consumer: " << where.getConsumer() << std::endl; + // std::cout << "\ninitiating lowering of where consumer: " << where.getConsumer() << std::endl; Stmt consumer = lower(where.getConsumer()); if (accelerateDenseWorkSpace && sortAccelerator) { // We need to sort the indices array @@ -2320,13 +2344,13 @@ Stmt LowererImplImperative::lowerWhere(Where where) { true, false); Expr size = getTemporarySize(where); Stmt zeroInit = Store::make(values, p, ir::Literal::zero(temporary.getType().getDataType())); - std::cout << "Stmt LowererImplImperative::lowerWhere\n"; + // std::cout << "Stmt LowererImplImperative::lowerWhere\n"; Stmt loopInit = For::make(p, 0, size, 1, zeroInit, LoopKind::Serial); initializeTemporary = Block::make(initializeTemporary, loopInit); } whereConsumers.push_back(consumer); - std::cout << "\nwhere temporaries: " << where.getTemporary() << std::endl; + // std::cout << "\nwhere temporaries: " << where.getTemporary() << std::endl; whereTemps.push_back(where.getTemporary()); captureNextLocatePos = true; @@ -2339,7 +2363,7 @@ Stmt LowererImplImperative::lowerWhere(Where where) { whereTempsWithLoopDepth.insert(std::pair(where.getTemporary(), loopDepth)); - std::cout << "\ninitiating lowering of where producer: " << where.getConsumer() << std::endl; + // std::cout << "\ninitiating lowering of where producer: " << where.getConsumer() << std::endl; Stmt producer = lower(where.getProducer()); if (accelerateDenseWorkSpace) { const Expr indexListSizeExpr = tempToIndexListSize.at(temporary); @@ -2458,7 +2482,7 @@ Stmt LowererImplImperative::lowerAssemble(Assemble assemble) { 
resultModeOrdering[iter.getMode().getLevel() - 1]); Expr pos = iter.getPosVar(); Stmt initPos = VarDecl::make(pos, iter.locate(locateCoords)[0]); - std::cout << "Stmt LowererImplImperative::lowerAssemble\n"; + // std::cout << "Stmt LowererImplImperative::lowerAssemble\n"; insertEdgeLoop = For::make(coords.back(), 0, dim, 1, Block::make(initPos, insertEdgeLoop)); } else { @@ -2496,7 +2520,7 @@ Stmt LowererImplImperative::lowerAssemble(Assemble assemble) { initAssembleStmts.push_back(initValues); } } else if (zeroInit) { - initAssembleStmts.push_back(zeroInitValues(resultTensorVar, 0, prevSize)); + initAssembleStmts.push_back(zeroInitValues(resultTensorVar, 0, prevSize)); // init values } } Stmt initAssemble = Block::make(initAssembleStmts); @@ -2540,7 +2564,7 @@ Stmt LowererImplImperative::lowerMulti(Multi multi) { } Stmt LowererImplImperative::lowerSuchThat(SuchThat suchThat) { - std::cout << "lowering such that statement\n"; + // std::cout << "lowering such that statement\n"; Stmt stmt = lower(suchThat.getStmt()); return Block::make(stmt); } @@ -2654,6 +2678,7 @@ Expr LowererImplImperative::lowerSqrt(Sqrt sqrt) { Expr LowererImplImperative::lowerCast(Cast cast) { + std::cout << "casting: " << cast.getA() << ", dataType: " << cast.getDataType() << std::endl; return ir::Cast::make(lower(cast.getA()), cast.getDataType()); } @@ -2870,7 +2895,7 @@ Stmt LowererImplImperative::initResultArrays(vector writes, // iteration of all the iterators is not full. We can check this by seeing if we can recover a // full iterator from our set of iterators. Expr size = generateAssembleCode() ? getCapacityVar(tensor) : parentSize; - result.push_back(zeroInitValues(tensor, 0, size)); + result.push_back(zeroInitValues(tensor, 0, size)); // init values } } return result.empty() ? 
Stmt() : Block::blanks(result); @@ -3021,7 +3046,7 @@ Stmt LowererImplImperative::initResultArrays(IndexVar var, vector writes util::contains(reducedAccesses, write)) { // Zero-initialize values array if might not assign to every element // in values array during compute - result.push_back(zeroInitValues(tensor, resultParentPos, stride)); + result.push_back(zeroInitValues(tensor, resultParentPos, stride)); // init values } } } @@ -3068,7 +3093,7 @@ Stmt LowererImplImperative::resizeAndInitValues(const std::vector& app Stmt LowererImplImperative::zeroInitValues(Expr tensor, Expr begin, Expr size) { - std::cout << "1 Stmt LowererImplImperative::zeroInitValues\n"; + // std::cout << "1 Stmt LowererImplImperative::zeroInitValues\n"; Expr lower = simplify(ir::Mul::make(begin, size)); Expr upper = simplify(ir::Mul::make(ir::Add::make(begin, 1), size)); Expr p = Var::make("p" + util::toString(tensor), Int()); @@ -3081,9 +3106,10 @@ Stmt LowererImplImperative::zeroInitValues(Expr tensor, Expr begin, Expr size) { return ir::VarDecl::make(ir::Var::make("status", Int()), ir::Call::make("cudaMemset", {values, ir::Literal::make(0, Int()), ir::Mul::make(ir::Sub::make(upper, lower), ir::Literal::make(values.type().getNumBytes()))}, Int())); } - std::cout << "2 Stmt LowererImplImperative::zeroInitValues\n"; + // std::cout << "2 Stmt LowererImplImperative::zeroInitValues\n"; + // if generating ispc code, we will keep the LoopKind as Init so that we can initializa it if tasks are used if (should_use_ISPC_codegen()) { - return For::make(p, lower, upper, 1, zeroInit, LoopKind::Foreach); + return For::make(p, lower, upper, 1, zeroInit, LoopKind::Init); } return For::make(p, lower, upper, 1, zeroInit, parallel); } @@ -3366,6 +3392,7 @@ Stmt LowererImplImperative::codeToIncIteratorVars(Expr coordinate, IndexVar coor for (auto& iterator : levelIterators) { Expr ivar = iterator.getIteratorVar(); if (iterator.isUnique()) { + std::cout << "casting \n"; Expr increment = iterator.isFull() ? 
1 : ir::Cast::make(Eq::make(iterator.getCoordVar(), @@ -3636,6 +3663,7 @@ Expr LowererImplImperative::generateAssembleGuard(IndexExpr expr) { } void visit(const CastNode* node) { + std::cout << "lowering to cast node\n"; expr = lower(node->a); } diff --git a/src/lower/tensor_path.h b/src/lower/tensor_path.h index 4f5dc49af..da52fb782 100644 --- a/src/lower/tensor_path.h +++ b/src/lower/tensor_path.h @@ -2,6 +2,7 @@ #define TACO_TENSOR_PATH_H #include +#include #include #include "taco/util/comparable.h" @@ -47,14 +48,13 @@ class TensorPath : public util::Comparable { friend bool operator==(const TensorPath&, const TensorPath&); friend bool operator<(const TensorPath&, const TensorPath&); + friend std::ostream& operator<<(std::ostream&, const TensorPath&); private: struct Content; std::shared_ptr content; }; -std::ostream& operator<<(std::ostream&, const TensorPath&); - /// A step along a tensor path. class TensorPathStep : public util::Comparable { diff --git a/src/tensor.cpp b/src/tensor.cpp index 5e02d2660..176856196 100644 --- a/src/tensor.cpp +++ b/src/tensor.cpp @@ -10,6 +10,7 @@ #include #include +#include "../test/util.h" #include "taco/cuda.h" #include "taco/format.h" #include "taco/taco_tensor_t.h" @@ -806,7 +807,36 @@ void TensorBase::assemble() { } } -void TensorBase::compute() { +void TensorBase::compute(std::ofstream& statfile, std::string& sofile) { + taco_uassert(!needsCompile()) << error::compute_without_compile; + // if (!needsCompute()) { + // return; + // } + setNeedsCompute(false); + // Sync operand tensors if needed. 
+ auto operands = getTensors(getAssignment().getRhs()); + for (auto& operand : operands) { + // std::cout << "operand: " << operand.second << std::endl; + operand.second.syncValues(); + operand.second.removeDependentTensor(*this); + } + + auto arguments = packArguments(*this); + + taco::util::TimeResults timevalue; + bool time = true; + TOOL_BENCHMARK_TIMER2(this->content->module->callFuncPacked("compute", sofile, arguments.data()), + "\nkernel execution time: ", timevalue); + // this->content->module->callFuncPacked("compute", arguments.data()); + + if (content->assembleWhileCompute) { + setNeedsAssemble(false); + taco_tensor_t* tensorData = ((taco_tensor_t*)arguments[0]); + content->valuesSize = unpackTensorData(*tensorData, *this); + } +} + +void TensorBase::compute(std::ofstream& statfile) { taco_uassert(!needsCompile()) << error::compute_without_compile; // if (!needsCompute()) { // return; @@ -820,7 +850,37 @@ void TensorBase::compute() { } auto arguments = packArguments(*this); + + taco::util::TimeResults timevalue; + bool time = true; + TOOL_BENCHMARK_TIMER2(this->content->module->callFuncPacked("compute", arguments.data()), + "\nkernel execution time: ", timevalue); + // this->content->module->callFuncPacked("compute", arguments.data()); + + if (content->assembleWhileCompute) { + setNeedsAssemble(false); + taco_tensor_t* tensorData = ((taco_tensor_t*)arguments[0]); + content->valuesSize = unpackTensorData(*tensorData, *this); + } +} + +void TensorBase::compute() { + taco_uassert(!needsCompile()) << error::compute_without_compile; + if (!needsCompute()) { + return; + } + setNeedsCompute(false); + // Sync operand tensors if needed. 
+ auto operands = getTensors(getAssignment().getRhs()); + for (auto& operand : operands) { + operand.second.syncValues(); + operand.second.removeDependentTensor(*this); + } + + auto arguments = packArguments(*this); + std::cout << "running the compute function from the shared library\n"; this->content->module->callFuncPacked("compute", arguments.data()); + std::cout << "compute function executed\n"; if (content->assembleWhileCompute) { setNeedsAssemble(false); diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 02464ce26..f4d848de0 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -11,6 +11,7 @@ add_executable(taco-test ${TEST_SOURCES} ${TEST_HEADERS}) target_link_libraries(taco-test taco-gtest) target_link_libraries(taco-test pthread) target_link_libraries(taco-test taco) +target_link_libraries(taco-test papi) if(${CMAKE_VERSION} VERSION_LESS "3.9.0") add_test(NAME taco-test COMMAND taco-test) diff --git a/test/kernels/mttkrp_gemm/mttkrp_ryan.c b/test/kernels/mttkrp_gemm/mttkrp_ryan.c new file mode 100644 index 000000000..9d0536b8c --- /dev/null +++ b/test/kernels/mttkrp_gemm/mttkrp_ryan.c @@ -0,0 +1,177 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? 
(_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = 
(taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +int assemble(taco_tensor_t *A1845, taco_tensor_t *matmul_5_5_5, taco_tensor_t *A1475, taco_tensor_t *A1416) { + int A18451_dimension = (int)(A1845->dimensions[0]); + int A18452_dimension = (int)(A1845->dimensions[1]); + double* restrict A1845_vals = (double*)(A1845->vals); + + A1845_vals = (double*)malloc(sizeof(double) * (A18451_dimension * A18452_dimension)); + + A1845->vals = (uint8_t*)A1845_vals; + return 0; +} + +int compute(taco_tensor_t *A1845, taco_tensor_t *matmul_5_5_5, taco_tensor_t *A1475, taco_tensor_t *A1416) { + int A18451_dimension = (int)(A1845->dimensions[0]); + int A18452_dimension = (int)(A1845->dimensions[1]); + double* restrict A1845_vals = (double*)(A1845->vals); + int* restrict matmul_5_5_51_pos = (int*)(matmul_5_5_5->indices[0][0]); + int* restrict matmul_5_5_51_crd = (int*)(matmul_5_5_5->indices[0][1]); + int* restrict matmul_5_5_52_pos = (int*)(matmul_5_5_5->indices[1][0]); + int* restrict matmul_5_5_52_crd = (int*)(matmul_5_5_5->indices[1][1]); + int* restrict 
matmul_5_5_53_pos = (int*)(matmul_5_5_5->indices[2][0]); + int* restrict matmul_5_5_53_crd = (int*)(matmul_5_5_5->indices[2][1]); + double* restrict matmul_5_5_5_vals = (double*)(matmul_5_5_5->vals); + int A14751_dimension = (int)(A1475->dimensions[0]); + int A14752_dimension = (int)(A1475->dimensions[1]); + double* restrict A1475_vals = (double*)(A1475->vals); + int A14161_dimension = (int)(A1416->dimensions[0]); + int A14162_dimension = (int)(A1416->dimensions[1]); + double* restrict A1416_vals = (double*)(A1416->vals); + + #pragma omp parallel for schedule(static) + for (int32_t pA1845 = 0; pA1845 < (A18451_dimension * A18452_dimension); pA1845++) { + A1845_vals[pA1845] = 0.0; + } + + #pragma omp parallel for schedule(runtime) + for (int32_t i1542matmul_5_5_5 = matmul_5_5_51_pos[0]; i1542matmul_5_5_5 < matmul_5_5_51_pos[1]; i1542matmul_5_5_5++) { + int32_t i1542 = matmul_5_5_51_crd[i1542matmul_5_5_5]; + for (int32_t i1545 = 0; i1545 < A14162_dimension; i1545++) { + int32_t i1545A1845 = i1542 * A18452_dimension + i1545; + double ti1543A1845_val = 0.0; + for (int32_t i1543matmul_5_5_5 = matmul_5_5_52_pos[i1542matmul_5_5_5]; i1543matmul_5_5_5 < matmul_5_5_52_pos[(i1542matmul_5_5_5 + 1)]; i1543matmul_5_5_5++) { + int32_t i1543 = matmul_5_5_52_crd[i1543matmul_5_5_5]; + int32_t i1545A1416 = i1543 * A14162_dimension + i1545; + for (int32_t i1544matmul_5_5_5 = matmul_5_5_53_pos[i1543matmul_5_5_5]; i1544matmul_5_5_5 < matmul_5_5_53_pos[(i1543matmul_5_5_5 + 1)]; i1544matmul_5_5_5++) { + int32_t i1544 = matmul_5_5_53_crd[i1544matmul_5_5_5]; + int32_t i1545A1475 = i1544 * A14752_dimension + i1545; + ti1543A1845_val += (matmul_5_5_5_vals[i1544matmul_5_5_5] * A1475_vals[i1545A1475]) * A1416_vals[i1545A1416]; + } + } + A1845_vals[i1545A1845] = ti1543A1845_val; + } + } + return 0; +} +#include "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/mttkrp_gemm/taco_default.h" +int _shim_assemble(void** parameterPack) { + return assemble((taco_tensor_t*)(parameterPack[0]), 
(taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3])); +} +int _shim_compute(void** parameterPack) { + return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3])); +} diff --git a/test/kernels/mttkrp_gemm/mttkrp_ryan.h b/test/kernels/mttkrp_gemm/mttkrp_ryan.h new file mode 100644 index 000000000..3d0c06f50 --- /dev/null +++ b/test/kernels/mttkrp_gemm/mttkrp_ryan.h @@ -0,0 +1,125 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + 
else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +#ifndef TACO_GENERATED_assemble +#define TACO_GENERATED_assemble +int assemble(taco_tensor_t *A1845, taco_tensor_t *matmul_5_5_5, taco_tensor_t *A1475, taco_tensor_t *A1416); +#endif + +#ifndef TACO_GENERATED_compute +#define 
TACO_GENERATED_compute +int compute(taco_tensor_t *A1845, taco_tensor_t *matmul_5_5_5, taco_tensor_t *A1475, taco_tensor_t *A1416); +#endif diff --git a/test/kernels/mttkrp_gemm/taco_default.c b/test/kernels/mttkrp_gemm/taco_default.c new file mode 100644 index 000000000..edf8cdb16 --- /dev/null +++ b/test/kernels/mttkrp_gemm/taco_default.c @@ -0,0 +1,183 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, 
int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +int assemble(taco_tensor_t *A1538, taco_tensor_t *matmul_5_5_5, taco_tensor_t *A1475, taco_tensor_t *A1416, taco_tensor_t *A1479) { + int A15381_dimension = (int)(A1538->dimensions[0]); + int A15382_dimension = (int)(A1538->dimensions[1]); + double* restrict A1538_vals = (double*)(A1538->vals); + + A1538_vals = (double*)malloc(sizeof(double) * (A15381_dimension * A15382_dimension)); + + A1538->vals = 
(uint8_t*)A1538_vals; + return 0; +} + +int compute(taco_tensor_t *A1538, taco_tensor_t *matmul_5_5_5, taco_tensor_t *A1475, taco_tensor_t *A1416, taco_tensor_t *A1479) { + int A15381_dimension = (int)(A1538->dimensions[0]); + int A15382_dimension = (int)(A1538->dimensions[1]); + double* restrict A1538_vals = (double*)(A1538->vals); + int* restrict matmul_5_5_51_pos = (int*)(matmul_5_5_5->indices[0][0]); + int* restrict matmul_5_5_51_crd = (int*)(matmul_5_5_5->indices[0][1]); + int* restrict matmul_5_5_52_pos = (int*)(matmul_5_5_5->indices[1][0]); + int* restrict matmul_5_5_52_crd = (int*)(matmul_5_5_5->indices[1][1]); + int* restrict matmul_5_5_53_pos = (int*)(matmul_5_5_5->indices[2][0]); + int* restrict matmul_5_5_53_crd = (int*)(matmul_5_5_5->indices[2][1]); + double* restrict matmul_5_5_5_vals = (double*)(matmul_5_5_5->vals); + int A14751_dimension = (int)(A1475->dimensions[0]); + int A14752_dimension = (int)(A1475->dimensions[1]); + double* restrict A1475_vals = (double*)(A1475->vals); + int A14161_dimension = (int)(A1416->dimensions[0]); + int A14162_dimension = (int)(A1416->dimensions[1]); + double* restrict A1416_vals = (double*)(A1416->vals); + int A14791_dimension = (int)(A1479->dimensions[0]); + int A14792_dimension = (int)(A1479->dimensions[1]); + double* restrict A1479_vals = (double*)(A1479->vals); + + #pragma omp parallel for schedule(static) + for (int32_t pA1538 = 0; pA1538 < (A15381_dimension * A15382_dimension); pA1538++) { + A1538_vals[pA1538] = 0.0; + } + + #pragma omp parallel for schedule(runtime) + for (int32_t i1542matmul_5_5_5 = matmul_5_5_51_pos[0]; i1542matmul_5_5_5 < matmul_5_5_51_pos[1]; i1542matmul_5_5_5++) { + int32_t i1542 = matmul_5_5_51_crd[i1542matmul_5_5_5]; + for (int32_t i1546 = 0; i1546 < A14792_dimension; i1546++) { + int32_t i1546A1538 = i1542 * A15382_dimension + i1546; + double ti1543A1538_val = 0.0; + for (int32_t i1543matmul_5_5_5 = matmul_5_5_52_pos[i1542matmul_5_5_5]; i1543matmul_5_5_5 < 
matmul_5_5_52_pos[(i1542matmul_5_5_5 + 1)]; i1543matmul_5_5_5++) { + int32_t i1543 = matmul_5_5_52_crd[i1543matmul_5_5_5]; + for (int32_t i1544matmul_5_5_5 = matmul_5_5_53_pos[i1543matmul_5_5_5]; i1544matmul_5_5_5 < matmul_5_5_53_pos[(i1543matmul_5_5_5 + 1)]; i1544matmul_5_5_5++) { + int32_t i1544 = matmul_5_5_53_crd[i1544matmul_5_5_5]; + for (int32_t i1545 = 0; i1545 < A14791_dimension; i1545++) { + int32_t i1545A1475 = i1544 * A14752_dimension + i1545; + int32_t i1545A1416 = i1543 * A14162_dimension + i1545; + int32_t i1546A1479 = i1545 * A14792_dimension + i1546; + ti1543A1538_val += ((matmul_5_5_5_vals[i1544matmul_5_5_5] * A1475_vals[i1545A1475]) * A1416_vals[i1545A1416]) * A1479_vals[i1546A1479]; + } + } + } + A1538_vals[i1546A1538] = ti1543A1538_val; + } + } + return 0; +} +#include "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/mttkrp_gemm/taco_default.h" +int _shim_assemble(void** parameterPack) { + return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3]), (taco_tensor_t*)(parameterPack[4])); +} +int _shim_compute(void** parameterPack) { + return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3]), (taco_tensor_t*)(parameterPack[4])); +} diff --git a/test/kernels/mttkrp_gemm/taco_default.h b/test/kernels/mttkrp_gemm/taco_default.h new file mode 100644 index 000000000..54274569e --- /dev/null +++ b/test/kernels/mttkrp_gemm/taco_default.h @@ -0,0 +1,125 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? 
(_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = 
(taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +#ifndef TACO_GENERATED_assemble +#define TACO_GENERATED_assemble +int assemble(taco_tensor_t *A1538, taco_tensor_t *matmul_5_5_5, taco_tensor_t *A1475, taco_tensor_t *A1416, taco_tensor_t *A1479); +#endif + +#ifndef TACO_GENERATED_compute +#define TACO_GENERATED_compute +int compute(taco_tensor_t *A1538, taco_tensor_t *matmul_5_5_5, taco_tensor_t *A1475, taco_tensor_t *A1416, taco_tensor_t *A1479); +#endif diff --git a/test/kernels/sddmm_spmm/csr_dense_dense_sddmm.c b/test/kernels/sddmm_spmm/csr_dense_dense_sddmm.c new file mode 100644 index 000000000..a5e031e7a --- /dev/null +++ b/test/kernels/sddmm_spmm/csr_dense_dense_sddmm.c @@ -0,0 +1,199 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? 
(_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = 
(taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +int assemble(taco_tensor_t *A2531, taco_tensor_t *cage3, taco_tensor_t *A1392, taco_tensor_t *A1451) { + int* restrict A25312_pos = (int*)(A2531->indices[1][0]); + int* restrict A25312_crd = (int*)(A2531->indices[1][1]); + double* restrict A2531_vals = (double*)(A2531->vals); + int* restrict cage32_pos = (int*)(cage3->indices[1][0]); + int* restrict cage32_crd = (int*)(cage3->indices[1][1]); + int A13921_dimension = (int)(A1392->dimensions[0]); + + A25312_pos = (int32_t*)malloc(sizeof(int32_t) * 6); + A25312_pos[0] = 0; + for (int32_t pA25312 = 1; pA25312 < 6; pA25312++) { + A25312_pos[pA25312] = 0; + } + int32_t A25312_crd_size = 1048576; + A25312_crd = (int32_t*)malloc(sizeof(int32_t) * A25312_crd_size); + int32_t i1468A2531 = 0; + + for (int32_t i1467 = 0; i1467 < A13921_dimension; i1467++) { + int32_t pA25312_begin = i1468A2531; + + for (int32_t i1468cage3 = cage32_pos[i1467]; i1468cage3 < cage32_pos[(i1467 + 1)]; i1468cage3++) { + int32_t i1468 = cage32_crd[i1468cage3]; + if (A25312_crd_size <= 
i1468A2531) { + A25312_crd = (int32_t*)realloc(A25312_crd, sizeof(int32_t) * (A25312_crd_size * 2)); + A25312_crd_size *= 2; + } + A25312_crd[i1468A2531] = i1468; + i1468A2531++; + } + + A25312_pos[i1467 + 1] = i1468A2531 - pA25312_begin; + } + + int32_t csA25312 = 0; + for (int32_t pA253120 = 1; pA253120 < 6; pA253120++) { + csA25312 += A25312_pos[pA253120]; + A25312_pos[pA253120] = csA25312; + } + + A2531_vals = (double*)malloc(sizeof(double) * i1468A2531); + + A2531->indices[1][0] = (uint8_t*)(A25312_pos); + A2531->indices[1][1] = (uint8_t*)(A25312_crd); + A2531->vals = (uint8_t*)A2531_vals; + return 0; +} + +int compute(taco_tensor_t *A2531, taco_tensor_t *cage3, taco_tensor_t *A1392, taco_tensor_t *A1451) { + double* restrict A2531_vals = (double*)(A2531->vals); + int* restrict cage32_pos = (int*)(cage3->indices[1][0]); + int* restrict cage32_crd = (int*)(cage3->indices[1][1]); + double* restrict cage3_vals = (double*)(cage3->vals); + int A13921_dimension = (int)(A1392->dimensions[0]); + int A13922_dimension = (int)(A1392->dimensions[1]); + double* restrict A1392_vals = (double*)(A1392->vals); + int A14512_dimension = (int)(A1451->dimensions[1]); + double* restrict A1451_vals = (double*)(A1451->vals); + +// int32_t i1468A2531 = 0; + + #pragma omp parallel for schedule(runtime) + for (int32_t i1467 = 0; i1467 < A13921_dimension; i1467++) { + for (int32_t i1468cage3 = cage32_pos[i1467]; i1468cage3 < cage32_pos[(i1467 + 1)]; i1468cage3++) { + int32_t i1468 = cage32_crd[i1468cage3]; + double ti1469A2531_val = 0.0; + for (int32_t i1469 = 0; i1469 < A14512_dimension; i1469++) { + int32_t i1469A1392 = i1467 * A13922_dimension + i1469; + int32_t i1469A1451 = i1468 * A14512_dimension + i1469; + ti1469A2531_val += (cage3_vals[i1468cage3] * A1392_vals[i1469A1392]) * A1451_vals[i1469A1451]; + } + A2531_vals[i1468cage3] = ti1469A2531_val; + // i1468A2531++; + } + } + return 0; +} +#include 
"/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/csr_dense_dense_sddmm.h" +int _shim_assemble(void** parameterPack) { + return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3])); +} +int _shim_compute(void** parameterPack) { + return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3])); +} diff --git a/test/kernels/sddmm_spmm/csr_dense_dense_sddmm.h b/test/kernels/sddmm_spmm/csr_dense_dense_sddmm.h new file mode 100644 index 000000000..a9d6b760d --- /dev/null +++ b/test/kernels/sddmm_spmm/csr_dense_dense_sddmm.h @@ -0,0 +1,125 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? 
(_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = 
(taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +#ifndef TACO_GENERATED_assemble +#define TACO_GENERATED_assemble +int assemble(taco_tensor_t *A2531, taco_tensor_t *cage3, taco_tensor_t *A1392, taco_tensor_t *A1451); +#endif + +#ifndef TACO_GENERATED_compute +#define TACO_GENERATED_compute +int compute(taco_tensor_t *A2531, taco_tensor_t *cage3, taco_tensor_t *A1392, taco_tensor_t *A1451); +#endif diff --git a/test/kernels/sddmm_spmm/csr_dense_dense_sddmm.so b/test/kernels/sddmm_spmm/csr_dense_dense_sddmm.so new file mode 100755 index 0000000000000000000000000000000000000000..c2c5ca30ea045392ec3b72aaa875d7ea80b5ac1e GIT binary patch literal 14360 zcmeHOeQ;aVmA}tYVi5~T2_|s@A&(%a4Y6WF2sTN;vL)v^t0dmUfzSpNS(a>1ZOOGH z;?PV8>zK`3A)4Jnp`HB*JJXr&w%cZQVPToh#*RZsfbKYaZ|Q8^<)d-~B{2oEB#rlX z?z>l(p5?MTv$KEYCQs*{-#zEt?~ixi@q_+AbBUrb87%A$M%Z<(4+w~Nu}N&LyyY&KMgJ+xJBeQ zTf5jqdq^s(!|PTK8x}n4H*I5!KHqx&_Nuy7N8kJQ>BDFHvInzIj#CztemNd8Q;O%1 zs@N{8DJipC82g-i*jnPAIe`EXA zhd%pewDGaXYyXlt>u4?e+mpX}?PJHl%a7OApZK#VRA&uyP~ZhcQm+Q47QuG|D<=Q_ 
zS@46i;M7hk9&?xgP|W_kS;`rjMZONWipLxl%wlH?1ecqI!M8!)#ja$1s#=hG1Gi&m z4Pv0Y44y(KmtV*RO!8-twt(5$xRZ;qod#k5Ebueb(Fk(xP9je7haDVdHwpV+6ZXyJ zQGt{Em}op2dlYUJxJ%$q8!+lb@|Uq;IUxnUQple%*;&ZtFK1RU?}Rvep5u1LL^-#L zgi`|FDR50H1cRNu@mMgC4D~02K^E+ebthRc3c`XdZ5x9fk^V?$cOn_-Z`;_^6OTpO zLhU^fBdvH!Fgy?<1EHSo2O@0!=8dhvo_M@3*q@3eyL%%>Rz>hwWFVObM`9hpSbSHA zibFc#uLr|jUk^q@-90Sa+sA;!gYC$sfA`i%s6X80jiQh<;l@Zb-XCGTp`M<2m~}*` zf8db-;? z5%nbEo?2qU6kE-Lk&aL@gi^OB5~8+2)W)dWDo>3OsijDurLie^vu9NXANU6m1O-y-_?Tp@2K?A{H(<$~!+HcZR!O{trNC1l`3)kC`10;< zjC?EGE3nj(ZtQd=?0}G$`;Q!^2j`I%g4kyyF84>932znhU8>5}q`c(Eml-(i4Pq(h za@jmQf8RKhhs&an!nr&gvmqBMM834Piz@LnH4mrOmZ35amwOkb)#Tw~N^zCCJY23F zl+cieQ{FOYdAMu~N^8x-#ZiN+Y|q2R(ShST^YAi`7(d$Jhc+!eXXzr9X+y)wlBqFm z@R)5hCoy&FawJV%@5ZllSp#CkcTxJp_!PqRONdv9ALIN2;;CyUhB-fvcxtkV!<;W6 zo|6@J_A%!Cahv6ew%mSP_6!)roW$Dj3y}| z7EN;by;I}UD))|3D@^my?_t(5OJt?DB5wULV!O+TZJWbf3JH*e9bf=`+Od zorprh;JtTy?|Pnk$Ggqjwr#8S(1%Zg(ZaiGv~;ktLrYt)uOvX+!pJDgs>Dv;Ul z-bY|2;2!X$chv>booYb$xw|wy;NFQyot9o-p&_Cn;-W|^A~uRNXz9}0DT-~+-g$9q zYX7ive+a|Dn`uUoKK{tBe~PrqfIi-$|3cG8wX}~iX>-@5f68<9Ie#*yg-_^v+-oU2 z)u;!F!So)t8VK8;OZ%R)dh17=nU_$t!K~ubf9tC{p$!fzlraW03bY}!;s8aomXTjs zHf2`XTEa&ImVkS^mRUR6qMz50Lru%bX^VI8UBy@RA3puxz34OLgr0Pl4}Nfw#Hauj z-i-AYU@q^eZ0%<-zk)`9f9Ruh;2Gw8bQC4)>72a<*gq>{8-ev{lu#M?H4C!O10a_;{gdJxrL z;m@os$&@P_!&6%L)yYPfJCBmwV|_i-s(9<)KsiRCaAI$-FSB;OPk9wmkezIT-~}XF zU(2+X@5OM^`*-99e^EF+1)7;U6)BU5PhHuu}pV+hfw2$ogGWNNC{e*WUyUeeg z@SZx?5*}M)e`4<!C zjj5(L*|N78K#P7%)0--?m0)SmSG4Fa08lZL-XH^6Clb(+EBr%zfl!fHsV7{{M;}D0 zH?w}qkCh{19kTM=FgH$LH+W7-U2%j?zu8m&Wtder=+*~X24A!W>i4)SlFdzN>m6D; z;I=jD*80Y@ZyjsWt!rrjxf;{fx}o7F9a!p_BeZE`)7(nZO;+~itfZdDwuj2TT`)|r zp3k82S3!Xh*r|NRClC8G(n}7ZpPDK)6xb(|EOBJ z(Uz{r4nrTKbofQ%rRK$kawxGQI83k>{Wx_`jdMQBeivdx@3iPgosVwAa@3M(wgxi2 zRxboy>_wlOAAd;Nm1#o1{?@PmzD56B%lL-)f{rd9n;uX8^yUlxHBM*7LMvN1>9)6o zPigvDt!h%stayeBKyy{~R-M#e_UR`B8TWTK>K~k5BIecLC>CpXr8fA2HB-LWTfb;; zwXgoPKkfGHdC#wWr0Fl}c6UqoQ*GpflI+KyW4zeir(c@RU;p_{ZEm~1!;LL*kK08Z 
zs4voJFs#8~ptf^4hwjEa*1~&GHvOnJ@@tEh-r=qZ=;iKolhx8>!;rN!+4Uy7rOBl< zmFrEeYzjG1xhn(da<`Vg3(cLxBBU<~=v&YV%{3SvTJ3Pb=-5YtA+CWoB8NTLwKD5Ap{smp=1Xf<=fXDH z$h=6y`lw~F0XY|9kf&R%Kx=AG`mi_oeCW0YXBAd|J{vJZwc1ym3$X_}tFU)Et1t$# zZ+}LW!_J3Q(ayK~R4Ga|c&uwQ8$en#Z#~n}n{igT+oh#{2w3QZc&_kd*K*ugj zP5l702CMt)pbQP|hW;>U1L!zt8z==n^g)#ewlHPDrCen%wH<_x8yMY<)S)bgL7BtV z?5Nn_EZb$<$L?IR=Ehsx%LyiZOm{X6Ia()V2q3Zp&vR5yI>k0RT;D2bbW}WQS?^FY z)<#F=BXhitnuq6l9d(1H>m6f?<&%~%bh)>5JfM6-C-0^|1!(`E7J%F>)G%LYfEMNu4ez8NKfjY>S!<|^8H zC*oAEM@qboiia&;hdO92X>(L~d9fLzHrNT-HpudOlj(d>zZq@dbL>+rf32Vmnr7sz zQ5(?Ob{O_vNB=&JIP*FBO8$PH!{#mH?ehxK2GHlSPpK`&f$cY8zXPWJZfeSO#X%M? zg-b1PsRb^zz@-+r)B=}U;8F|xpSOU#*OvF#^4=Op5+3M|or2`)4xNJCAi3{2;xkh2 zJI!K|F7MH=5bNnRGgw`M1Jy&IZp4?2=Z=xyEr$?JMqVa zyu2HxGa3at^75dnoCq~`#z1rDZh2?DNZ?}vkhn`kWI2b0oV>f1`Qg5Wm;bORD317? z&*dxlq)5k+nWu~D39J8qM1D6n=lGeZ=v-03g@WE7=xRZm1-(nqsG#=?`mmt#`^WmG zrrXuZZSDA0mQru?-0G>Rz9q%E+P$@Po|;uv2Fg~TL`I_-C6NPlw!`vQA#XIIjSmI; zegOYyt`-@ku2;62?J0kG7jVm7O1zJvJXo!z7b&95SFU$lQD$*m<=D1pzT(0#De)*4 z$L(cn%D%FAzOtaSq2h_g#R^JiS;Y*z<%$xBS>3Fx><(zcN*GTGe@#=A^N!1zf|USJHR4aCl`n9*cH&uI#?0ZZ&lKlhIHj*&9lBdBVoDysSi4!txqZEc2AF zzEpxQXbQgnt4+1FMnkIAz4)rxS=|v&;R9cFT@8R_Di&GS8Hq*uyTf4V@opC*%D9`?|xdcOabTXJ1mE zm4($=$?dO1rK@kN_V>2>YFY)uhlof<5reB&p$M_mKy_yy>P3H_5uDnSxDqaeukl`;j1U&>g}O(*zZ zEEKh15Xki{KgEL8O0H{#xRuHEuMnTZeT*nG=8_M+`_J4^80LlqJ?$|wV3>sOs;E%@(XA9Z&130d)4Gi8gUE1Q7DGX-Lx>x zlG{bixZG{cxZGvU_|if^a29sW44j6sm7Se3CpZC={uQ z2^@SM_3Gh2RLsBQGR~c z25h!=_y}nVbFxRSL?kcFmM@7E%lABRrO147ph1e^%Q-%C+>#%T0uLSYyr|<1J99q% zk+5U-3%Z-*Gv`Y$$7kAkXcqiCz=catUh6PM_o-?DVlc;Mk$<1#c4qc3`UH3}KdUde ze35ktXOUv%d5hyS{j#o}MSczNV&&f;?3n$sh6HZ*%i1)H{iVEKGyTEZXOZ6p+?6+v zWYZ3ce9iu2PtIcJc_BZ;FH7KKCjYL}yq)dLC2$(|6vl{q(E9CnTken@S1iP3nqJmVR9Htz!Rz?9_;Liw}*Oy9XJUk zf}zv^BQL6+NHWslxovfQU2!4qh!za>_lI`Fg(}&<8@^g0_>gs^dV6<6C6D8tYXze5 zXtXCD>X1dlu?ya8u>`dFZ=$24w--eg-dvs_$`g#n;Fc8(HgEB6^auT$eB=@r6o!KL zhfs9#_c-4@o4gxanjneOX3(z*i<)l>Ab8SA+v@|H8@++x z=H}+D{D?yX*bHAllMCK-uJB#Ux%f@{?B!c6^e`mfJ;NDq+6I+e_}>+R`95?-UH7K*E^sf* 
z%YGI=L6c+O3@PC@$lJ51pCiv%`U#5s$hj|A;Qwf<4)?BPp2Y56^eSkw-=JMG7Q@Se z2WNK=Kk-+0cQDR%g%VxN)3G}SO@k)!4ncDFNBR@p@mK*9L|T8OCqxP&*4L9{9-g}g zQBP+aY?2%Z!=_xETT~4^XP%gBlBPc^72;msu?<7f~WCBIFfD=ZaRxc-+AodoI10?Q^3 z@;yXSx4VefKP^TsW0DGB<&E2SWbB`zYf#^rMnoZFW+kq}z~6cR7-m?^BZQG}Lh> zGsjaGum$e7`2JKFB=va zNGk8SAi@LXl^+sd^7Y4szNClqg%NiNUQ&LK0!-#F-wP#`^_6BMFX^8mAwPflCN61* z>HR_GEA3@W`qGbFQjb)$krrbr|DFDk`jWnaq3xncS$_GR(>II0Xbx$PjUz@_O%5?4QoqWCOMOWvkS^o-qTf{w&TQ8#du|r} zlS2PK!%XhitZxNi#z^=p*BUC#IZY#CmZiUYF=HgzEA%_RME@G0Z?5O-Lch#pT(Xi@ zAtK}XS^aXk!QGh;B5sx?-x4!MD%V_RD5?g@{hDQomup8(2$2S%e}Xn=1Qky`9y$Kx m`cLbV9D9i7g6$?F8ZafA^QXVzGKp}-Hrm(`8chl&mi;#j97$dP literal 0 HcmV?d00001 diff --git a/test/kernels/sddmm_spmm/csr_dense_spmm.c b/test/kernels/sddmm_spmm/csr_dense_spmm.c new file mode 100644 index 000000000..7f710f6c1 --- /dev/null +++ b/test/kernels/sddmm_spmm/csr_dense_spmm.c @@ -0,0 +1,190 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? 
(_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = 
(taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +int assemble(taco_tensor_t *A2535, taco_tensor_t *A2531, taco_tensor_t *A1455) { + int A25352_dimension = (int)(A2535->dimensions[1]); + double* restrict A2535_vals = (double*)(A2535->vals); + + A2535_vals = (double*)malloc(sizeof(double) * (5 * A25352_dimension)); + + A2535->vals = (uint8_t*)A2535_vals; + return 0; +} + +int compute(taco_tensor_t *C, taco_tensor_t *A, taco_tensor_t *B) { + int C1_dimension = (int)(C->dimensions[0]); + int C2_dimension = (int)(C->dimensions[1]); + double* restrict C_vals = (double*)(C->vals); + int A1_dimension = (int)(A->dimensions[0]); + int* restrict A2_pos = (int*)(A->indices[1][0]); + int* restrict A2_crd = (int*)(A->indices[1][1]); + double* restrict A_vals = (double*)(A->vals); + int B1_dimension = (int)(B->dimensions[0]); + int B2_dimension = (int)(B->dimensions[1]); + double* restrict B_vals = (double*)(B->vals); + + #pragma omp parallel for schedule(static) + for (int32_t pC = 0; pC < (C1_dimension * C2_dimension); pC++) { + C_vals[pC] = 0.0; + } + + 
#pragma omp parallel for schedule(dynamic, 1) + for (int32_t i0 = 0; i0 < ((A1_dimension + 15) / 16); i0++) { + for (int32_t i1 = 0; i1 < 16; i1++) { + int32_t i = i0 * 16 + i1; + if (i >= A1_dimension) + continue; + + for (int32_t jpos0 = A2_pos[i] / 4; jpos0 < ((A2_pos[(i + 1)] + 3) / 4); jpos0++) { + int32_t jposA = jpos0 * 4; + if (jpos0 * 4 < A2_pos[i] || (jpos0 * 4 + 4) + ((jpos0 * 4 + 4) - jpos0 * 4) >= A2_pos[(i + 1)]) { + for (int32_t k = 0; k < B2_dimension; k++) { + int32_t kC = i * C2_dimension + k; + for (int32_t jpos1 = 0; jpos1 < 4; jpos1++) { + int32_t jposA = jpos0 * 4 + jpos1; + if (jposA < A2_pos[i] || jposA >= A2_pos[(i + 1)]) + continue; + + int32_t j = A2_crd[jposA]; + int32_t kB = j * B2_dimension + k; + C_vals[kC] = C_vals[kC] + A_vals[jposA] * B_vals[kB]; + } + } + } + else { + #pragma clang loop interleave(enable) vectorize(enable) + for (int32_t k = 0; k < B2_dimension; k++) { + int32_t kC = i * C2_dimension + k; + for (int32_t jpos1 = 0; jpos1 < 4; jpos1++) { + int32_t jposA = jpos0 * 4 + jpos1; + int32_t j = A2_crd[jposA]; + int32_t kB = j * B2_dimension + k; + C_vals[kC] = C_vals[kC] + A_vals[jposA] * B_vals[kB]; + } + } + } + } + } + } + return 0; +} +#include "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/csr_dense_spmm.h" +int _shim_assemble(void** parameterPack) { + return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2])); +} +int _shim_compute(void** parameterPack) { + return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2])); +} diff --git a/test/kernels/sddmm_spmm/csr_dense_spmm.h b/test/kernels/sddmm_spmm/csr_dense_spmm.h new file mode 100644 index 000000000..cf0cf205c --- /dev/null +++ b/test/kernels/sddmm_spmm/csr_dense_spmm.h @@ -0,0 +1,125 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if 
_OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, 
int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +#ifndef TACO_GENERATED_assemble +#define TACO_GENERATED_assemble +int assemble(taco_tensor_t *A2535, taco_tensor_t *A2531, taco_tensor_t *A1455); +#endif + +#ifndef TACO_GENERATED_compute +#define TACO_GENERATED_compute +int compute(taco_tensor_t *A2535, taco_tensor_t *A2531, taco_tensor_t *A1455); +#endif diff --git a/test/kernels/sddmm_spmm/csr_dense_spmm.so b/test/kernels/sddmm_spmm/csr_dense_spmm.so new file mode 100755 index 0000000000000000000000000000000000000000..398362532976cb094350fa44b666bd3babbcabb3 GIT binary patch literal 14520 zcmeHOdvIITnZH+35*Z6g2_cwA$xRT&X=24LAy_0Sk}W4!SsC2Kp$3vFvMfKgw(PZ~ z;y@ZqWK(Vxg)B`=c9ufA!|t@Z-ID3j(g!oO^K{Z>aRN!`K)c>41-a#6QZ|?Z(f+=3 z&ylaM?RIDOkDb}s1M+vi-}gA*dEE23_vFEv@Oq0ZOCnjN&q>79U8W;`E~qT$84$lz zDb2&*rP30nJ7-cfEmwY>4hbqr3InK&x%f?dim{*y%Lw|2MaTDXD(r|QJNNav+m zGJ=lqst9V=)g&7z{f4>ROg&srP|~GT_uFI><#%4q*y&VH&@)m 
z%bTrT?7Te$<<((zTL>Rk{5IUNMf%{xNcZJs_S%jA{DAcz7o1!@aPztjDuU!L{K(Bb z{0=Izy{Oz$?66hIkIYeIl3jrxRky0<>|2Mvb?7@k{mo-LQ;x=t{^m!DzxP*14!-hN z4c8sK_&{^j{`gBj$-d{TEB?EazkBIjXYVh*QQ;f@S~E;%O*bRp1v5x#m?$&gJAlok ze`*%|?`OfIv*5p)1>ZLd{-arN>IVfsbGizGnc}a6qGDDCUpb5ZCdih~pnpH~-O`m( zpQ22WhM6CSq?G8eu+1|4MbaZC{p-2@;8nW*cJA$U&;^o18o5#j#Ivq&{e6pcd^vbZ z4}rR+MbqM0&-Ls0b0o(5y#nWPex$pB4=T4vYAE6WCtJAwc9S0pV}W$J)WvaOm1DR= z8s+hC=7K!O{XBkE7(^m19i53tPcqt_j6|eJTcRx~MVdjRNT|Lx(iHEGx3u*n6{I^rEY@uU>% z=#pUH8EHgN-8(kNqusIAKr^Z|9j=NucXr36j%a&(XH05}Q*%U^aWbCh>FkaqrRMH9 zuU1#I8+!3}Dbmy0))9&J^u#+@<1jXcC{xL})ZW(Ef~>cv(_2AIjB=}`NW3YUjG}Ih zJw3clBB-<80;{~`dZvOh;ZRj|meaAh;N(B?A9cM9 z{TLg&lH>J)FmCuzP6PSm1oAdZadM0qO4%g_#L`!H+K1aax0@T|~+maK6Md(YOJ3 z2xZU-1MW27CA?f(LwJ>#R59SR)`(PUz|nApRBph<+(N=i11{zY!u9z4Y=qM(r-23e2OGk>N4Qy@Iq44Z`sw%oPkA(qz()wE%^~OebjzL z%HQ%Bp!|*d@z=H3j~wx>WH&aNN4oKA#LL8wFn%}j)P-Y%jK80FYTB_wjPE9%nsn?S zY{km|mq-NI@!NC?aJ4Xgp zeG{to+vL?~s!i}iO?COU{OF{~wj2+@|L~aYk<4`l}?g5_r6snk0@#pxQ&x4oA z92JwKKOBL}psv1ctQi_QANWGxuAflv1-1n0e@dAxo7IQjYCulSEDmQnl_oV~yRj4k zRqOYtz$*sT%m#NjyT{`P7tZ#3XcM4nPk0VND-$UVX6`EwXIe^BE$G<}T{}ciLloBb zcuL6xjud9n?{R}>Hmt~YT@UbkP)ZNFT%qxb{Lm$v8oQW#?UQ`If6&$M#0Usv*XM_R zV}rNrUSJVU0aR61J zL~*EDTbe34%G~BCJB}Z$Be2-M6@PkF}nv&`0?*|K%kfkD_+YFj;PU7?X|_1vL9 zTGB_XuF#R835)NrE4ve>+3UUmLuT19G-`oCy&6Ml0qq?%_L7=;mPH@t;fAvd&r>1W z1KO)<)-gAnbv&+S7ak)Qq3o*RP34!M+TdBwQ4%FRb!v8%UA5{> z2oJ9SRx{7D)~n5W#vy(I5cR(Lw09)WDN-4{fQ?Y#=zz}v39x7(d-47wyV*%|2a0iLg4hPZn?cuELcqn!x ztl2!segcUUihY8XKBgdxYB4CBY4?=U_Ubwzz%tMO}od>!f}25aM7tH=l&vFe}WW9+YbfRSMMod zvQRdhcQ3g>E$P>ttG=Yi4W;8n@AaQ@^}EpCFneJ7hLW|;f&35A>eY?N(7Z4d$+@L!PRj3f@}UF63iv2hDqJc zuF_%1PC@1*S&dYiT``XoEP#gqlQw)2WEYT{ad=deO=FA1t333aP^S>*MHHCgpZD}} zU@6tT7oCaX7GA{fZHi1m=JhaYMZ*&LS3~lMDxX*7bJVkPJ}h4f%O6$@R-Bn!51QfL zA*Bz?&91xu4NPhM9GEto4!D*r1m?EQgWwod3g{`IVUoLTa{(U(Gz2KfAP2el#UzCJ z$c-+myYC4EPEVchdG=g}HAL#_r-Ll2NY5SQG{}P)OcvqH_w;G|S$%AyHU}ZgI_3qk z3-KJBRAXnTNgOX?0G$QS+8&J;JBdbGz!$Jy<7pYS&xb7|zsB7;oghFx1C$LP$&R`&(MNH?J`gt{W4k64ZmxbX$ 
zoZEIjsGX7zW3FZ8A(}Jl0Yf}Rx)K?RF+yRx9Vf8VQ3PU`$j$o#=_9sK`lzC&kGO+t zwtDJa1KLOVyqd;DE~6ozMuaLFvRvNjX;sq`Zr8y4Kj!mkeZ3(KTB~ZcOK5vqf#m1e zN)gcBhRSzf><(wJN~qvQ!TGT`(6Z%v^t&IT6wKkYVA4>e1v9tzB3lgy+z{se0}L%j zDkM$n%{I13y9Pduy_fpXQCh+8-4@sy*cSL=AcAc`KiuonVkuUCbjDhY>B=z8j5L@V zf>>?TpzoIf*ONm*->+Ry4F-yaXwLLu%sn~iJFXt6&92%V$_9@~uBQ$Mibhn-bu@|k zhWxH4PmlueqVt%WbS*ZC(X2`{mECvT?|SOEZ%925$~tz}X6ui!hAU-HlIzLCMdwtS zD_rWn(%e454@b?b}NW7I(n5*}-)45pR@GR~;4`PgoFU>evg|uCVxN!cb{o zU{_s_MHEAuRW!70TwV4-XxV$zw>fkO-=AS5+tu`8j6C9Lkg6tB--Xn-SeRqW5n;vP*aQY~F7iH)c5;5g?oa!O4gcKmJ=DuR6AVHRNMdNgHL>?zcL4pFS_ZOVfnN?>>K*Z@@2#5qKb>@oJ>0kJ8Gq7 ztaN_3cbl4BJ3Mw1E2j@Qc1?uRW>g=KnR(0S9@#fpzBYo#+UIzD=hEK?X(t`Tj8mfx z2ZnNsYvkd;>4{KmWR2tTT~`IQZs@jooc7?ui!&6Zb*51uk zLJCTc3wD&%_S{W65YmoKX;S$5Uk0qvQe|$A4;FFg?&3cCCT;G8QbUR z(Xdx(HeXdHxK66pY-?#OyQ?y`%7MXZ4Or@jKc|k)J;1yqnU`G!FR7oRqfyycrc6P2 zt^@U3HfOMP`DkEp*9|oluUEXS@8i@=b!qyTyfL#@!E|fS?SC(y_bq=gr)C}~&3zRN zGWOgqJyViP=@}*0P8sdrshF|t$Zw|e2Ha=-w(QzI+Q$vOY0aHuTGizhgZHB6jOR1f zf9O2)Tly`mh6$Wp8p{o5o}j0J9eC^+d#2R}@8vkso%@(drS8hCw+A!jAb7<6U~!NG zp|(~tRrZQADrQx;n%(MAf|*{Jvo8KMCc4s!lUSHnuzt(kh4}v9+65s!q;}ULv^9!} zplvHol9ON4hS-m#AAYbctqWpog_%VO3{*0(le9cnOjSkiHDF0_uq7ciAAQk^s!y&4 zx%hZtp8A4Y&D@8`Zy(zZP|Zy8hlr>50%>{^fVJyd>zZCrdav^SyZJnAhL1vrP6IE4 zK7p0zWfcE1_}4MNc(4T3qorsmY65KsJqp?fx)9UQe$e2he0~)4X;25*gO(D-yqW}k z9rO{_qye49rNt_VCMmL9exdnhqhso)4kqVve8w1pS@4I{hBqOx!JRn zV6w;XlO)7R<5r|FGF$PR04~#8gDR){QA?GxWUqCDQ_0$@oTYo_tap|_JU8I1OwZfk z+-J2Oo_CqEGTDw?!fxc)xuu z*B4{$8qSON*)-QsqH#{?KQHAwiH`H~3ePXkhPwf%o!^h;xwZiFe1MJWRJNemmF4bjx^AE~Z7spW+q4 z!GrO%rTE`S;=8%I?1y>9zRBr7aQY)ok8^sC(>FN1$mv|(F^f3;G^eXM?dAQkp}P7u zrF2VUDv?Ynw|Z~!mY3a}VqC?pib`+!s%1JVtw4l&BkEls2JUQ=(>>7F8UX5%ZSdpO49Mr6&9%3JW_G-=U@phG5joS8I|RaoR>>75;@2J zUhhi%|DNvHO8R0Ji>-`xCYsw?R<_+-xf-_J$>wNJvLl*o^~Utaw8$bVVSP!Gt(VF7 zoU-7%myG)WN_9nr-jGUJ2fkXilr?px@M*5BvK&A%m58rvi6`RSZ85O)IdENXh4U&W692LfL(1dsiQa6(=Gj( z`mCHsY+*0P$8x?p)7XV?(oVuTo3y$T^DQoUs@i4<=q3Ew#JS+pk^p~jI? 
ziJrEWM7&99OC+T&eE4nSv`G5vr|K6=e%_i>aYx~P(Nui;_j{6Cu|pR3Z>H+IqzSR$7z9@AXvF$sfZ=Dp zB-Wj&_yXJ~Qj8GGTX7Rcd`C43t+e}9X41b>67$+r{pt51NNz2xHb!m2#cE>4XQ(Xm zBUV+jzF2+DxL9?~_@|}#+obpMW`PTU1@}e7lOm>P@;sA%DcA2Zjg!xC z{E!J>$MM2K51!Iy{A9zu6Y-ceGJJZ!YrrW^u@3^v61_=~XBa=v0-q^*g~zYLtU z1K*IavkivBe+@RMbiYdBkNi>|YV|Do@mcU4v)~72 z!G8#RruI3?^rzqV`W?s3_q~Q^@n6c?ZTda4*PuVs{jePOZ@z!_3CGR%w-)ktGv6mW z%lZ-X4{uL)UPhbV!1c}dz19GqsU5a3ecl-c(gA#_ffreg>Cdn}PTkl{-I_qwlT0-? zV*@vNtt^u4h{Wh(Sr6_)HFZW>+B+Mg?U5$zsCpvNRIfz$sM_Pnc$4?m)xOG^mDq)~ zNVK~u2j@AXaEP%S2{CNB;eOhibA8-s?Jrrv?1`-1ufz^O$TujpA zZt>z6;bn)9vbH9PajnswR>_OYny}Pq62}?R`*OUyr>!$Fg^57c9dD140nc@{CnYZ{ z-HWWZr4wwD?sj@v*L&lw{Bdk;g0bLq6aBf=HASA^8%435N%DxG=;EXo_a-GTp2QA3 z@WA4YsTP<-6D^oYINO$J?iBfGV`F#x%YwrpDlYP95Rs)zpWK#v5)Tr4St z_fvdXk^I2Ox3CxIF+s)qD5$VRK8#5PL+^%!y*RfCdK*_3@e4gcw?IzsnglP-cY=0t zdy1cQ%=M@C5i(L0=RiRR2}Ba*3wu%j1Y~4iMM_BGeVL#NuQ1tTmscoX*!Kb>mBRZx zets19y~TStsIWw38B-4!qrEtP3OZs`Mqa#s6qLQ!0WQiH=Tt#OeT5sr3;ISMV&7e$q99Jq_TM(yi*vM~mBKN%7xfly z{|Om-g(b#^IB$!8w_>iZuov_>*isV-dvPwe^P{HFg9=N67xV~F>N{aC&hO&i1&vVS zB8hqnI}z7z$Vf%li*x?yVzQDXh5eoUBkTqJSBQ-Eh4%s6kjA$$2|GdQgR{|Iyf^4o zbSt4Je3gqiF4I=3%2 z83@~ literal 0 HcmV?d00001 diff --git a/test/kernels/sddmm_spmm/fused_kernel.c b/test/kernels/sddmm_spmm/fused_kernel.c new file mode 100644 index 000000000..1572bce5a --- /dev/null +++ b/test/kernels/sddmm_spmm/fused_kernel.c @@ -0,0 +1,183 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? 
(_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = 
(taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +int assemble(taco_tensor_t *A1459, taco_tensor_t *cage3, taco_tensor_t *A1392, taco_tensor_t *A1451, taco_tensor_t *A1455) { + int A14592_dimension = (int)(A1459->dimensions[1]); + double* restrict A1459_vals = (double*)(A1459->vals); + + A1459_vals = (double*)malloc(sizeof(double) * (5 * A14592_dimension)); + + A1459->vals = (uint8_t*)A1459_vals; + return 0; +} + +int compute(taco_tensor_t *A1459, taco_tensor_t *B, taco_tensor_t *A1392, taco_tensor_t *A1451, taco_tensor_t *A1455) { + int A14591_dimension = (int)(A1459->dimensions[0]); + int A14592_dimension = (int)(A1459->dimensions[1]); + double* restrict A1459_vals = (double*)(A1459->vals); + int B1_dimension = (int)(B->dimensions[0]); + int* restrict B2_pos = (int*)(B->indices[1][0]); + int* restrict B2_crd = (int*)(B->indices[1][1]); + double* restrict B_vals = (double*)(B->vals); + int A13921_dimension = (int)(A1392->dimensions[0]); + int A13922_dimension = (int)(A1392->dimensions[1]); + double* restrict A1392_vals = (double*)(A1392->vals); + 
int A14511_dimension = (int)(A1451->dimensions[0]); + int A14512_dimension = (int)(A1451->dimensions[1]); + double* restrict A1451_vals = (double*)(A1451->vals); + int A14551_dimension = (int)(A1455->dimensions[0]); + int A14552_dimension = (int)(A1455->dimensions[1]); + double* restrict A1455_vals = (double*)(A1455->vals); + + #pragma omp parallel for schedule(static) + for (int32_t pA1459 = 0; pA1459 < (A14591_dimension * A14592_dimension); pA1459++) { + A1459_vals[pA1459] = 0.0; + } + + + #pragma omp parallel for schedule(runtime) + for (int32_t i0 = 0; i0 < ((A13921_dimension + 15) / 16); i0++) { + + for (int32_t i1 = 0; i1 < 16; i1++) { + int32_t i1467 = i0 * 16 + i1; + if (i1467 >= A13921_dimension) + continue; + + for (int32_t i1468B = B2_pos[i1467]; i1468B < B2_pos[(i1467 + 1)]; i1468B++) { + int32_t i1468 = B2_crd[i1468B]; + double tA1459_val = 0.0; + for (int32_t i1469 = 0; i1469 < A14512_dimension; i1469++) { + int32_t i1469A1392 = i1467 * A13922_dimension + i1469; + int32_t i1469A1451 = i1468 * A14512_dimension + i1469; + tA1459_val += (B_vals[i1468B] * A1392_vals[i1469A1392]) * A1451_vals[i1469A1451]; + } + for (int32_t i1470 = 0; i1470 < A14552_dimension; i1470++) { + int32_t i1470A1459 = i1467 * A14592_dimension + i1470; + int32_t i1470A1455 = i1468 * A14552_dimension + i1470; + A1459_vals[i1470A1459] = A1459_vals[i1470A1459] + tA1459_val * A1455_vals[i1470A1455]; + } + } + } + } + return 0; +} +#include "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/fused_kernel.h" +int _shim_assemble(void** parameterPack) { + return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3]), (taco_tensor_t*)(parameterPack[4])); +} +int _shim_compute(void** parameterPack) { + return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3]), 
(taco_tensor_t*)(parameterPack[4])); +} diff --git a/test/kernels/sddmm_spmm/fused_kernel.h b/test/kernels/sddmm_spmm/fused_kernel.h new file mode 100644 index 000000000..e67e5a761 --- /dev/null +++ b/test/kernels/sddmm_spmm/fused_kernel.h @@ -0,0 +1,125 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; 
// always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +#ifndef TACO_GENERATED_assemble +#define TACO_GENERATED_assemble +int assemble(taco_tensor_t *A1459, taco_tensor_t *cage3, taco_tensor_t *A1392, taco_tensor_t *A1451, taco_tensor_t *A1455); +#endif + +#ifndef TACO_GENERATED_compute +#define TACO_GENERATED_compute +int compute(taco_tensor_t *A1459, taco_tensor_t *cage3, taco_tensor_t *A1392, taco_tensor_t *A1451, taco_tensor_t *A1455); +#endif diff --git a/test/kernels/sddmm_spmm/fused_kernel.so b/test/kernels/sddmm_spmm/fused_kernel.so new file mode 100755 
index 0000000000000000000000000000000000000000..10619e0ca4f31ef30f7d2630d603c67321ada156 GIT binary patch literal 14512 zcmeHOeRNyJm7ixRjuIO^C757}OGFS|r^Jd~La-qzk}W6CQ~@_}+VGLevMkwNY{}IJ zIMhu{6;mFHXj3UDAe^Z&FFqtgu{fxNwr3T^=1@-kJ1L9$g zY%#u9v8%c4qPe1Zx&2odkfe+`IY4!k<8$HNoCS6AjHC~h8Tfudr5>@Q=eWu!cUfSIIH?}N~zcK%P3LkMma z^)2KVoA8IEqC32ARj^^fXY*~_+1tyX)bG7ArXD%F_{(p8aq)LQaPfF2RYBz{K4fMw zJ_ntOZApDuMWwY_d1#SSp|VTypI@BR8S!>P)yfxVH3D!%^FBL|=V z=#H9$=RX!{ek}as?`B`Kw^w}m+28*7Rr}EQKikkW@o)sH3#RMQ;N>MsYk)aR;Jbj8 zl0UHk{!a_wKVJafzX1NN1#t4W6CZQBe1U!iAy}&Yjli8|Bj6hsu(K6qt4he<4|xZ> zjEy**MWudjr;?$FC#jp|@+;UwCi$y`{6U&TNS*wfu`{5{StUCpIw7BRr;s0{$%1qp zcuEg|I@pSN?b#&c_lxJK+7M@V3A|pklXyz>JS=BR*p4Nfo&DM_=P}WMiWJ+eATL;7-wgO)3NeJp-|5Adw8llYsyW^hNuUED!-gB-q!_VgrMLo^Ub{O$`K+z436cJ28tHJQcBl@IWG* zWTAmU2K`u|3yq5J+7=GRL%rSzx-=hd4o70~FdGQ=_s2r4J525faOGq;nux^%NfwER zMYjfnama=HSs>BdHxLLW65#>v9L|Q&%2YDU`unzWSUh^IpC}*FMn2UHTJ0SSUTH-D2@4-`2P{*JsBfhGy?-GWVd_`cXclBW= zm$7dMdAZNY{mD~7T1a9)lepZ6Z6=%v`5~v1t4VpuPpvd?+Jne1GUW>J!bfYS0LQ8^ zn=TaK_F0s%YEdukA)-sX)LDR&?`5hjz-eS;sxQFh+CpWG1-RTB3HKD>h5M^kfK!`f zYA?X0kEm=%0WOX@T&1%BmqS7Yy#@H?T+;X&EWk0~vq?+8V$(8VzN-t<%+>Dq^hSBRhF{3nQ~A)FrL{Qbm}Yo`x!K2AKj zbowCY|CV@i<@A2ecM(snnjYc&oy3!irUyB{m3VT^bSLL;C7!0nbUWvpi6<9LdpLg+ z@#LE6dd`3IGAE1BHp-Lwjk{fMKj3%0?HR#A;~O2Ww~s{J>dL)F;Yy}uH!gvJ?OJw` z0;)A#(DdIVuYjwz!VbCWlAZafIhCD9$V+oP^q(cc>h+yya~TC1fuzFdXec4|+hmf3hG@Z7u6#hiw(V~air zULkdKjIq~`z@*O*-#HzDgu#11?ETR9X!N|>y&c=PX%DG9lQ#rz5+BorjkI83#e+_OhaDpciF}-TJ;0213-QHQ?7$l3aJsI*?le>RT(<( zhm8xOXa+QQ{?GaR-Z6D=1oOz7^|{cwSAXUmf7v?zYb#pQo^8#OxhcrP`WYfVS5nJb zpP`9Fy?Y$gug`4Jf8VA*lR02?F0+@{nAyi01kD#o8l}`~C{2QlPg%9lB$zKyS+?d; z^@#}$q7z^tc@P90cpgN3;&}}gxl7Vx)Xkp&rOL-ZefrcE{S_}N-^-=7<33k4aB68K zFp@>KkxYd2^dGJMzV3Hz(3Bak{sKfaf7W_j%Wn9o1BTMSzNoz5JvpuENtdTB zYrREFzv_Um9zt0r>9VzD*E|OuKUG%G%aN(YLu1j$VLbh&0^5UEjII7rdaO);9ag#_ z(4kMHU%IF%hqdu3i}ISL9N&_ydG6#{zmg|IJ0RAHO7>At^bM}o(cgJ9OVvl7gj-tm zSGA1KN)j}6ZlMP zaK9&_*8U39?^!Lg!4*-f&NQ5A%dR`xFlIb&5!GQWPajqyUiA(u)6!!OZ_~+$ddJa- 
zx?|$LUq;l}I5i6c;%^#uX%RK?yp4*^sXKnC?)_JC&0gstJoOkiZ?By=8) zvA%(J-C_Mnwq5Zy{S>V_4AqI<_rjx<%4whSCx}73NV2R@(od4e%9tH zlFU|>`;-gX7lh5{c{SiH3ZQ0~`ht`yI z)waQmSAF!jc5{cT-lINx%xil?-TMn@rSmrR!PBIkbR{9$p?C%(eugW=tmhU z8%JfW?3}u+mC+#Vz#yEY0cbeYa2|tj%CF4K`QMQ~roiQgF$O!Vm#V9+muu<6j;7b| z`;~BgH(cMDyc3?LX@YiRT{9jI!`B$l=dx8wdQ55Zxop&POa<#{(T;L7qkMM_M3o8c z3*?VVj16W%)pKa&Qj{)jOCPpsX-qYnA02k8qbV$41XL?nY!0GQnuyhkAJ2Q*cz~^9 z6#`bNr7t+t(Is!^^C+Bzv_ev}DtORl0w6p^#8~Wvuujl=rhe+- zw@`WJNn=SxM`wWg^c)&m3G+@$s2{`Xb-Fj4hg;wCT^mdtx1==M2byar4;k8pftKKt7jz!>A8iochB;_J$f?4LUi=#yILbW`Q$hgY5TQU86} z%JNoy!aJT@*{V!Gbq=pSL*~6$gvtEO&6VHYT@5?>>D9;57nMCP!rbhU$EP12 zeo3E%QKLROA9(dVoerM%pnv4QAGYbI;lf5NrEC0HQysa;`Fy@D^qfy`)bu0%?3&MV zSLhwCYQO$!PD3duEx$HRZ?WZWG=Mh!DA?*;Em$oBVQjqsoU}oZft-p0Jm_m$NBP?A zL}9I-aHx;$L8&*pIp5ll$67LK1xwbe&t8+hprkH+nzonR$v+xqoww@N``gmTt+b+6 zCpWcZtnb${ewVFTw>C9rd>dJdZr#9F{^pFeadfOj2bTKxr)^{uG;WM%iPmDKkz z(CF;*1(RH*^A#PUGK(?CM#pwv)7o&R;djQmqh(rZ)6XckW;Qt6Fhz1t{vn@lx^7QS z%Y3Xh_b*_OvE}~3$W-S(Yh;|ceU#CELDfu;AioU}GBU&i0jlXQky~L$j z>Kn%HGMzv9_{pOkJ@)?0N9&8ub+p~`bDd4g_PgqR8CzzD6}8mk==vi(Mw)KL6S}EZ z%Xn~%Tavcudi{bD~PVcCK~m-YVqCST7x#-iUDbOR$UGG zh}w0c;S?%@4(66$?{;;nD=;A;e7CE{r(>fgK@Tr?l0h8{7?5^ISYf#fPip@-78zG> z?&Rw<3J3V^iWRMfpL|nQSk1_9aP8cBmb(_Re7xzTtYtn7C)@|`Y({TynXcxA#`9CE zdBQUL8X}Ng_lf(! 
zfcAs>F;XL-2S6VK{Q;K5DbO`o{3@Y;Hz@6!PvPW01o|TAKF})Iq@>)xl_^6G<;u#% zw*Am^0jr?`w1+knW_N6|SKq2u+-DnMx4i4-_g?Q>MKI}O+Ox+YN6(8)eq`>(=OEhu z3f<1`d;NpuUVCGD@n-u}nWeRSsl5@H*IwIfcS06A z&34-d=<-&N&)4zsVDISxWnR1E!7{JCI!*SD#R+rMeH{kPf&EsA1q^v!EMOw{7j3tMMk zs}p(h#Xf5LgBGvdnYNa7@D`)1cu#?18oz3cBX=-*l`iX~B1S$vgt9d#=Em{x!hPO@u&PtN4?;uR#p@1%D;@Qw%G z@xcGN2ju%^`F>fxPo^cB61@wjBzbzbO{vl#v%ky8cb@Wh7x8oN?C-f(3VFUZBL6PI z%lFxQPXtEmF{M9V%*Tk%i27#t8Ih6ie0N~&q$J&0@<`4gfeZwlTj?EQZe z`MbHf-Umg;J}2lu3;HcVj|qBO(3b=~FKD?Km=%J)N6__x%HJtAx3t{ktli#)-@;PP z8{Id!>+7yhajs!^L!-NX-D(46YtTZ&iN+AfiM!C|c_-uzr_xab6L&AZzDYS#Dw_eI zY%|+a4txl>Wj7Umgt9zYt&1;GMp>p@YrnL@V!zV9{qkjs1JkC=tyt_gRoq=3+*6dd?3XYFiHzgF+r8HKpNNOn z(l2JA(ArQe8tLm<+jo8Adg#WJkzgV@5KQ*EL&jseq)c|g@?)l0mMV9iD8uhw3gQ8r zEe#EZBb{{v_|>wft~-{(PjhvR^#GEoXn12!I2w-kg}~C!fg6V!*VnCI=dA0I|8IYv zvrc{@U8%nQ?mn0?n&zyFL}PXQN6EUucq|zXC1Y`bgMA@3FceC}*e~gwr-&#<69enlp$XB{P+dw^Pt1?9<7|tV=y3SAR)DAyHI@A`C`RlKz!4MTLKrv0#`c#JW=`Y{4Xu>s>K! zVRD@-#;wdD7si4D3x02u>q-HPUn$F(T#t(J#Z0bi#kh^h^{*J8|9d=@Td)cbis!yq zzJhs#H;eJg+4!PjeE#qERBpjdPR8F9%d4ziF4zSH7M!l-@2dqc>?~t)-zmnIBc8}v z2=TlHkudUi)WSjwo$8&X+OyQ&$NyRR9S zyRI33cQGJ13%hC_PSe=RUdx*cQ~+f-CCDd5F3)7Slzgp_A2iLA_X_-w3EwF2*^M4N zrET~q1#u_xG3(;^{BhTTQ#<832rN(ZrcEg^f1U(hs{Qo7p%i`!IQN4%knlS@urmVp zhE&v1=EQuf_qD;;YPYTHXcZw=aMv7r;Layp$gf za{2jjUB?7&j_aa#W~KW39`2v{@nqkDe5p9Fqr(0{3_Ybz{>|763)p{4$d8H2={cvg zT=>l#Pxc2fj^=o>tA)Hd-m3|?lUpqQ+Q#LJG2-q4zG^d@U|9mOP~EXWPk*c{*dOS|i7F8YriK{BQT2zD;coYh>zf)&3-N%p zKrkK;?m`e%GQJB@T0z8`b*Bafc0r|p;}LE}qOnM%KZX!8(X>b^5$+D$6OKp2{b;EO z<8lYkj6fuc7%nqXj|w(z^=@emwBF{U7`lM47`QixK2T8C927}KLlWyw!~(s+Xg9^v z`R=&QyQQrKk~qEwS~X!&^KAu$&^u{+v;X#HuRn16rcK*gI|3cvW`C=xMue;JAhg+5 zP?Z9t*fozw29!}4*iZA=JjXcFY9@7PLZESP%_ZP-Ag?BgFz5!F|P56o%oIjQX2vLs1-KK{FNz zF*L;n&XW=mhkP7L22}EzWvHR3kKzLZMS+&4?(i^9=1%Myz-$0b#tqslb5Yz{+_*%z z`PHMYubXkMH<;*U?(SVtXc{z$%M8ig8;&RXV$mWdfU4D_I5*Aa)yfWR2q^p56i&;T&u_uSwy2KY|@RWN#@4P8^GfZEZmjq zfl4sigLOo(ebGov=7U{b@$kKp!xbtl^KgjFQpl&!6_!e@eE&O$PNei|1j{;4@;)S~ 
zOW=|iS9r-W=0+C3V#cJtyf;bOEkv;#^CaKb2o`_2V7w!e`tp7yDeYU7s9tO(v+t$k zdB6jVY)gH450g~Bhk^)CWTP-?VCbEY)R*@)NpBLuvi(v{((NdxcTAF(_c}=jg+8^P zWX%01|Hv{DmG?hM#|T7{^-F!(|0v2xznO%Pm9|A@qv+wc5y-~(_ zn{Px^S7AzkDb$zuPDv*Vg^`!<9VO-ObAZeG<^5Dr*s?dyb?-@&&0d zAnTR(vL=1`-c(ZaHQL<&M}@u|e|g@PbXZm-s9FCXP5ScwEU8Bt7W%T^((Koep;uRO ze#m>a{CgF1f2F>p--a&T5v9JokK4o*Q_4YvC&^2C1SpN2)R*@V`S(DRG&o4I-%?Mu z>k}v=5vecl_fsoLi!mqvp8JyelKucih5EDa0fZvW@4_VYB&9%%LVbC^8gd$1Qcl`1 z^FKoYVY2=5UNz!0G|ci;S7Dk0Ls)$wh`iKaZNjC#q|+#u`9jgJIt|Wj*DQNs0sR+* z{znWmv)^WY;~qO(Je#@kY9qU87BMnr{pA467)gFy=*#=KG-H;fzmp-%+|M&Yp91p= zlVl}bjf~6}%DuYE;BF}dkvGe(TR`7^jiKoLEBa*5^oVnfcA@VO6Nv24UL@yJHJ4)e mi=4T+Cm>EbuQRx;oS34}-2P?2O(I +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int 
midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +int assemble(taco_tensor_t *A2531, taco_tensor_t *cage3, taco_tensor_t *A1392, taco_tensor_t *A1451) { + int* restrict A25312_pos = 
(int*)(A2531->indices[1][0]); + int* restrict A25312_crd = (int*)(A2531->indices[1][1]); + double* restrict A2531_vals = (double*)(A2531->vals); + int* restrict cage32_pos = (int*)(cage3->indices[1][0]); + int* restrict cage32_crd = (int*)(cage3->indices[1][1]); + int A13921_dimension = (int)(A1392->dimensions[0]); + + A25312_pos = (int32_t*)malloc(sizeof(int32_t) * 6); + A25312_pos[0] = 0; + for (int32_t pA25312 = 1; pA25312 < 6; pA25312++) { + A25312_pos[pA25312] = 0; + } + int32_t A25312_crd_size = 1048576; + A25312_crd = (int32_t*)malloc(sizeof(int32_t) * A25312_crd_size); + int32_t i1468A2531 = 0; + + for (int32_t i1467 = 0; i1467 < A13921_dimension; i1467++) { + int32_t pA25312_begin = i1468A2531; + + for (int32_t i1468cage3 = cage32_pos[i1467]; i1468cage3 < cage32_pos[(i1467 + 1)]; i1468cage3++) { + int32_t i1468 = cage32_crd[i1468cage3]; + if (A25312_crd_size <= i1468A2531) { + A25312_crd = (int32_t*)realloc(A25312_crd, sizeof(int32_t) * (A25312_crd_size * 2)); + A25312_crd_size *= 2; + } + A25312_crd[i1468A2531] = i1468; + i1468A2531++; + } + + A25312_pos[i1467 + 1] = i1468A2531 - pA25312_begin; + } + + int32_t csA25312 = 0; + for (int32_t pA253120 = 1; pA253120 < 6; pA253120++) { + csA25312 += A25312_pos[pA253120]; + A25312_pos[pA253120] = csA25312; + } + + A2531_vals = (double*)malloc(sizeof(double) * i1468A2531); + + A2531->indices[1][0] = (uint8_t*)(A25312_pos); + A2531->indices[1][1] = (uint8_t*)(A25312_crd); + A2531->vals = (uint8_t*)A2531_vals; + return 0; +} + +int compute(taco_tensor_t *A, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D) { + + int A1_dimension = (int)(A->dimensions[0]); + double* restrict A_vals = (double*)(A->vals); + int B1_dimension = (int)(B->dimensions[0]); + int* restrict B2_pos = (int*)(B->indices[1][0]); + int* restrict B2_crd = (int*)(B->indices[1][1]); + double* restrict B_vals = (double*)(B->vals); + int C1_dimension = (int)(C->dimensions[0]); + int C2_dimension = (int)(C->dimensions[1]); + double* restrict C_vals 
= (double*)(C->vals); + int D1_dimension = (int)(D->dimensions[0]); + int D2_dimension = (int)(D->dimensions[1]); + double* restrict D_vals = (double*)(D->vals); + + int32_t jA = 0; + + #pragma omp parallel for schedule(runtime) + for (int32_t i0 = 0; i0 < ((C1_dimension + 15) / 16); i0++) { + for (int32_t i1 = 0; i1 < 16; i1++) { + int32_t i = i0 * 16 + i1; + if (i >= C1_dimension) + continue; + + for (int32_t jB = B2_pos[i]; jB < B2_pos[(i + 1)]; jB++) { + int32_t j = B2_crd[jB]; + double tkA_val = 0.0; + for (int32_t k = 0; k < D2_dimension; k++) { + int32_t kC = i * C2_dimension + k; + int32_t kD = j * D2_dimension + k; + tkA_val += (B_vals[jB] * C_vals[kC]) * D_vals[kD]; + } + A_vals[jB] = tkA_val; + // jA++; + } + } + } + return 0; + +} +#include "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/sddmm_ryan.h" +int _shim_assemble(void** parameterPack) { + return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3])); +} +int _shim_compute(void** parameterPack) { + return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3])); +} diff --git a/test/kernels/sddmm_spmm/sddmm_ryan.h b/test/kernels/sddmm_spmm/sddmm_ryan.h new file mode 100644 index 000000000..f0f9e372a --- /dev/null +++ b/test/kernels/sddmm_spmm/sddmm_ryan.h @@ -0,0 +1,125 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? 
(_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = 
(taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +#ifndef TACO_GENERATED_assemble +#define TACO_GENERATED_assemble +int assemble(taco_tensor_t *A2531, taco_tensor_t *cage3, taco_tensor_t *A1392, taco_tensor_t *A1451); +#endif + +#ifndef TACO_GENERATED_compute +#define TACO_GENERATED_compute +int compute(taco_tensor_t *A, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D); +#endif diff --git a/test/kernels/sddmm_spmm/sddmm_ryan.so b/test/kernels/sddmm_spmm/sddmm_ryan.so new file mode 100755 index 0000000000000000000000000000000000000000..c3deae084b90146089e3f8a64d8e5d1ad2488095 GIT binary patch literal 14352 zcmeHOeQ;aVmA}tYViBu)N(hMqB@yA#+660i3BhiXV%d`Olu^J12SN)~WLdI3wIx@Q z!J(NXtP+;DLXhp!h5hJmx-*?&ce`z-OBc#?7dr{T0lH%-v`p!4-9kZbpoCCpHqdB) z=e~R8r)RtD>>vBbPHysa?)lww&i(#)_Z{yIMY?Jfg~{MxpJl`yy1+!*MZ&>#A_CIR zT3IcA*RspF?4sGEIWg}F6OxoM9|tIp#dywK%2`k!k4QRJW8(V+m3qXIo_DpG?p^KU z5lN>+RU~zpV$2Cj&!kAVQ%%T8O0pEXf0gT__`wS~JBQ{>m`N)2J_kK2=l?W#h2V`M zzlGYxCE7z$Q5{~l<*?zvv*r5jY4xGJ048@==;TYJl%DFcjDJSdfPMh?AKb_CjTS})dj;M6nNP@sUHXCn+M+o 
zY(DvKE`T3g0H=2H;jxDq0Q1>jvOqaw3&^(u_u;XJWeeE35rS*&!r*%#?`0RU5udLr z^%LBVn~jTs@+^1?{ak(p8?(uuLfSIsW;=!aP7+4=7d(WIcsR@C+&wMuX4S;66!yO$ z?Ay!Z15S3j#k`=gN8uWQ9~6!Hs0pJ^B!2-LmlIOp>xBG>%?>Ra%P>;+3tWh^CphkA z`$YL~5(&=>d|KeTRES3VhtsKOHW$m}qEQwdObzB(Gzr3@;ht^LzCKUn=ukR663vXJa)ZMOGpl*`0v=GETBXPMux@>s2kRR~*xSEug{PY?V zr@cWeX@dTd*r`DFC!Gg=Zi_)4cxR_F0 zrPYGVwSyAcEjZ;ZgKoiPTToiJ1s6vRuCl{|UnGry?zG@_7F^H2>Cy{}E?wzk`hoFW zO?gVsA9Ee8NR+Qxi=^@u8h+I^?T8USK>b9=ll}lsmW#z zbH0XnYO0w-oIkq)JT=A4LC&8hp1N>mALoBfJT=M82h;#wmTGbe*mo*PlO^tB1jf zi%f%VGAN5+x3_QeD%fG8>-7BQhoAtHD34n&Y(5LFbSbJ>((!YA(L3N3Qb)xYd-o_z z22JtXXOfUG`M@oKo1dWG32YDaY`;mr`_0F}=!F~VgN0~opI&fY(LjK)UwZ_!WxTMZ zxe#3+Del(}aXg}RL)sfD^fyEbdws=`)kaX;lpj}hBckmu-<=>=7fzu!_O;hlQ%KJ8(ia-X&jw6x}2xx9Z|-G2nbC0Kxor26(-6R$d9DHkzb z4HP@eA>%)gpH{ffEKXrRw-7ddw#_&Tn-FMd#f* zU_DsKX+tfqsB2gOLdwf+sDgU`t7sP;fF6mvA1(wRb_Uue)Z!0;=1WS@_>GQovPb9MbY2Z?dEog{%lpWo`C#b4TZPE?n0o(76eFO1 z=ZE>-n&vpIdO!ZIde0lcLI++^_x~F&d5_kh$0tq)RPXX*kXUkBz4ci<&+Cq$R-aTG zUPW`BXn7&2E`M!XanrGuanwHvX=iQzh>}!qb=IP0-ay+F;>VKej>$WJkyO(Yg&xh< z)}t*?s@a#(BReKft2>@i_x}}o4ApK96*ny|E?2h2%X<8U)15GP7Dc$#`C_qK3ADY0 z0vv_HDdj7bgZ_H^f^}u&AxO5Dvc0GSo2|99GtDgVBs~$M}`*Jyt5u;#)OA0UxiY)ult4Yn$;CjLnkc0~)(%c0lTZ+aO-@ z0n#x-X(ILMSDQ0yv|YdZ7|ppa+#a|saC_j3fhguIL-)Ld&YS5*Dfx}xnvb;uUvfe;OzQiOzH6-x49qMz4l~~>;#MM#UW!dFi~0)QYHh> zp9#mOHo70$vodHLH;y!(&YxBGzQPv^(_YXR-~EP#H%{@=RhCE$+xY1=_hWl6JK09| zi|f4@%49yEoZ{wBZFE1jryh2UA2%M)pHue!4CX5OLF35o*NrI{HS?qOC}5OnHF~lg zt%4=V_+i-iG4`QW%!I2Vm<8U_>;F|QhvPp98m+og zq|xarJ!S%78de!^) zA~jIlQVwCIDmo81d2X0QCoj*RQARI(l1|^H&k3`>t%mciaQnH>_vrfagiIxa&N5R>8V7$`+Hu}9MU1X(>TfsU2D|;$dMxVg8i^{%SHA;dg z6&CG6t!{j9V0_Qzp_bQL-ZIy9z0lc^e@fX}*yIai!!GrmDVN)>+*{HMUuh`a0tOLR z>3TC#UkaHKUuhFXjGt3BGtG$e{xG+O{F}O7*wR3YQ|puUq6r%GLKmKPmtHUlZ?{5+ zt59ERhdxH>_|xswKJ4lCQYu(1+fFKgpDKW{oAo_g^OLzNO9O1 zfS{K>9n|>oi=@3+?<2-Q&9gaV* z8}I0ir}g4h-=hNP=we@>@r3bg&^Qq(YJa}Xc>UzXVqVc&U|`+MKjkbgUlnLuxu+@E zb~04Z{JUQZDR1e<(}r6M$KTT@Uau*oe~0nn)=oY%o4@hXD|<9IHU@0TyEQL$ps`Y? 
z!LSj7f!YpB%q^J5dVDv^W*pHc-f++|)EqIEYlTjyqtk^U>*#bFoo+{`SLs}Cbb3qg zy-z)s)3C=b*Yv{8Xl`1tjEf`2jcA3gW(*I#WxQ&1?4!!IyfBkRZY|}Lf64cM%%cIz zu8#exWt_V3U3Jeb&3fUEW<)v=p$=_DTnFty4!g0t6*vDoDjY0eFV>yviXPhV0!YJh zsuy#p*a{5tLbnrWbIXY!w%K419n`Ki_Gn(}Di>y`-tvOF0#UUwqWRQDjDgY)UOAi= zuqe9uabvU=rOF>0I9ht{J&YmCdPXM`y>N?HFWiaTubcV7N2I@rbbf}#B%o8Pa*1PO z4wUX;x{<~Yg2YZt3C{)Q=scXyJ`IfJp_6i=+uYh@*|+lSlnL( zWoT#(`s1MOpwpl|pcMGf2UYI6ktt(dmYPAV01V0_;1VQ!=TLL?ef%bRqO6_ z?PJ$nyzx_4YikK6eN1=Ojxy6aAwvX_+wioa?2~kg?(ldYsOj+3-|yJs@fDpNo`(At z1w767EDm^D^R-(%rxZudvBc8~EZ}MA@c1ALoeqyHNauc8g}vt7pWaD4y2foRYvs10HH~+B0_|PW8I4 zCg7>R#}V-O^3IwbPkn$Fn=xvGosjK;EWfvz%@_5X(*{A$KE?4j3fiD^PR=^D0j+I^ zVedus@1uybpl77!AC`DrfjZtk&mnCLeJ=Zy+F}~mejD~%V29o5N2y zj+{LBe4Ge1cFII6=U#bdy;9&)0+6^@L}WR~g`B**mighngO~rXC@7BDoUi07_?$?` zk(H;5>Itj=cSL?Sx99l2sOVx*!4-mjLeLF@b_sg3ph-dR5cD2F<@b**ot>ZYHEi$2 zx3W>+wf<}T%}rO2a;{}hORK+meWQu8t571d(ae&_fx6IP_4AN78_~swf_p!J@7k+H zMyczSo9yMk)k{}owesEqAXRe@LX8u@Lb~AzH+JJ#W1PyD-O?R>NeJW za@A60S#5j$LmjIWl+Ll9nRxiZ8i+YHR#*2~Xu?VyPYr)fQj`xoA7csv8OMLWf1UY1 zn~ATZ?{4w1!IsSIz#WzVs+Q@HMqI1IUf05}W%IsYGTl4wgPHZXRph(6nK_uc=@D zzv0uqCix_LM+b-c24Ti5ny)FDN;mP3mQ5p>bS@Fkr859W2IFjaES}AgufH z_Sd1(P1iPshP#8!-GbpmL?ok#(GBZSgw*I*Q-5kyv=hrFa`CfB=aeD3#pN_Vg&Y+5da z76k{ZV+TcRT42b#EB-{)_}uUNlNrc-+A*Wb*s0HPJ!4gl|6iiNz+KV<&s3Y)?=D)L8mgF3)6nKKTYAKW-Z*pAtBRR#n(6 zaCttFi8mplu-SH&)2x@{bNk&8cI3H7+M%0yWuEc#DDe5pPj__l;ZwkQJBaf0yDnf0 zwZkuwrZAPcS0G{u3*}29^X1Dovw7wVT`w#1;43*kcid_mpF1y_c*D+}kL|*a-7hG@ z@wxM*m*aEo+_3;&04`jJEUm-X*Ma-05R>`-0`foQIQ*T&z(mP-C;^|(&+2V1KX+Yv zZvp$y;fz9lP~%?HF2CPHbScM8%=+65+-C(u{5oOB?w56!!0monoeS6}e={%U#h9-; zms=K)9|BJD_I@P)%lZ7riVN8Jj*#DH^UFFeaJ!$_G;dEgv-^p?B;==Ueq3(>udb2V zaJ4w#&o@ri0ADNG)=D;V`FZxW>H|sjkpgMi+-Ne1gV(IPSTr{rjg!Mz7M@Uj>1h8@ zx;Hiy?ZZhZ8;y;QG4i4sO5_rK{%be1wazcZ9nqq(OeVGqE>yY9F8FH2;6v6oIy}4! 
zDi)4=u2qSqlgXiUtWOp#+t)Wd9L?;CrBF`c!sU;m5Yc1`zF5&{*NuT~p=jv(Ao;;X zg?99g7|KiTTeDDdG!>UvUp5^bh^6|-CoXvF^?_~SPDtX284c;eq8_{v5Zvda?JbcT zIs%dC4P9L~g?gerfsRPXmLnY0xVu@U6qKa^$y=?xUAm5u!`L5k)SK&nM}BtDC%-)M zK{Gwvq)VIhT{9g3ZGLyNxx!(s;@3CpsAsiW?N&$*d!`58tPLu;aJs7ot&Vf^`s>Z+ zUFB0~$$kOnK$|z;94X-|$lG&X=SH5hbPg2xkw;&&%IVQo9qv-e{MlW@=vB~M#-sx> zmcko?9|v|nKjt?L_A$;4#Igg--?u9TO_S#E;y`kDBr@5-bgGJpA}x~`ijjhdjSS_O zpXcsJ)Zd>5nV z!H4bG4};Km`bK)Jl5NrN!%ehysji<91S9)Bo+vFnckqc)NE zWaC0WQhC1x5gsTnYsiAJ>Q4)ONe^3v5f{gv3g!1Fz-9jOeNa+aUTH@1lKwFgtoh3~ za7kaYy*J4GrM;p}U*1bfYE$QJEp>_hnb4R0e^8toCCyod5x2MBH*EUyeYHZRVPQws zTblhxMCj-v$A^5cm46RnudmdX^fBntJDJp%_pC1QN+;zY!h_@`JqnciPU_3|Wcl|t zQ`9*Kvfffpmg|p@Mj}#QzE4lDAuYyy{CD<8>Pz|@lC1ib_imv$FssUAQcqHHl(Oo} z_wxavFXf~eJO2t22$SWP=i-qC^eHcEmK>NnbOsW0ga(q-H#`WHTvv)i@H zo>@TugwX$@X{PdP*LMQ2VTma`uSRuyUq$CZkHvW4m(CFt1mYdeI}{=+GUBiO>r*UF7zkGcLX2MHav3t n$@QPsCpq>ItpwMVW;9?+wC7KMyJZvMiecK=5jt!NHkSQ2tzdX7 literal 0 HcmV?d00001 diff --git a/test/kernels/sddmm_spmm/taco_original.c b/test/kernels/sddmm_spmm/taco_original.c new file mode 100644 index 000000000..4f084ff5e --- /dev/null +++ b/test/kernels/sddmm_spmm/taco_original.c @@ -0,0 +1,166 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? 
(_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = 
(taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +int assemble(taco_tensor_t *A1463, taco_tensor_t *cage3, taco_tensor_t *A1392, taco_tensor_t *A1451, taco_tensor_t *A1455) { + int A14632_dimension = (int)(A1463->dimensions[1]); + double* restrict A1463_vals = (double*)(A1463->vals); + + A1463_vals = (double*)malloc(sizeof(double) * (5 * A14632_dimension)); + + A1463->vals = (uint8_t*)A1463_vals; + return 0; +} + +int compute(taco_tensor_t *A1463, taco_tensor_t *cage3, taco_tensor_t *A1392, taco_tensor_t *A1451, taco_tensor_t *A1455) { + int A14632_dimension = (int)(A1463->dimensions[1]); + double* restrict A1463_vals = (double*)(A1463->vals); + int* restrict cage32_pos = (int*)(cage3->indices[1][0]); + int* restrict cage32_crd = (int*)(cage3->indices[1][1]); + double* restrict cage3_vals = (double*)(cage3->vals); + int A13921_dimension = (int)(A1392->dimensions[0]); + int A13922_dimension = (int)(A1392->dimensions[1]); + double* restrict A1392_vals = (double*)(A1392->vals); + int A14512_dimension = (int)(A1451->dimensions[1]); + double* restrict 
A1451_vals = (double*)(A1451->vals); + int A14552_dimension = (int)(A1455->dimensions[1]); + double* restrict A1455_vals = (double*)(A1455->vals); + + #pragma omp parallel for schedule(runtime) + for (int32_t i1467 = 0; i1467 < A13921_dimension; i1467++) { + for (int32_t i1470 = 0; i1470 < A14552_dimension; i1470++) { + int32_t i1470A1463 = i1467 * A14632_dimension + i1470; + double ti1468A1463_val = 0.0; + for (int32_t i1468cage3 = cage32_pos[i1467]; i1468cage3 < cage32_pos[(i1467 + 1)]; i1468cage3++) { + int32_t i1468 = cage32_crd[i1468cage3]; + int32_t i1470A1455 = i1468 * A14552_dimension + i1470; + for (int32_t i1469 = 0; i1469 < A14512_dimension; i1469++) { + int32_t i1469A1392 = i1467 * A13922_dimension + i1469; + int32_t i1469A1451 = i1468 * A14512_dimension + i1469; + ti1468A1463_val += ((cage3_vals[i1468cage3] * A1392_vals[i1469A1392]) * A1451_vals[i1469A1451]) * A1455_vals[i1470A1455]; + } + } + A1463_vals[i1470A1463] = ti1468A1463_val; + } + } + return 0; +} +#include "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/taco_original.h" +int _shim_assemble(void** parameterPack) { + return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3]), (taco_tensor_t*)(parameterPack[4])); +} +int _shim_compute(void** parameterPack) { + return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3]), (taco_tensor_t*)(parameterPack[4])); +} diff --git a/test/kernels/sddmm_spmm/taco_original.h b/test/kernels/sddmm_spmm/taco_original.h new file mode 100644 index 000000000..71ce53402 --- /dev/null +++ b/test/kernels/sddmm_spmm/taco_original.h @@ -0,0 +1,125 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? 
(_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + 
taco_mode_t* mode_types) { + taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +#ifndef TACO_GENERATED_assemble +#define TACO_GENERATED_assemble +int assemble(taco_tensor_t *A1463, taco_tensor_t *cage3, taco_tensor_t *A1392, taco_tensor_t *A1451, taco_tensor_t *A1455); +#endif + +#ifndef TACO_GENERATED_compute +#define TACO_GENERATED_compute +int compute(taco_tensor_t *A1463, taco_tensor_t *cage3, taco_tensor_t *A1392, taco_tensor_t *A1451, taco_tensor_t *A1455); +#endif diff --git a/test/kernels/sddmm_spmm/taco_original.so b/test/kernels/sddmm_spmm/taco_original.so new file mode 100755 index 0000000000000000000000000000000000000000..f50931baa4193cd10932136d3af5a561d44047c9 GIT binary patch literal 14304 zcmeHO4RBOdmcB3DkU(O0zz9K6T5M;6lBOd869G*p9r9u|!a#7)0Xv=kNm7$^XSzEO zGzH8CTzi{#m|dr|YMfejYj(%lof=t}(OK6jgF!@BtD=q!yE;4U_}5u+MJ!!*aOnNc zefM;_Uy^aFwrZ=kiVLsLJ>U7cKkvSK-@Wf-Z>w*8iKYoo7IC{EZqpnCX;OmD6)FSL zBpO61ewT{trS7ar(Ue?6l>spo!XW`FV>Ye}*Gd-DAu~(|N(_8RQMMzN>?*G_###Bo8vAiudp_256yCDB%``4z}4LP*4ZOJI`YJ`@BZoVU`1E|{>Ve+KmE?w;nUyQ 
zw)pTD4@8*#y;l$1tbHqM=@D%=arGLcahsN1_afLXkaHTGum3W2Nr0Ty#30_zD z5ryk);P-d;$D;m3G7wMt{lecH?M(`Q1Vs4TJ2v=3;dr>aH<1j-J2tfR#iHSkKv!Sb zkWDY~2ZsXWAkf!)e^|6_+_1^t7mE$}|(*3M6*qa{GA1= z{lR}FVwD_&Lbzcye#6%=mCh0&g^kw`ZxQ|ADQWQ6tYpMj_4ZytGyG144PM=g6|6+; zQ~JEF@UU!}OIAo~{a~Eey)qLnlzykfATOOn1vsxsBy$(w=&*chD8PBHAYoGhPGxh_3vliWl5Hx$ z)zU9bwiV#q^(4?o&X(!vSvwXwgnnQ&S&|#qM~;_`NgMjft?Gg(cVh^@_9aco5#K|i zi8DE*8}B7vBYs@+n~A4eFfl6mJBg>RnK&Z(X5y(!CJsyfHsYzt6Ne;!3-NTTCiY4G zCgQ2-69bZ8N<1}rqEqruEpUj)A@DNUf41HE<#&C~FPruWVgFf&^UJY_%f4j4AzUK# z%(_ouuvO2@ngG=sFX*ZFlZ(&=n~6miT(Ki}X3}KG7`1wmhyCA?VrJFfNH@za!CpUh z$)cyu>Bq8n=-LVW^-IZWIGm>(R**xkg4y4`bpfb}!4>+*x(}~HZY$NNLQk)22A;hZ zP0Z@}IWemdyhi3|m=K?h!KK$w-!Tz^hQWKj>ACw^x*MLYo{p_s^zWWM4Mq>{TATJa zg!Hua#v0;N`<*92>qhl-o711Re%z>8dcUk0YV#PR>Fv%cNP7;vZGZU37;5Q9 zj7p}DO84G$$k`XMfAX>Y$#Jj!$us(6dhiWBQ-4ZNC7rqsEm@=% z%5T%t0CW^E8lc7lKU#2Vd;IWj?xhd7*?C@o(Mo4Ku&nzBm4<2vNtUb}5 zx~QjyolWh>KC*a5-qpOR-*}h(s|UkEyMS^VMn1q$8Fto?t|wz%>3Ka1*oi{dW}0)$ zPI48>bc{{mElq?PTkLkL?fN2C1Iz za8~P&dF)TVHZtsVSkBukt6n&7zo+Um`-8_IZ$0ofnvq;cdQA{|Ulfp*;DXk+)4Ij$ ztU+17Men^__eR8C^`0*?{36B)kKc&LUTG~Ic|nWV@3EGm&Xu0VapaFj?AuQ6`nQNZ zb_}i?8atfMh&^$-j9_^FXZ!viQa_@xt(mpfOqI4Fn8RH;-vW1+P`vjok=dkq8sFlo z?1@6LN_z!LP%YLgDb!nU=#7k8pz3LS(WIUCcb$fwI->_))*qvWzb4Lm(1a6^kE4SB zqK}-=^fgImnf<_Fj7}N{Zat0ha^5~~@d zrebz$>ZIpbc1f#t()0R-_Tc!MibwY>^rlXvURZX1*=U`wraYr(kdD zh2alU<8W$}NAr~@m803}`6e_KQ&#G=_S7k?OAUBDmiy4(mDyAOmCLmUf8|Xz=&3PZ zX88}Gi8%>l-Iw|#`wK`x>DFMrX(`M8!~oh;$H7)-9|J2NjP}%v033Q|?OWs^yB7l7 z;^nOeFg0Pefv_f(sI))47gCynS}|v3tOu;JG|Ye>-7s=N8@%%QO0xDSw~jkg*8AH> zUb6Zchn>~Q^(|@Z?K&RXvgVYvu{rHsCt6b0wKSkAo72{Y1EVb|V1qw@zKpzFyg-#p zI?2nPyqCdeu?(ZJrweYqoEKHKm zrgh}k*XZe;c+8s0eCbMGdK*-macy>|t5E^E_ii0?ZcW|GbhB}PlFsV7Hvl)KwmaAA zsSkXhmEKgysi*p!ULTY&0@gaQN`HnbdtuvgipmQ=3Ci-!ahu1s<={$+uAf^d5zdr#C0<8q1miP!!=S=ex!zWNr@eev**0C-M*?VyvfgX)ZPCjI|<9Y)1K1*+L&9=(#l{DL`AGWmF92sk~t>)k?kIntyY>%yB zq_oX;sKoNTWv;COn8#MrY;!;tHqEv&54}&*lk-zt`o&z1-*X8Y#pMOY`%J=qiR%zB 
z9h4H$EUs69$$j%=U0U;XS)Z@Vowfmsrr8?2Q)&VqHMr{0KS$8toyb$W4wiUq)el-c zHphsyq{CM2k=4e#Kg6BTeGj^_vmh8L>9myrcjFp{>;hzR4{q|~we8bN{(i2_(Zc>b z<AnNJ=JQ{5MDY1K z?F}f=yDX*8FXduH&#ChG9sMNjfsyzOeH->1l=v+9pwj0vbAGR-ErU!Bha}2O(6I(3 z-na1C_Cf;|Pbh%#tSXr6IjZ#d?4HZR_DI(MA*GM)k>t7DbBga!a_r%yTs2R5{l6pp z-P~N_)2g8-6g{Qr2a0~7=xo&i3l#mDqN^3h{z%xZL&23{)&fO^nVmO5g!H)8^w|=o_6^CWk=9egIFIn?p{|;YZtI_NP65H*m`y z65dBynXJ~*OO(;(X*b%gEVtOMv29&APpic6C~;{P+im4*%D=v7o;JUqqLJS2 z6}>APR>L-)j06(N{y?(F6*L~lIVIc(%bP;8%+>CFr3ByBG<^4WwA9rZ9qFj;$5+no z+E8o|AO30^+yIh;(eS$Na5Nn64T7bQjq8RQR@bhschq+C|BbgeYWa$E4fghhdf~>X znxi%njn&GJn6(4(STY<;#^L}6dV`{WD42+guV~MT;^wT7{#T&UwYS!`_HXjKHz`IA z5hX?y{j2Lyh3Mc=ZFh7~^^-`1llY99NcyRB#9%bh+Z_#u9KF$`*s6x#4n^mP?t& ziv8GyK230i1*a3d&J^p<%daoR_kRf4?mhT4;4p>qaqtmEd`;7+)|Y zo`d8T!K+H4HsQQ#m~mc3%{Z@~W}H`9GtR548Nap|P@F|vHwCA8$|^q1nFV$LC3qX) z`^(cad7e(cM(Gcj#=}htKVrhyDLlWxgQv6wmxkyhj#HufQR>2_iK+KJ1)S>H#4ie@ zj6^5++BDMYHl_j?5Ua%&^#Qe#Ae~<#j5{QHf#r)lUv93U$KZG})gg-DiBq)liFPsdA zT(_=nY?xk22B7%^@pxc2La37Q-H6c&Aa*P?*x$b!CIy^~SSwPEMIwE%K!~d*W4&VW z-fjv>LseB2m&=cO_#;t-WBL8-H+wd;`djbvQV5)1Iq>fcpvn}%H3>xqqd~?(iI~49 z5Dif{ocEr)JR90upozChf2*!M>fX(O5IM)b+k6|FJwE@&_3O8^cKACy&AwJsiHJ>; z5oh^YP>~9xK((eOjvk|!u|E`}H#PK*LhN8qA$b&rW(2u$R2zq{8L@w+kh{rJ5wn&L z>6?tvEA(0%QAn|RM)2LF4_WhqFu7@?@+ONZiX1E`X@(F$Q|#UpEfo_Iz$yrnYLV2Nju^XF3Ly z?j76ndp`gD%sAZ~ByKm`alQTsGE!lC-Y1_~LRLaJ)dpw~-+ou0QWvdK`v^*`CTOOlQCl<}L)0XZvL)ob8!T zK+gF>)$cnD&g|E$dtnCqW6J(M!%hCzY;Of%#z=X|^@hp%yrq#b>&~CSKCJA!zrvpW zo`rmx+xe=pFDGF^VwUMLWH?`-CVsWl;O-~{kvHp7jExy1lLa>zh7N<|f6cnY^W2eF zLZ(64AECt=$$_g87mp{N|K|YXv4?CvW$EApX@yB>E}ypdCKYM7kQR0%{w`y1Q``FA D$wJ#M literal 0 HcmV?d00001 diff --git a/test/kernels/spmm_gemm/gemm_default.c b/test/kernels/spmm_gemm/gemm_default.c new file mode 100644 index 000000000..605cc491f --- /dev/null +++ b/test/kernels/spmm_gemm/gemm_default.c @@ -0,0 +1,160 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include 
+#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + 
int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +int assemble(taco_tensor_t *A2039, taco_tensor_t *A2035, taco_tensor_t *A1450) { + int A20391_dimension = (int)(A2039->dimensions[0]); + int A20392_dimension = (int)(A2039->dimensions[1]); + double* restrict A2039_vals = (double*)(A2039->vals); + + A2039_vals = (double*)malloc(sizeof(double) * (A20391_dimension * A20392_dimension)); + + A2039->vals = (uint8_t*)A2039_vals; + return 0; +} + +int compute(taco_tensor_t *A2039, taco_tensor_t *A2035, taco_tensor_t *A1450) { + int A20391_dimension = (int)(A2039->dimensions[0]); + int A20392_dimension = (int)(A2039->dimensions[1]); + double* restrict A2039_vals = (double*)(A2039->vals); + int A20351_dimension = (int)(A2035->dimensions[0]); + int A20352_dimension = (int)(A2035->dimensions[1]); + double* restrict A2035_vals = (double*)(A2035->vals); + int A14501_dimension = (int)(A1450->dimensions[0]); + int A14502_dimension = (int)(A1450->dimensions[1]); + double* 
restrict A1450_vals = (double*)(A1450->vals); + + #pragma omp parallel for schedule(runtime) + for (int32_t i1517 = 0; i1517 < A20351_dimension; i1517++) { + for (int32_t i1520 = 0; i1520 < A14502_dimension; i1520++) { + int32_t i1520A2039 = i1517 * A20392_dimension + i1520; + double ti1519A2039_val = 0.0; + for (int32_t i1519 = 0; i1519 < A14501_dimension; i1519++) { + int32_t i1519A2035 = i1517 * A20352_dimension + i1519; + int32_t i1520A1450 = i1519 * A14502_dimension + i1520; + ti1519A2039_val += A2035_vals[i1519A2035] * A1450_vals[i1520A1450]; + } + A2039_vals[i1520A2039] = ti1519A2039_val; + } + } + return 0; +} +#include "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/spmm_gemm/gemm_default.h" +int _shim_assemble(void** parameterPack) { + return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2])); +} +int _shim_compute(void** parameterPack) { + return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2])); +} diff --git a/test/kernels/spmm_gemm/gemm_default.h b/test/kernels/spmm_gemm/gemm_default.h new file mode 100644 index 000000000..769514531 --- /dev/null +++ b/test/kernels/spmm_gemm/gemm_default.h @@ -0,0 +1,125 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? 
(_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = 
(taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +#ifndef TACO_GENERATED_assemble +#define TACO_GENERATED_assemble +int assemble(taco_tensor_t *A2039, taco_tensor_t *A2035, taco_tensor_t *A1450); +#endif + +#ifndef TACO_GENERATED_compute +#define TACO_GENERATED_compute +int compute(taco_tensor_t *A2039, taco_tensor_t *A2035, taco_tensor_t *A1450); +#endif diff --git a/test/kernels/spmm_gemm/gemm_default.so b/test/kernels/spmm_gemm/gemm_default.so new file mode 100755 index 0000000000000000000000000000000000000000..9de7a7933e926848e51c4a98410b02d55dc8517c GIT binary patch literal 14296 zcmeHOeQ;dWb-z!$veuT^UE5$}F!*5>A_H1WAdG}<&`K-&iCKeU%fTi#PpkcqG?8}2 zei)g!wyA8&c3I-I$-qo$aWm-*(=x$LX-McF$R8L(Nk$j~o{}b-mXB3%2jTGH0P*&B z-n(bD`y_=?YHNi-#zEtd(XS~zI)$0+S?IWU!f?1lTF+vi0hhfBCWDu>nfQ6 zX%)?)3coAFRfcTkbkU65Q;SWAsSv6GP#yDdox8$dK~*EebhyIAMiy2RmL$HB%K-UW1Fw zRN*?LDvtRL71cHNHs#?;RiQF!%Mx5|9ltzt_^%H?_56SS#qM-X@6i6}!_`mUckIyV z`?fAS^y#;wZQqN${-QU5PX2ims&l4FGWl{Y2ub(*$^qo|X87#IJ{4N}Hv9YkO*ell=xc z{$(|yUgC!&aL_bkjGaYdzSzeTPvT#b@~sv-G|m=@i$s^irMP(Bz)`Pk=Vn>(n#2!E 
zTw?`Y?;DE8^kgcSNa?!J2Vw&$p+`Z4-r2oT4@VM_zJX*alIY&pJ{XThx`Vxg5wmRe z5!ABwB4MjNeL7<&XZW z5H}lRP>2R-jbD8&meNWQme_b5@is98o{|E4ZL*B`iw6cSq7{Cp#L|}y;0abC_DOku zuJEvIT|in$^7+9yKldCKTuAvIRW;ODp81L8CQfTU^`29{1TTFaOqJmBQERB2E5WPT z2xzUWm)30AC8Ja=!O7d4>Pm2aCQ(^K2@Z!9Q*#Nt;AE8Cl4C@jl|Qfn%rmb*AY)$ zpBy&$6~t4QCwmP3$4gWZ<>gPfeze{5<%0pwm#zB*m~PLP$D&@>^8Jtpzg%d!bss@t zi#w_Ox}05?;Fq4-SBPkb#K|MJ$U9RFk0yTWr3{j4r^Ka zHFX4J_In-&Z5+#PPy^Y!9f90_4>>Wb*J`;(JP#x5&&KQg**;Gov)$v)_&t6tTh&+y zWV$^LAgKVL9xW5_Y}JtK&$pc~6!wp~_WvIDUJD%yWT6$vLRdR8VT-yxe4E6&d^x|T z;Ln6TyE`&x05o)lUeq%9@Aqh0ZrO{dSNL)r1?|YY_Klf;2Rx!>_Z#iWK5MkFdcIhKy&Ajyv;W{q4+an{*@;**wU2o%x4feii($wzIXn$50R-Q zVbzzjH;^Oh8eaz6WZajI18--3;d=B6v~P5`r=imI&i+-C@h(W6S|=m~e!wZvYlDfy_+ zwQc=W%Ue;`)|2=DChCeGfhU%=bbD%}uH5er8w`!Jkf|`$ zxB34diqd~Ir?hQ`^q%`>w`-p%xxthiJz8ltVxF(%b+aAQveeu?S3=OQyrN9e{eGN0 z3(tXUv!9}J@~z{nYhlevZS;tutqFMCt^->!Vn;EQ6b(-Af<&vPq&yD44P&XrkKy&5 zU;m*{pu!#+T3eyN-Q%DM;Lh8>pwW)ILI|7-bKMZmF9S{kmL^wNyUCnw=_c*L<1`Py zxx=^Jx5Ia@Pv?0|H+u43OkX*#&09bDbu=x1`13*`TXp@sV+W_lYi3*vz1dRp*zOe{ z_-SSG=W6D4WKQ~ykN&rsd;SArT)x`%+Zx+M?Y8gzHKZP<`Rejz5D$V zJ~M@ul&o)9#X@IWYM$GB)dwwPKiA~;Wv0lyPnj~zPpzqWZci=jWZqnPV)VSS_g64i zTw?v1qr2bBjKio|AFW5eOo3LT7h2%~EJ>N4cV^zibEg>-=muIv-1$#`Unq2j-tuRf zwal?V?uPF}5^EHOav<|z{xnKKX|7@Vwmb4KnLuaeIM~|!Pr+)q2eqBFkhH6q@4qAi z`6p3;yLUs!fr2qnQCOErx?PX%MX4{hq0rG-&~o+zcB3}Gr}l%ZN6#tg#V@!?J1EVn zcVz4jbdJ7c547y|)TY+AXYIGqn&D{6*jw7N{&k`~V_!=n%-xo?Hy;>l&j3sR^9v5L za=BpzlNwg`6s@G6$72?qJzX{;Fwbex&Q@ueFAj|Dxw@nAoyPw%*L5x1UN`!paz}Qp z+BtgMkx!i~6k4v|o7b}6uFLm;^dTi)^#& znGw4d!b1DLEx*%{YHw&9`zG#O+4FAnj8^j0z^c_aW|+wCZOCeNExR7uhn)@nEFKrc zwt?LQ)}3v&YninkJTC{x12kzg+;9ih$=b$KjW27N_i#V!{3!7F0jhqm%ks;Nh6g;& zm^?Mcqbgm68b^<7$MT0hqi){q*0T4bmfI(-{5X}<8Tsx)t79PXF%To zrFQ}EfGz{&J(u#pCZUYDmCI|Y90wuiK^=6)KKyB+a0pa5-RqsTcetwWckC0lU$*AD zn>;HBCVdP8QHS=?_~8^lW;-qw?H#AxRGZWNor*SR?W490PBmw5bJjgl>2o$bG|%U3 z9qjPvVsTRMtuSzm0+9nN8!qBxuVGg<;0b-0?~9|!#0gFN-?kqV!)_92_k zsgBw!x}CK?quJPlqqqmMYawem3k9PUJx&L3D%*{+E|eWaUih6|6%Q?NI(*eeUmr)= zag 
z2l)LrzwhSv+t_*rmZ@%s+HL+0-{OJzB~Z@*mfyzjkC^89|C&Los* z4^8Ql^MyFkDOn$%4NlS-8HwMaZ^c=M62D7+M9T9!bJ~wnqN9qDRMj9-O`JB-;(3YR zZC@(!rzF65URKQQ9FubV?w;$zG0kZI!%`l{GK1%OrzGDk%W*a}%4PSY)&D=j-_5Nx z{zP{4grsju`kthpN;*$EV2PyHO1fH7{ywsyz5Q0TZc8t|ou$=Vyf=Fr>TgOLT;ra` zW^Y5&N)r_~pe3fW%o=!r&UF~N2lA%#9L5l+*$?2y*6xteDf}p#t@e~Bz5(2}hYIhd ztdZ>Ys`Hdl7An^`7gyVympivyx=?XrcvN^5oAcJ{HPv5Rx=>kE)mr;l+foIsvo#45 z?_68~F}p`pSKkIrSP9{(FpkxV@`dvvp&*fQ{P%iSng5fC&?@>K7YeNk#beQdzEuM^ zHLr$lA{7lLQ$xX2zc*w~$N3f93ES&Ju`N*Ud8Gp1))ajASKAvKO-HKrL-@+sS09e2 z@!_w&xdA{b9gD2%i^L*{fe={w*tl+_d3F8jCbhng|KD|^TF+OsH$5;I9)KCMX=;5m z7OyuxV%85Q;;Bd|6;A*h9teq{kx()rzM?;?%Dc15u)hkOuD_+eW2npD&?OmTh{$3z zQD5DJCdASs^?k9l^pi+NQuvITOzGqtk&Yz?`eKo=IuJ{VEpqtnkaWKIZt8pX1+Dw8 zdXk}#EUG{eZbn&R|FTS3;a4&?+$N4MDHXP15b(NIj@tyUXXUtE@VZxyR|;M~%JF%E z*PU{_O7QwsjynXebLIHV@Ap)0!)kj-&iiutYSAjaS&r8fsmR!WJWv( zmD>bARZ67^=ck4h=clL@=clI?=clX{=clU`zoHzFoK0Lg1E+P$E!kd!Wt?0m@xvB;oy3a|c<_`q<5Ccv#BnOrK1#i~6fyI@XMj^XTX~~E z!btRnU%}fp-lrHKexAiOTl?vK?riue;D#ThedFCIu(|y34$2ha5E@R{TGS_7XsFIE|jojD(AAE?X{w{)k0&zvt5zcE{TM&`hC zz!y=cE$K<%ss$tVU*?eim4V~FOaJix^S8ifi@W;Bke|6OeKv>vpJ9(eaZ%Pdt;GgD zv!OQvS7*Y)C+(;*o{9IneG;FLILB#q%whjxqhB-Q!?r=*T{2&|2@z@E8aMX99QL1( z^455)e`4I?=l?MLRwE8c#~+jxAC>af_^p2fKAV3&Hsrb0)3gQ~`xVn{F;@qCw)iv4 zKB|1{fK5p_SEWnjTCfg1Znxl}hYFj8+h_W8w7B&@QNya7M&hnP@y39gGLV+_b*P&=A7d zqQUfF3N4jETwWc`(4#R#W9j<(O}>pCddHo9ih$FlzP>Yv7E=J%G!#w8LX3rzalJnn z3sW?lf7_kDjh*d~#AZ_O(4dyW865XjicAh&_7GW-E^%8St~~L zO^4`}S}hMKq)Bmps9pO`#BfGw%?0gxYyXb*AIk+!Sx4|{lXjG6@#Wp zQ#cKf+|EcMIS`MPF*?c;k-;D-$lUN?N_dUhy~uj|;$Tx0O6fK3q&LzpXG?z=%FLN6 z=Cm;-ISU-E1QzK{_dz8X>%&Bs>_99U=X|iYHxb#% z9L^vS&ch*`rHD+aD=iflr2l1xc69Xa0n49}_&mYXBXK6?5tun4yvX8HZ=Ch{+`%*~ zMX_8O$-1p?21@TzSf9@)OgTQE>cvuDe1F3`+E!p>oAvn|!<66GK*UI7qcmw?=-mnH z^LdA9lN9FmvmDbcD5rNZ%=5X3X^+&W_LGdY|KuMoBT+s-F&!fiiR)*5?tctrq~AtD zNPNy>dfJ2yee9Zw^|Ss6F!Bk%@7gCTU}`;4P+g@d38qy4kkl{IQeotIf5g;y9|D}~ z=kpy??k}5Rp6Q>VptOEIComN(C=GDEY%gckm-`ET&uBR=7`|p*qJJs%`Tkq~ej`;X 
zjJ(x5ljZl+o6W#|NJ?`0qNb{bhZo&q0^YSgg}^_WrsRjEFow|i7mk>yNodj2r~HZml`?dS7iziLWY^{KAXGy#UNhEfoD)?aDC zS)b`7$~j*u`d!uJtah!k=jPBqBK7Yz%@lvF`gQ&YAP&A%raex4ChP4#9J#&?)Fj;d8;hN^jI-cS#q_hsG6ksYn3IQ z*N&nPGR;!|u>3Hf0&Bs=+Gr{&HfV(2ZQ2Rjmfk1@HK Gef@9X*{se0 literal 0 HcmV?d00001 diff --git a/test/kernels/spmm_gemm/gemm_template.c b/test/kernels/spmm_gemm/gemm_template.c new file mode 100644 index 000000000..4a4e5faeb --- /dev/null +++ b/test/kernels/spmm_gemm/gemm_template.c @@ -0,0 +1,183 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + 
else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +int assemble(taco_tensor_t *A2039, taco_tensor_t *A2035, taco_tensor_t *A1450) { + int A20391_dimension = (int)(A2039->dimensions[0]); + int A20392_dimension = (int)(A2039->dimensions[1]); + double* restrict A2039_vals = (double*)(A2039->vals); + + A2039_vals = 
(double*)malloc(sizeof(double) * (A20391_dimension * A20392_dimension)); + + A2039->vals = (uint8_t*)A2039_vals; + return 0; +} + +int compute(taco_tensor_t *A, taco_tensor_t *B, taco_tensor_t *C) { + int A1_dimension = (int)(A->dimensions[0]); + int A2_dimension = (int)(A->dimensions[1]); + double* restrict A_vals = (double*)(A->vals); + int B1_dimension = (int)(B->dimensions[0]); + int B2_dimension = (int)(B->dimensions[1]); + double* restrict B_vals = (double*)(B->vals); + int C1_dimension = (int)(C->dimensions[0]); + int C2_dimension = (int)(C->dimensions[1]); + double* restrict C_vals = (double*)(C->vals); + + #pragma omp parallel for schedule(static) + for (int32_t pA = 0; pA < (A1_dimension * A2_dimension); pA++) { + A_vals[pA] = 0.0; + } + + #pragma omp parallel for schedule(runtime) + for (int32_t i0 = 0; i0 < ((B1_dimension + 15) / 16); i0++) { + for (int32_t j0 = 0; j0 < ((C1_dimension + 15) / 16); j0++) { + for (int32_t k0 = 0; k0 < ((C2_dimension + 15) / 16); k0++) { + for (int32_t i1 = 0; i1 < 16; i1++) { + int32_t i = i0 * 16 + i1; + if (i >= B1_dimension) + continue; + + for (int32_t j1 = 0; j1 < 16; j1++) { + int32_t j = j0 * 16 + j1; + int32_t jB = i * B2_dimension + j; + int32_t jA = i * A2_dimension + j; + if (j >= C1_dimension) + continue; + + double tk1A_val = 0.0; + for (int32_t k1 = 0; k1 < 16; k1++) { + int32_t k = k0 * 16 + k1; + int32_t kC = j * C2_dimension + k; + if (k >= C2_dimension) + continue; + + tk1A_val += B_vals[jB] * C_vals[kC]; + } + A_vals[jA] = A_vals[jA] + tk1A_val; + } + } + } + } + } + return 0; +} +#include "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/spmm_gemm/gemm_template.h" +int _shim_assemble(void** parameterPack) { + return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2])); +} +int _shim_compute(void** parameterPack) { + return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), 
(taco_tensor_t*)(parameterPack[2])); +} diff --git a/test/kernels/spmm_gemm/gemm_template.h b/test/kernels/spmm_gemm/gemm_template.h new file mode 100644 index 000000000..769514531 --- /dev/null +++ b/test/kernels/spmm_gemm/gemm_template.h @@ -0,0 +1,125 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; 
// always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +#ifndef TACO_GENERATED_assemble +#define TACO_GENERATED_assemble +int assemble(taco_tensor_t *A2039, taco_tensor_t *A2035, taco_tensor_t *A1450); +#endif + +#ifndef TACO_GENERATED_compute +#define TACO_GENERATED_compute +int compute(taco_tensor_t *A2039, taco_tensor_t *A2035, taco_tensor_t *A1450); +#endif diff --git a/test/kernels/spmm_gemm/gemm_template.so b/test/kernels/spmm_gemm/gemm_template.so new file mode 100755 index 
0000000000000000000000000000000000000000..2cfcd7ad3adda27ba1b465d8278ca0b29860a341 GIT binary patch literal 14512 zcmeHOdvH`$nm;$)Krn50gNRWPu6T8VkS5WHiGZe)4%}FcW*|7IVAJXDJa&@qOg|(z z2ysGOdYWcfSJ&NIx0c!3nR08Vl(O7$#=#1_fQ4tmB{e9=2 z)46@qJFbIv{eWmlVjO@*Q`nQZJvMqI~v2GT49b#)>G(##s! zEZi5f%en51Dbcjt*hK~;DPt-JsEnDoPF==XP?cvS-BV%U`vsMD#FCwRp&@rKRCz|y zW1=aNRvK!|4ocS%A-7UZ=t)Ytlseu`hAHp3kh9b14g+SAO1p2uj_UcRCbv*rFUl*| zFO{M{Bo)o!ZJP%lHe73O*~sp`aoeh^oB#d(J7V)c)($>%>!Pu@U!x*O?!rZGX5l)Z zDwVVADykgz7G=*2RUz30xGu)k()Rj?2mj&Vqfh_-fgLGF*TC-XJyk#X&Y=TuerMBy z11G=R-SS}M)t_fSc6L<#)6qY?`jKW6@tI^tTHAt=x*ScR=Sd2OFGY0PL5d{D`<()x6y?5Wh`D?&o6R=O7JwidrM|__e8}gr^BKCLDCIcrV8f#R z8-z43@Z$p4rBxu%GZ2dg63JjZ83?dIU$igD0^J}i(B8Q&5RSwnJ$;E}B;L8MwLcb( zbOyWnBZjQBBoG=3l7nD>-#rnw_SSVBf&N%*Fc42glYIjbqo@*iG%}Pl!I5Y<5RGjM zQgz5C^zA^X_uGN)U|&Cr4Gac)BFR8BH4sSl#v{RSqJSAZiP%77AQ4Hj(7+&reJs$0 zO2xNthy>%IUT-&=G#zeq%D5sL?stUDeN%^D2Gp%>|A zfkbcLKp>b%Lf9fz-xAeC#sT4F1>^ zSHZzd_JG7R;_d7a!5^z3-p2kOJS7EX3zrJl znybhPNvvlQm+Nq)1!qEkNL9I+)R+A6MFvi55cP{pITLPvwI)rtLn?!wGT}}WUMmC4BIrf{{`{Xl@t3p z-$gui)xxrkXnb^wtn~A5PG10;K7UHRkCYm{a1M$=~6Lp+_;zE^m z(=y7F>#@z+>HGZJ>E>Z9I=yck`z z9)75+&fk(hK4r4y5cSd&5BsB}n7!^fkd6A1v#{64&)W3#q&}XzNmmZ*FP}|T!{Hp^ z&_NDm6|DY-S8s+2K4P)$+LvZOc z)VEA@L&M;`-}2t}G(A1sN=(W9r?zAC71zF%lt&x09SZ!7|IY4ud@|jsb+{h)X`N_% zKa7S@atD<9U-W57+LMmt+z@R=O?H#_-Mos*B>2B29u}|~^kw|oRxnBLvt;C3>gOJO z^g!ULnO-B%RrLBme>3ups10Ea^vZzN(W?Twj9wAYno5Ie^h$tN=~Vz-K(7F3HO-cw z^XTmdb<->f$|q6KUa(YOsIKlIaGaf{ZSh6-t4^D z>0f!9UUThJAz0b3IRI64yFMD*d66&M{IMSTa$RWbM2pL9KXk(Dy4`n%lzclch493b z(7}cNGwg(G(>N5Gxxy#q6;^SDPs=Mba)r;zD>QJ0ljRlaxWeb<6_#;@Q{@$ExWX6Z z6*R8!WqAdaD|}U6VF6b-U0$J@E1W5>Fpn#oEwA9_3i&b$zNRxf=j)+Y^w23hU$_5t zJi^-@U(s{)DR1CKZ_my<39jOVYfgneGOj2&edG(bYjpffK0ku-rTA=}njb?-_t`dU zoh$K1xkmRI!d?X$o88`#7u`_k1>`x>eI{0o_!(X>_Zh+cLV<(XD}QjdWW@ zw`RKe+)bD^T%%P+F@^Stfey#?VnoD(#Q1B8%>UfcP#$ENAJBa>zLV=KH?qEEoxJacwau%9y+$t@i5+b z%-hc`IWh93vgEW+%C7%{yTX#~OFy^cz4S3SHOiy)&702C>h^3i+C%;KAMNQk 
z(1nedpqKeEv$=CW{yd*=5B=JgZq(C<{Mls>QPXuy^40$IN4Y(ag6dGM)YGk%xorl} zo_-!|b*>kzj#;w0J^caz6@B_TImq1#0Vd34ZKHWIRw1lOC)}=myCC&u*XG;m^Lo}k zYUib45PxvR$SEat!Lzi3%*_#Q)tl4yd)h}{wELTOXw}Iztr`1`Iu?$~mbAU8CF5Jo zTGRGbI)in6T>r5LHOc(WI+-Y{R=EipGExe@u%+5N0+M#7CGre{_vj!!;?aS2oGK1h+ zD(heIWmu-!p7v|i^+)~bu;xzpYYTjMMh<$&xx=WbKLdMu;@Q=|M)iEiJL-?tzXDf& z>}D_mlNt)~Ybs0{$+p1aT(S z6qAjHZABQpvk5c_+7J2~=rE{;3GhMCPHe`GgFXrBfc-n5HK4Cy$CxA@bPwn}_@t!V zvz{qKZsk(Ptjhi52N<1=9r+}mKSl~p_ZnyQ&916#mBZ|&OIBWet+tq8viIQ{M7?R8 z%H&68Gp=6L|1sL)v^d>AsAzFk@3XCSs#$xBvu5uMue0v{nO#;=rgJUCBi<6Zk42kFhV0(b^Gp4Efw-xHYrcnQzor5+-aW?v< z)gFFoaM71lKc3N6t*ewiJDZOYofPF2_6s5-&wMvwEuo=vs8MLZ$UYIde_?P=GI_;r!kwI9asF1{mL`K$gOz6oobXgv@cf9_ULLb{Z&dYKq z1^<$02tHT2oIY9pKa%|2+*<-my+N(n*oEK1QtI`d>pXR}*QPjEzq7v4Q`fM>K-n_X(C9?t5y*jCZZmWT^o>rX zEhPr-Zrq-roF!FE0#G(s{V9*!2Hdujgm+PvC%b*tS;{DLlq;PVRN0)DIyYW4M{#4= zRCpAd^MPA??O9)p5-?l5tSI+a9 zf<(^o-{VAVT;|kculik5Yav+%O^@NPqbasVogzZ(P z*v?b#c&P%vdnt$qP+RNkjgC}n2k@(9Pi;7s!cTLxjdcK$sc2+%Pb3e_b9PA6RfuT?$ z&c3ES%Zr<{ocmvnM%P|n+cwbQtLqR9A0k4GDh5_GpbF8{P;E~%CHjfsH%|O4nn(ty zb66^x=YsWIWd|9HvHZw=M@u-IdCSE^HDKAi^+Md7_ThMf5rIp z-{VPc!z?@~-uq(xD%LD|vlw?2;){xLtR`X|DaM_JxSL|!#rBlccNf;5V*NQxt~_`r1}Y>0ij?yjH9~JsyJOw!&&-)+Su8CRUuHa;7I+VO1@b z*_d2?t+-rut@vfdfZ%Lw;WV6vv7LRKw+dVUDzHnCZ;G6r$#W_F8lgXE87Ef@{GbJ2 zE%3rZ51!HnT#6~~M80NS9H0K&)4-{oavubir-yJ?uv?M)7Xh;;kttPwIyWkXzY3i9 zgV>Ppvln3R3Y>|!lOFJt%Jt9ZkSWZ?o{<|9u8p$?&{E}+xD-AE9irf5%WZjIs>|TZ zI6i$mH*tLW`_clOPR?YU9C4H=UPmbrpA_Wy^!I2R$EVjbQwBc(yi|PE6TnqogW}t- z%IJT@aR-}3l_<%xm@~jj#aG#}7b#V)%JJ#**i~ipFUAIy;;XE2S}kStZ!3eR%HR(G zS9yzyZ$ISv)8o2c6u336>qlk$e}nhW^mwwLL%&oU*bBnHHO}m5HYlCEn`>;HCPflj|4v8FBr<7f*-T9 z-PpiQg_Q-81A!0)mL(7i6^;dZ`eR+e{y-Qzsze}|8e$Yj)gMVl!k+6_G&Po1;sI-c zU_2h&jv%UJd^@7Ff`~N>rv?VL!^Fh#2)82DSa)}S3?XEqYCVwwM3qHQXfPQ;RYe?^ zCxChcx}yluEj|jO3ka8iyMt&1MRiR<-Kl6uV&OzA&>M_~DWJ}G z`z_ve?XA$n{x#613y->QJs`y1N#ASzx3+lwfm_$C+0fP*==8Su+bks_UX4eg6>336 z3Xp==nww=r8O4$Pu_(Xk!Fv>>2YU+YqtG-X@J&X#iI6txCyLISio|PfE*Kb*g_eN6 
zsTwtipuH)C>Nr)8si;6xEo6bFa6nUu#Q}~K3}{666&GQS!z&r~H&utCKt@wF%ETC2 zf&-^%iI7A797;x1@{(n&p(u|61Or8pmX_x5I8NqCY#+dA08PdX+ADKWe71P-72)CE z9<_a8#<|{LqL+EX+oP~FXcAvDq<42Dp6H83in$aSE>gl z!DtUA62bOGyJIpR?COd~?v@94_1`SSV8{kEh#%*x1H?SE*om*3BlHcH3Bk8HPe`>)8*(Ulw@ z@;gcXy^6KH(q7W1U`yYK(q4X#SBg(esRtFFBroY9p!Dpdz5H&Me-CtwItNL%TiVHb zeGf8Hk@oWY{`exYVoc@tshhNy^p_Bs?F;7s!jQ(dIY~Q7Ux(0aFV72xRKrT@Ngr1J zHxLje>p!fjTy@wWh4+nvxKUZ=bQ}y}b!HHGX}`pROM6KtAeVWw>c6N4XZ35 +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, 
int arrayStart, int arrayEnd, int target) {
+  if (array[arrayEnd] <= target) {
+    return arrayEnd;
+  }
+  int lowerBound = arrayStart; // always <= target
+  int upperBound = arrayEnd; // always > target
+  while (upperBound - lowerBound > 1) {
+    int mid = (upperBound + lowerBound) / 2;
+    int midValue = array[mid];
+    if (midValue < target) {
+      lowerBound = mid;
+    }
+    else if (midValue > target) {
+      upperBound = mid;
+    }
+    else {
+      return mid;
+    }
+  }
+  return lowerBound;
+}
+taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize,
+                                  int32_t* dimensions, int32_t* mode_ordering,
+                                  taco_mode_t* mode_types) {
+  taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t));
+  t->order = order;
+  t->dimensions = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t));
+  t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***));
+  t->csize = csize;
+  for (int32_t i = 0; i < order; i++) {
+    t->dimensions[i] = dimensions[i];
+    t->mode_ordering[i] = mode_ordering[i];
+    t->mode_types[i] = mode_types[i];
+    switch (t->mode_types[i]) {
+      case taco_mode_dense:
+        t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **));
+        break;
+      case taco_mode_sparse:
+        t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **));
+        break;
+    }
+  }
+  return t;
+}
+void deinit_taco_tensor_t(taco_tensor_t* t) {
+  for (int i = 0; i < t->order; i++) {
+    free(t->indices[i]);
+  }
+  free(t->indices);
+  free(t->dimensions);
+  free(t->mode_ordering);
+  free(t->mode_types);
+  free(t);
+}
+#endif
+
+int assemble(taco_tensor_t *A, taco_tensor_t *C, taco_tensor_t *v, taco_tensor_t *B) { // allocates the value array of the result A
+  double* restrict A_vals = (double*)(A->vals);
+
+  A_vals = (double*)malloc(sizeof(double) * 5); // NOTE(review): fixed 5 elements, while compute() writes B->dimensions[0] of them - confirm against the test tensors
+
+  A->vals = (uint8_t*)A_vals;
+  return 0;
+}
+
+int compute(taco_tensor_t *A, taco_tensor_t *C, taco_tensor_t *v, taco_tensor_t *B) { // fused kernel: tA = C * v, then A = B * tA; always returns 0
+  printf("Adhitha1\n"); // NOTE(review): the printf calls in this function look like leftover debug traces; kept so stdout and the hunk line count stay unchanged
+
+  double* restrict A_vals = (double*)(A->vals);
+  int* restrict C2_pos = (int*)(C->indices[1][0]); // pos/crd arrays of the second mode of C
+  int* restrict C2_crd = (int*)(C->indices[1][1]);
+  double* restrict C_vals = (double*)(C->vals);
+  double* restrict v_vals = (double*)(v->vals);
+  printf("Adhitha2\n");
+  int B1_dimension = (int)(B->dimensions[0]);
+  int C1_dimension = (int)(C->dimensions[0]); // rows of C (was read from B): sizes tA and bounds the loop over C2_pos
+  printf("Adhitha3 %d, %d\n", B1_dimension, C1_dimension);
+  int* restrict B2_pos = (int*)(B->indices[1][0]); // pos/crd arrays of the second mode of B
+  printf("Adhitha4\n");
+  int* restrict B2_crd = (int*)(B->indices[1][1]);
+  printf("Adhitha2\n");
+  double* restrict B_vals = (double*)(B->vals);
+
+  printf("Adhitha3\n");
+
+  double* restrict tA = 0; // dense temporary holding C * v, one entry per row of C
+  tA = (double*)malloc(sizeof(double) * C1_dimension);
+  for (int32_t ptA = 0; ptA < C1_dimension; ptA++) {
+    tA[ptA] = 0.0;
+  }
+  for (int32_t i1439 = 0; i1439 < C1_dimension; i1439++) { // first product: tA[i1439] = sum over row i1439 of C times v
+    double ti1440tA_val = 0.0;
+    for (int32_t i1440C = C2_pos[i1439]; i1440C < C2_pos[(i1439 + 1)]; i1440C++) {
+      int32_t i1440 = C2_crd[i1440C];
+      ti1440tA_val += C_vals[i1440C] * v_vals[i1440];
+    }
+    tA[i1439] = ti1440tA_val;
+  }
+  for (int32_t i1438 = 0; i1438 < B1_dimension; i1438++) { // second product: A[i1438] = sum over row i1438 of B times tA
+    double ti1439A_val = 0.0;
+    for (int32_t i1439B = B2_pos[i1438]; i1439B < B2_pos[(i1438 + 1)]; i1439B++) {
+      int32_t i1439 = B2_crd[i1439B]; // column of B, used as an index into tA
+      ti1439A_val += B_vals[i1439B] * tA[i1439];
+    }
+    A_vals[i1438] = ti1439A_val;
+  }
+  free(tA);
+
+  A->vals = (uint8_t*)A_vals;
+  return 0;
+}
+#include "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/spmv_spmv/spmv_fused.h"
+int _shim_assemble(void** parameterPack) {
+  return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3]));
+}
+int _shim_compute(void** parameterPack) {
+  return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3]));
+}
diff --git a/test/kernels/spmv_spmv/spmv_fused.h 
b/test/kernels/spmv_spmv/spmv_fused.h new file mode 100644 index 000000000..bc78275ac --- /dev/null +++ b/test/kernels/spmv_spmv/spmv_fused.h @@ -0,0 +1,125 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - 
lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +#ifndef TACO_GENERATED_assemble +#define TACO_GENERATED_assemble +int assemble(taco_tensor_t *A, taco_tensor_t *C, taco_tensor_t *v, taco_tensor_t *B); +#endif + +#ifndef TACO_GENERATED_compute +#define TACO_GENERATED_compute +int compute(taco_tensor_t *A, taco_tensor_t *C, taco_tensor_t *v, taco_tensor_t *B); +#endif diff --git a/test/kernels/spmv_spmv/spmv_fused.so b/test/kernels/spmv_spmv/spmv_fused.so new file mode 100755 index 0000000000000000000000000000000000000000..5efd6a4d8a9832f423c2f248ec61fa541f7833f7 GIT binary patch literal 14152 
zcmeHOdvF}ZneWlAv22NzY&Z~@;K3|+azJYdKaj8uT0QJp*BTUCN}L3HS?xp8MY}8Z zL2Phr6JhYMEa%HPfvO}G*H!8Ql}EWqxa3k-XJH5197V+`f{&{sIp<1jT#H@QB^4j zgsKYDTFZ@agVJ?E$(_`z^q7(^rM|baQOY-YBwIm8Em$yRyN|(+;{0=yY9_Hsl{Ztn zoLBKPRjjPr)hNt`YyCX~V%~4R{mOXL==D~>!$s{#xA>s0 z)y`|JsjGK)Y5V5r8p*E2brr6z-q(M9=&OgG{oyaaurpUbn0Pe4ukKq996k8P16yu5 z_`$>Rt^={x{;BZ3r?2ksU;fo=?|HVr^2N4}6HmlpI%7Hn$-*kq8-VFm@FT#g=~KO{ z=^vOu|AiUw_8I(a0=@*7Gfe_$6xWD8T`v>;7~DcpFFNZi_!=d3A<#x~t=Qtw|FY6Q zJ>OEiU+ISxKB+n`KU-f__#wJMNVmc-rQhQs|IT<`22SxDq??IEeS^}^aS?vMinD_{ z0uuGVh3FZ5>baL7e%_FHy_j(Lq53UEPx-0BSwE)qWhX;^vl8r7`28w=L*XJ68cL*+ zp-eWM&W1uFG?E<2iclOxgaZ8=L(y0|HZ+pS#?t*8yGK*WSbunMG-k=FOG1(DVR8^2 z9eF4wB8f4P4M$R;K_KZJn`7a0WY`~vze>0(7Eh&PA`u=PO+`dBMzMsXaWh=PqL9zVl0=9iP4e4h%b}!trDSF zG@K2?!(b+3RT@w-D%J-BUEQJEean3-O8MJ;E73}HgRG05-5R0&iGKpJ8u2#tcs$8o zH-Z*-iCWnYg~(k`e2utT^&cK@d7M10GL1YQ!xtrfA2d1egG%4fb*aaGm_NPP!f7m2 zT}|p0Z8(paB$%|}_O5u|hI1LD-=xYVT~#H5g>J*uBPQ`?8&2)Xsnv$_Ge@#^8*U%Z zJ8k#^b^>aQ|GL)5&nYzN!q_{Jt(iJ$jQ^~*Np^VeEcx^nB( z>1mU#N2wOmJna8KiiMTuKn9GZmtb$ac*$j$lg5k1yAAD_@sms0COCXRIjkp#9D>u| zp0&?GP2`pt<7?MbZh+#cH}Y$r09X77s#rAeb8OBd;59Nw#f13HQMl}})VH3EL&M_p zb8o%k=>F+=&67stR7d^OJD0rMLpA9s)L+qSp76g|T->Xj@c-m|AaZhb{nNXy?JN&94FkcOr zr_k@(&8+vfpxNVXES~ysYAO)LQgB%>dD&I+lKUYZGgS7CPjB_#=ilnz z<`1FCaR2@N1Do@64fDd@iCs7Mww-DFZ>t{|`R?ZNm$VJ}HF^LKXfX@79k=c-8u^Et zi$k)Ehii-5tV~mJz{=>wAZ5(|q+-r?B9G?3aJZQQ8(TL)4mbdM7x^0HJ)7yBMp7^2`NHcsVpn5`bXfn(S5>J;7M8FnR5*b8jSJdDDQ|n&~YL6&7gU_xaf~`YtWaC z{C)JeZE2;RicVths2D*}w|D;~zMtsXE(}*{95eFwVyyfGtUrr8fA`rp|A3LqHhwwf z7a^DLAbRWgqr)TF;c(l0p>nsEa?ABwqPO8H0V_(m<@CPP7P(f_nx-Dt7Wmu;dIIzi z=o_Go=tgg#YA3)KLEi*5aAPMyp8<8F1Drr_?gMScgTE8h0DT5@AF6i-^jT0|$MF1A zduWr;wl`{v>gU$(hmIHiXl-!f{M6JKsPHtd^E7Q}sQYZ~9&z{et3SHjyM$n}$Kx&< z5gTxy51s!_Yr7qPl?vHlzw+M8zvQO(#dxpbT7^%jkNu3xO=rkB&IzTbwzEp*U7E7uKdA!8Ck|O3 zZpTT*A5?ODu8?w7J>~WPj_|&@v&6?##lNKJKPdW-iXKz+l%l-fv%b6gPQ7_xFqh2c z^gDbjeXT9abCPS@)ztwLRg^ho`Gu@XpQIwH9pPdxwCF{-Ctd| zKwCJsv+3!s>oi2?S|Kbva8(V|++I;vcNZ+-C4#F)eumPt3!W>5hD6Ts=kqPI{xa#v 
zGHeG9MIy^0sbqX)XxYf}_EoS=XXD{aHWAJa`y$p>*1Q_7gzGh-x#nx1`d55c)imty z>)mZ_RzvD73G9jvwM0`n>^ru!w*tuKlCiZzv1BYg5&=s)o@=+auWDJfLT?%3zk6@j zTlk6(=0--NBXDDdrnkhCsTS>ldP%;+Pu>&i9 zr@G&Vip~>%n?2y3-?`_euQB9Ok8BM@xEW=s)V};!mq{`zv@nE7z~w?0cvwyt4~kRnB3M+*Nuz*tH4gx1|&3x1`IpJdhi{~Wkh<+*r&27C^^Y;Zow**_Obyt4lg=y7Itv)XkZ`A@wWIA-w>++Ig&hmim?B&j98e)y`=hk^0V3g*XLVuY|>0 zQXlL; zmcePNXeu-`ni>p`hN76MWJ2NGc0q@)Mq}An)OW|Kj`r$G@}OBLoKA;#;22gmy#oil z!iiWYnoA^hz{JMMlWAqDsd#)e6^?RfnX$x!p?EHX0wXY8x5>Y;H`IGi51ncYDc#V6 zVMIpZfIh{7wR>XN26yGARLBc-|K_-cKL&$ zd)KYo+}j`O_jd(*9VOxnn>>?Nih_z%ARTDy?BwHTbmZ($9=j_$*2ag~2eOP%zSRsAzg5l)vW8%Ul}6u!)k z1R54Jo3`jM=aQH*`!Iv`$$4+fNK{B}IGh<4zUYo5EG?SFOq%o_jHNRpsbm=wf-D^y z4U>V&jg4l7PnPaO);E*_n~iP9ue=4mbV{Bs^u>l%&mE4!33HaI)rBn!&ZC5I0&{!v zNS+ZKol^ASwg?}dqXeD}V6nm65KO|!Av|=79ZAMhoDUBUrehB>hk1OA^Jo~(($Pv~ zt2|X1CjQqD&5USjis3~jo|iNADx8UPy-mm4Kfqd??RoyrG^z~wdxB#+ffMmU(6Har~^ubO18? zcELQaFPOGUYnr2!$FE>P?L#uQFReq=dSpT=a{1hTT>qr9?^1faUSYc5W{=rNMf+_B zMs32&pFXutVanfilp7f<=ZR);+3k7#Q=)cdx)-zJ@09)R3gPt`(~gp+mEr!!^vlYg@4xf^ zON=_~o#j90u;=w2(;sj_$`7|2yZtUQv_vfRFSX9&|3~3$FSciT7`C*gVS8RjG^wQ$ z>p?{(=9#_%ljy)(3|Y_OrtO*kB{HPL@$>B7UuNJb|nh{6&QriDf?|DMJwaft)Ibu?oF2dy35#KKZE^7Wj}No z`@dB7&U$`N+1EjCPs}oHLx%Hqy~9f^?ruAXyi<404EC3ly?z;c^2cL`T~)u?;)d0D vcoSF}i?~1W`-8?e?t7Mn^}CNgFd#keaPEwM0d%U_-`OI~#Ri9ggBAY+iVKWt literal 0 HcmV?d00001 diff --git a/test/kernels/spmv_spmv/spmv_spmv_default.c b/test/kernels/spmv_spmv/spmv_spmv_default.c new file mode 100644 index 000000000..dfaa1c4b0 --- /dev/null +++ b/test/kernels/spmv_spmv/spmv_spmv_default.c @@ -0,0 +1,157 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? 
(_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = 
(taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +int assemble(taco_tensor_t *ref, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *v) { + double* restrict ref_vals = (double*)(ref->vals); + + ref_vals = (double*)malloc(sizeof(double) * 5); + + ref->vals = (uint8_t*)ref_vals; + return 0; +} + +int compute(taco_tensor_t *ref, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *v) { + double* restrict ref_vals = (double*)(ref->vals); + int B1_dimension = (int)(B->dimensions[0]); + int* restrict B2_pos = (int*)(B->indices[1][0]); + int* restrict B2_crd = (int*)(B->indices[1][1]); + double* restrict B_vals = (double*)(B->vals); + int* restrict C2_pos = (int*)(C->indices[1][0]); + int* restrict C2_crd = (int*)(C->indices[1][1]); + double* restrict C_vals = (double*)(C->vals); + double* restrict v_vals = (double*)(v->vals); + + #pragma omp parallel for schedule(runtime) + for (int32_t i1438 = 0; i1438 < B1_dimension; i1438++) { + double ti1439ref_val = 0.0; + for (int32_t i1439B = B2_pos[i1438]; i1439B < B2_pos[(i1438 + 1)]; i1439B++) { + int32_t i1439 = 
B2_crd[i1439B]; + for (int32_t i1440C = C2_pos[i1439]; i1440C < C2_pos[(i1439 + 1)]; i1440C++) { + int32_t i1440 = C2_crd[i1440C]; + ti1439ref_val += (B_vals[i1439B] * C_vals[i1440C]) * v_vals[i1440]; + } + } + ref_vals[i1438] = ti1439ref_val; + } + return 0; +} +#include "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/spmv_spmv/spmv_spmv_default.h" +int _shim_assemble(void** parameterPack) { + return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3])); +} +int _shim_compute(void** parameterPack) { + return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3])); +} diff --git a/test/kernels/spmv_spmv/spmv_spmv_default.h b/test/kernels/spmv_spmv/spmv_spmv_default.h new file mode 100644 index 000000000..b53193484 --- /dev/null +++ b/test/kernels/spmv_spmv/spmv_spmv_default.h @@ -0,0 +1,125 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? 
(_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = 
(taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +#ifndef TACO_GENERATED_assemble +#define TACO_GENERATED_assemble +int assemble(taco_tensor_t *ref, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *v); +#endif + +#ifndef TACO_GENERATED_compute +#define TACO_GENERATED_compute +int compute(taco_tensor_t *ref, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *v); +#endif diff --git a/test/kernels/ttm_ttm/fused copy.c b/test/kernels/ttm_ttm/fused copy.c new file mode 100644 index 000000000..5d40c8aa9 --- /dev/null +++ b/test/kernels/ttm_ttm/fused copy.c @@ -0,0 +1,248 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? 
(_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = 
(taco_tensor_t *) malloc(sizeof(taco_tensor_t));
+  t->order = order;
+  t->dimensions = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t));
+  t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***));
+  t->csize = csize;
+  for (int32_t i = 0; i < order; i++) {
+    t->dimensions[i] = dimensions[i];
+    t->mode_ordering[i] = mode_ordering[i];
+    t->mode_types[i] = mode_types[i];
+    switch (t->mode_types[i]) {
+      case taco_mode_dense:
+        t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **));
+        break;
+      case taco_mode_sparse:
+        t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **));
+        break;
+    }
+  }
+  return t;
+}
+void deinit_taco_tensor_t(taco_tensor_t* t) {
+  for (int i = 0; i < t->order; i++) {
+    free(t->indices[i]);
+  }
+  free(t->indices);
+  free(t->dimensions);
+  free(t->mode_ordering);
+  free(t->mode_types);
+  free(t);
+}
+#endif
+
+int assemble(taco_tensor_t *A1532, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D) { // builds pos/crd of A1532's second mode from B and allocates A1532's values
+  int A15321_dimension = (int)(A1532->dimensions[0]);
+  int A15323_dimension = (int)(A1532->dimensions[2]);
+  int* restrict A15322_pos = (int*)(A1532->indices[1][0]);
+  int* restrict A15322_crd = (int*)(A1532->indices[1][1]);
+  double* restrict A1532_vals = (double*)(A1532->vals);
+  int B1_dimension = (int)(B->dimensions[0]);
+  int* restrict B2_pos = (int*)(B->indices[1][0]);
+  int* restrict B2_crd = (int*)(B->indices[1][1]);
+
+  A15322_pos = (int32_t*)malloc(sizeof(int32_t) * (A15321_dimension + 1));
+  A15322_pos[0] = 0;
+  for (int32_t pA15322 = 1; pA15322 < (A15321_dimension + 1); pA15322++) {
+    A15322_pos[pA15322] = 0;
+  }
+  int32_t A15322_crd_size = 1048576; // initial crd capacity; doubled on demand below
+  A15322_crd = (int32_t*)malloc(sizeof(int32_t) * A15322_crd_size);
+  int32_t i1543A1532 = 0; // number of coordinates appended to A15322_crd so far
+
+  for (int32_t i1547 = 0; i1547 < ((B1_dimension + 15) / 16); i1547++) { // rows of B visited in blocks of 16
+    for (int32_t i1548 = 0; i1548 < 16; i1548++) {
+      int32_t i1542 = i1547 * 16 + i1548;
+      if (i1542 >= B1_dimension) // the last block may be partial
+        continue;
+
+      int32_t pA15322_begin = i1543A1532;
+
+      for (int32_t i1543B = B2_pos[i1542]; i1543B < B2_pos[(i1542 + 1)]; i1543B++) {
+        int32_t i1543 = B2_crd[i1543B];
+        if (A15322_crd_size <= i1543A1532) {
+          A15322_crd = (int32_t*)realloc(A15322_crd, sizeof(int32_t) * (A15322_crd_size * 2));
+          A15322_crd_size *= 2;
+        }
+        A15322_crd[i1543A1532] = i1543;
+        i1543A1532++;
+      }
+
+      A15322_pos[i1542 + 1] = i1543A1532 - pA15322_begin; // per-row count; turned into offsets by the prefix sum below
+    }
+  }
+
+  int32_t csA15322 = 0;
+  for (int32_t pA153220 = 1; pA153220 < (A15321_dimension + 1); pA153220++) { // prefix sum: counts -> pos offsets
+    csA15322 += A15322_pos[pA153220];
+    A15322_pos[pA153220] = csA15322;
+  }
+
+  A1532_vals = (double*)malloc(sizeof(double) * (i1543A1532 * A15323_dimension));
+
+  A1532->indices[1][0] = (uint8_t*)(A15322_pos);
+  A1532->indices[1][1] = (uint8_t*)(A15322_crd);
+  A1532->vals = (uint8_t*)A1532_vals;
+  return 0;
+}
+
+int compute(taco_tensor_t *A1532, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D) { // fused kernel: per stored (i,j) of B, tA = B(i,j,:) * C, then A(i,j,:) += tA * D; always returns 0
+  int A15321_dimension = (int)(A1532->dimensions[0]);
+  int A15323_dimension = (int)(A1532->dimensions[2]);
+  int* restrict A15322_pos = (int*)(A1532->indices[1][0]);
+  double* restrict A1532_vals = (double*)(A1532->vals);
+  int B1_dimension = (int)(B->dimensions[0]);
+  int* restrict B2_pos = (int*)(B->indices[1][0]);
+  int* restrict B2_crd = (int*)(B->indices[1][1]);
+  int* restrict B3_pos = (int*)(B->indices[2][0]);
+  int* restrict B3_crd = (int*)(B->indices[2][1]);
+  double* restrict B_vals = (double*)(B->vals);
+  int C1_dimension = (int)(C->dimensions[0]);
+  int C2_dimension = (int)(C->dimensions[1]);
+  double* restrict C_vals = (double*)(C->vals);
+  int D1_dimension = (int)(D->dimensions[0]);
+  int D2_dimension = (int)(D->dimensions[1]);
+  double* restrict D_vals = (double*)(D->vals);
+
+// int32_t i1543A1532 = 0;
+
+  #pragma omp parallel for schedule(static)
+  for (int32_t pA1532 = 0; pA1532 < (A15322_pos[A15321_dimension] * A15323_dimension); pA1532++) {
+    A1532_vals[pA1532] = 0.0;
+  }
+
+  double* restrict tA1532_all = 0; // scratch rows for all threads; the declaration was misspelled rA1532_all, leaving this name undeclared
+  tA1532_all = (double*)malloc(sizeof(double) * D1_dimension * omp_get_max_threads()); // D1_dimension doubles per thread
+
+  #pragma omp parallel for schedule(runtime)
+  for (int32_t i1547 = 0; i1547 < ((B1_dimension + 15) / 16); i1547++) { // rows of B visited in blocks of 16
+    for (int32_t i1548 = 0; i1548 < 16; i1548++) {
+      int32_t i1542 = i1547 * 16 + i1548;
+      if (i1542 >= B1_dimension) // the last block may be partial
+        continue;
+
+      double* restrict tA1532 = 0;
+      tA1532 = &tA1532_all[D1_dimension*omp_get_thread_num()]; // this thread's slice of the shared scratch buffer
+      // tA1532 = (double*)malloc(sizeof(double) * D1_dimension);
+
+      for (int32_t i1543B = B2_pos[i1542]; i1543B < B2_pos[(i1542 + 1)]; i1543B++) {
+        for (int32_t ptA1532 = 0; ptA1532 < D1_dimension; ptA1532++) { // reset the scratch row for this (i,j)
+          tA1532[ptA1532] = 0.0;
+        }
+        for (int32_t i1544B = B3_pos[i1543B]; i1544B < B3_pos[(i1543B + 1)]; i1544B++) {
+          int32_t i1544 = B3_crd[i1544B];
+          for (int32_t i1545 = 0; i1545 < D1_dimension; i1545++) {
+            int32_t i1545C = i1544 * C2_dimension + i1545; // row-major offset into C
+            tA1532[i1545] = tA1532[i1545] + B_vals[i1544B] * C_vals[i1545C];
+          }
+        }
+        for (int32_t i1545 = 0; i1545 < D1_dimension; i1545++) {
+          for (int32_t i1546 = 0; i1546 < D2_dimension; i1546++) {
+            int32_t i1546A1532 = i1543B * A15323_dimension + i1546; // A reuses B's position i1543B for its second mode
+            int32_t i1546D = i1545 * D2_dimension + i1546; // row-major offset into D
+            A1532_vals[i1546A1532] = A1532_vals[i1546A1532] + tA1532[i1545] * D_vals[i1546D];
+          }
+        }
+        // i1543A1532++;
+      }
+
+
+    }
+
+  }
+  free(tA1532_all);
+
+  A1532->indices[1][0] = (uint8_t*)(A15322_pos);
+  A1532->vals = (uint8_t*)A1532_vals;
+  return 0;
+}
+#include "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/ttm_ttm/fused.h"
+int _shim_assemble(void** parameterPack) {
+  return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3]));
+}
+int _shim_compute(void** parameterPack) {
+  return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3]));
+}
diff --git 
a/test/kernels/ttm_ttm/fused.c b/test/kernels/ttm_ttm/fused.c new file mode 100644 index 000000000..f490913cb --- /dev/null +++ b/test/kernels/ttm_ttm/fused.c @@ -0,0 +1,242 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while 
(upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +int assemble(taco_tensor_t *A1532, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D) { + int A15321_dimension = (int)(A1532->dimensions[0]); + int A15323_dimension = (int)(A1532->dimensions[2]); + int* restrict A15322_pos = (int*)(A1532->indices[1][0]); + int* restrict A15322_crd = (int*)(A1532->indices[1][1]); + double* restrict A1532_vals = (double*)(A1532->vals); + int B1_dimension = (int)(B->dimensions[0]); + int* restrict B2_pos = (int*)(B->indices[1][0]); + int* restrict B2_crd = (int*)(B->indices[1][1]); + + A15322_pos = (int32_t*)malloc(sizeof(int32_t) * 
(A15321_dimension + 1)); + A15322_pos[0] = 0; + for (int32_t pA15322 = 1; pA15322 < (A15321_dimension + 1); pA15322++) { + A15322_pos[pA15322] = 0; + } + int32_t A15322_crd_size = 1048576; + A15322_crd = (int32_t*)malloc(sizeof(int32_t) * A15322_crd_size); + int32_t i1543A1532 = 0; + + for (int32_t i1547 = 0; i1547 < ((B1_dimension + 15) / 16); i1547++) { + for (int32_t i1548 = 0; i1548 < 16; i1548++) { + int32_t i1542 = i1547 * 16 + i1548; + if (i1542 >= B1_dimension) + continue; + + int32_t pA15322_begin = i1543A1532; + + for (int32_t i1543B = B2_pos[i1542]; i1543B < B2_pos[(i1542 + 1)]; i1543B++) { + int32_t i1543 = B2_crd[i1543B]; + if (A15322_crd_size <= i1543A1532) { + A15322_crd = (int32_t*)realloc(A15322_crd, sizeof(int32_t) * (A15322_crd_size * 2)); + A15322_crd_size *= 2; + } + A15322_crd[i1543A1532] = i1543; + i1543A1532++; + } + + A15322_pos[i1542 + 1] = i1543A1532 - pA15322_begin; + } + } + + int32_t csA15322 = 0; + for (int32_t pA153220 = 1; pA153220 < (A15321_dimension + 1); pA153220++) { + csA15322 += A15322_pos[pA153220]; + A15322_pos[pA153220] = csA15322; + } + + A1532_vals = (double*)malloc(sizeof(double) * (i1543A1532 * A15323_dimension)); + + A1532->indices[1][0] = (uint8_t*)(A15322_pos); + A1532->indices[1][1] = (uint8_t*)(A15322_crd); + A1532->vals = (uint8_t*)A1532_vals; + return 0; +} + +int compute(taco_tensor_t *A1532, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D) { + int A15321_dimension = (int)(A1532->dimensions[0]); + int A15323_dimension = (int)(A1532->dimensions[2]); + int* restrict A15322_pos = (int*)(A1532->indices[1][0]); + double* restrict A1532_vals = (double*)(A1532->vals); + int B1_dimension = (int)(B->dimensions[0]); + int* restrict B2_pos = (int*)(B->indices[1][0]); + int* restrict B2_crd = (int*)(B->indices[1][1]); + int* restrict B3_pos = (int*)(B->indices[2][0]); + int* restrict B3_crd = (int*)(B->indices[2][1]); + double* restrict B_vals = (double*)(B->vals); + int C1_dimension = (int)(C->dimensions[0]); + int 
C2_dimension = (int)(C->dimensions[1]); + double* restrict C_vals = (double*)(C->vals); + int D1_dimension = (int)(D->dimensions[0]); + int D2_dimension = (int)(D->dimensions[1]); + double* restrict D_vals = (double*)(D->vals); + +// int32_t i1543A1532 = 0; + + #pragma omp parallel for schedule(static) + for (int32_t pA1532 = 0; pA1532 < (A15322_pos[A15321_dimension] * A15323_dimension); pA1532++) { + A1532_vals[pA1532] = 0.0; + } + + #pragma omp parallel for schedule(runtime) + for (int32_t i1547 = 0; i1547 < ((B1_dimension + 15) / 16); i1547++) { + for (int32_t i1548 = 0; i1548 < 16; i1548++) { + int32_t i1542 = i1547 * 16 + i1548; + if (i1542 >= B1_dimension) + continue; + + double* restrict tA1532 = 0; + tA1532 = (double*)malloc(sizeof(double) * D1_dimension); + + for (int32_t i1543B = B2_pos[i1542]; i1543B < B2_pos[(i1542 + 1)]; i1543B++) { + for (int32_t ptA1532 = 0; ptA1532 < D1_dimension; ptA1532++) { + tA1532[ptA1532] = 0.0; + } + for (int32_t i1544B = B3_pos[i1543B]; i1544B < B3_pos[(i1543B + 1)]; i1544B++) { + int32_t i1544 = B3_crd[i1544B]; + for (int32_t i1545 = 0; i1545 < D1_dimension; i1545++) { + int32_t i1545C = i1544 * C2_dimension + i1545; + tA1532[i1545] = tA1532[i1545] + B_vals[i1544B] * C_vals[i1545C]; + } + } + for (int32_t i1545 = 0; i1545 < D1_dimension; i1545++) { + for (int32_t i1546 = 0; i1546 < D2_dimension; i1546++) { + int32_t i1546A1532 = i1543B * A15323_dimension + i1546; + int32_t i1546D = i1545 * D2_dimension + i1546; + A1532_vals[i1546A1532] = A1532_vals[i1546A1532] + tA1532[i1545] * D_vals[i1546D]; + } + } + // i1543A1532++; + } + + free(tA1532); + } + } + + A1532->indices[1][0] = (uint8_t*)(A15322_pos); + A1532->vals = (uint8_t*)A1532_vals; + return 0; +} +#include "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/ttm_ttm/fused.h" +int _shim_assemble(void** parameterPack) { + return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), 
(taco_tensor_t*)(parameterPack[3])); +} +int _shim_compute(void** parameterPack) { + return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3])); +} diff --git a/test/kernels/ttm_ttm/fused.h b/test/kernels/ttm_ttm/fused.h new file mode 100644 index 000000000..d613c8f07 --- /dev/null +++ b/test/kernels/ttm_ttm/fused.h @@ -0,0 +1,125 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; 
+} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +#ifndef TACO_GENERATED_assemble +#define TACO_GENERATED_assemble +int assemble(taco_tensor_t *A1532, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D); +#endif + +#ifndef TACO_GENERATED_compute +#define TACO_GENERATED_compute +int compute(taco_tensor_t *A1532, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D); +#endif diff 
--git a/test/kernels/ttm_ttm/fused.so b/test/kernels/ttm_ttm/fused.so new file mode 100755 index 0000000000000000000000000000000000000000..69c65a1dc2a6cdd101f40ffea8b439083d8e2761 GIT binary patch literal 14560 zcmeHOeQ;aVmA}tY5+yNuN-)6?nuze|+9h@@2*J)rk!;y{%E*bAIMlRB6-Ab8Pi@KA zG9)z3M-_)W9tz$qEVQN5blc7_UABZSEtKsHc0#^ryW@lc>9F0pTM9CvjT0b*65{=x z`|hgCQs*{-#zEtd(ZuR_g(L-4b~MX3X{RcKF^3dFv~)`B4Kldh=6!m zC7XfYMeKSmJ8e8^LhQhG79=TSZVpf$)A3yV3}->zJR)gdfrURNsMI5t^jr(9bk_nm zk4QQwsv@bw5@U8ydX9_qe5wgKNlBJM!=IAP6hC+kXD86@7R)4-dVdQ&D(8PUxP;&x zBEPBp;t>9jR8)u8tq3-3cvi37z|^-+8QreT-AbhRy?>qg@ec>a%nzrCk$OUItrzS&RSefYrXyPK{% zaN(=5>W8B*|2%!p*)aPbPrUu|S?AVYKH{l5{y+?>Q-=GIGIx?x>L%_<@CSiSCjYxB z@aLw$cTItRe+vA>6gc_YjVC{B129=RE1@tM9-c}bxEoJ?@J?ap4hSyF7Y6@z$h+88 zY@gemliJAb6taq|Ecn}^KRv?b=dr##`5T1%Aax3a&HR_K5zx7;kPV5-$vNOgA%EaX zOYkP}6bw)on>V36YlVEL7+)eA77_U1)s{Y$j{^12xojruo5|U+pG`u(FVD_AE`J64 zfxxB0dmJxhBclB4MZ#=hr$LlomkOa!OGhFe>Q08clA#a_wZ+?#EEEG_p+IAOs5#md zZE5RHM!Op8YuXd>Xk&Owd(=vsJS7y_8YTnb_O|q$mgd)t;4M9aGqfcT0;YVzKZZ0L`wIF7^*=A3!5cmc1Jl{JB&HE>XFm}9Uaa6y&f~q!C}# z)^-I$NuCr~&$Vsn(FN>hLSC+ea{cO?O%2=_;m)00jC7$Ze!pZY8lw{%5GBQ+T;c||lw8|`e zjuZygVgd5iouMj`T z`D)^+iHG|+znpk-{qRA~-%LEYeE0z8KSw;ddbp4Ci-;#z4)5apwZxMPhdVhxmw0mB z@Mg}>BA&X=a0BNHh$j~gdpZBttH6`%hATLKo_KQEu$%K+z{mQ)^Pv8CqjqU`P`l*a z#hCizM(xt!Sh>1zkCnKP>FHGiP}rcSr~MpMuezuke@b2h7Zwl;7hbVxbZA^<(_!-3 zI1l~ZB$&SSK5nM;GW7L<%QoE@(FZcO>&j96!+MtJ zsa20c0VP9u3UzbUT;Q3{po$qCKS!s{1h0@fD#qBmhhfrhiEkQ?LBisFU+~@eJhi58 zgRgPJdVTjhP2hC%HZ!C*>!$t25~S(I9!&@A={Hw<{pP*?V0w?{B{}06t)B$UgiB99 zsL^gDXzr{Cnk^;J8RSW;-NxhE6Nvhaq*kdLPih%3L8O;(6MM9SmRzxJJgB(^MjI{N zTCOe<-#}WbUp0QAJ&c$d81ak_T(;|x3z9|1g3~Ks0bzz;+pMSUTthdu zYi=@ioD0<2DNd75&={&WE(MHZ<}WO-n0vSDiYhEUQ$FhktT%10?r z2bbGp(PMtC6KSm8c(dO4m~uG{?#v?US$YbIik`ACyKaKjQx{$8o^5tG$+Bt#SY^3s z$G=hbRQb%)mqt;)BRYi0p6M{X_NoCjJ%Ygs8FHl_Iho2RrN?z;P<`q{)VH0qskz_~ z6m+}RL;zJ9Ub&x&_827^8T4fv9N^a~Q*SB8WmM@&B$QqZrkmeKdQz+K8OPwwm8qf2 z3Pp4(2YKCh@+|DO>S_NpG`3$W=Ka)c)GB*Vsr#3paJu=hb^wx|3%YSO^ZZAshDjxf 
z%rL)TuR5gex4is}_8f##Z!1A$=O{hohuRQgWIQkNBVuM1_U?R{O33?#3jH2)EHUcKwkd9|q)kvX+Y zhr=skYROBUQ!%yZL?FHK2*wJpM@)6uZK*>_OucI$=2M&OcAr{mx9h2Xm#^xzdxy-8 znqIY0TbT^Ncdh6cdV1q4EM26e`jskpQ;(eDS-v8(v>`{^j`@&Oj_Z%<=|-gZ)Vms~ zOigo{RrIRMYA?-yK@+|}MV~-LdoQT_e@{&pY1G<-@J>HKKFJ^)Fvn7FURK@>8iSAt>dI@7 zYWAHR4l1wdMx*8p7;guS3!c}}eMLLz#yLG*=L(jd)8Qr;`Dm8x4z;Rv>lAq1%$0+pnAnu4u&I=q=!_ zO08mAt>Q;}@%A&V_M<7H$qtZ@Xq2Z(p@F=@8mnsW_xL<;Hgo;O(b3^2F`8&z@$_5k z0UTl<^N5|h#BM`19l@%LJ^wk}j?8(#_Vf?0!(7`3>I8=Qh~_pvG0|M+tvBAo_$I90 zIE&OFw6P0w5)D=Lp*egEKVvNbC@D?)neTnT=kDI0{S|B28(?ULpk=37zr|XIdgki8 zUu**MrA@w#zD>R_`$C5^4?uE!y~Pq{d_1a`Ly`3^V?Z};roB&ns;20h zAADSO@@v!3E}m2BQ^%E)%JHg!uPw&B7dhd%fEkc_r2{3_%^C;okG`}C{r>py?XV^L zA4d=OnKN#hezN!`B_slN2;(OM7IYkMzN zHspq-;mi%Gi%QRxFVGB>`NR8`S@#;leqSJU%pR=Tt`#TiYE1j*bqq*HwPCNSHvOwu zjbUF&E2XR2v{&}_*BHQhe)L>=?7s^E{dzl|7x+(|Et9O_DhSeXO^C z#|^cfH#~o~#tnK(N$QBQ#$4$Rplf7ad~bBL>ZY9;-TZ1v=2S`l~V zaf%qfr)-A%5a;b-ZN*aWv|1aW?U**r3e)R=3pN5O@uhbtw3)&Z$Q_5Zw9cUqyj9>g zD-kWxA?wGYTBMuUSbF)~T#|X3XLXbu={<(3a~Ci0rH-jSHE?X;qP@2F0@7Bh`~QL= zhUMDYF72>m-AxzqM#!v!jTbPyStf!Q+AUxVsQX`syq>PHW8*?C!$2zqA1=8^1d7r|VRr7X~`Ko?jYc`j$oqw!V;Lc-4p%#dIpbxxRklFt(I%uJG<^&qq8ZT7d zV&k_r1+_w~QP_7kYVK+y_#=A&tsh7o1^w8i_O7Bi5njNAh^>JaMXE%^Qn97#z5p_w4|-~)Q&~a!gdSG(Q~LV!&HUwZY$bt6z#@@tW+KS z+W8>Ln)&T{>Rm}q52C2Ld8dOI`1od`4MAfK+P|&>{khfCpVghMe(O0gT#0g6ZTb)_ zp&)3YmA(DyL#3BTM?IrwzU3Jmz8f-D|EKxF>Yvz}?tbS9lq}~xf%b!*e(Jc``e|OX z$w7UxmiI~BPIjQq7(*&Vc98=>#trEYGr%h!i*N&-MUh-o{ zv=24qdAmuY1Y#S}S7E<_qbLpRAkSjLHX|71Ye&p_)R?L_qScS*ucimhEjW581_rk#K!`qP@`-JpgA)TKI zVA47aR`2OR-DjW_)f6#F#zerzD<6|Mrl-3aOh?F2mlx(9Rw z^f6F3R{J+VcY(T~??%7c4BC%Qbr5t2^hHn#MHXSoeRnWrt4o<*IK$BgT@5&$;cq@W zIyyu$PFJ0?c#S&yUdJwW`?V`RcZ;@&VAA*FDMndoewQJL$VNOal>I23HdH%Z4;559 zi}%}BJKbq}wX@{GX+CGg-swJPWopK1XP?b>VEQa)B`}|}q}u6*EOe@!4j&yZRN$dE zoIY#`5}?fIbnPwhIg3+dub_UGvlxia>1dekmr8*N&CHZmj%koSimBaaQ^#lpc0U@U+6#Qnb%z#K{*AQrY*~ zd`@@DUeL%(jH=>g0Lc$SwiUAcEPp&-_+*0L{LTZ4ZCwHURx=^z)#Nv_M}6Tr^aXzA z{>k>%IENHlLYdSaZwK<#QuG}MHlwod5T*kw%ELASbNyv>R2;oS>i@(4mIvf{vOGVQ 
z=f!Ax9_Xx_g5!Sefx{LCWHujdPSYI6$oU6F!3uckSJ0=hB}?_ait z>xjrtj>8fWk!QM1=qnWD+3X=9FVAF)tz>?>&!cWPCqj)?3VGQl<(cr+La#{_NaBx) zh%9HnkdtTRGCxe6e7Y$S@|Zd~FZ0E;$N9HKMKEkRPh05!iy*(7=jVM?RP0qj-xTzM zpworp=Lz~*L6-?yC+MAm#ss}b(7l4n?-8qOYL>f8Hf+JSuO9bp<+qksl-<(9InNGH zWqHNYQVV5^QBup9R#V6hJk_M}MaWyur74@S!aex;3B~e4!3Y3leZD>A@jHRrc2MG% zDawQ0KI1Y)lsU?c&MRlzob#O=m>B=9)Rm+!@~DN2{+_2Hmb?EZm*!2q#<1Bi3-8Ss*K6dzmS=S;`k*D!|t*1@{8nH6D-UNOxHWzEZZ7 zH79!T39hWN0zk4S9$nQEjYqrMB4Fv0->R*Z%gUB5b(gis|Lbmcm&qr#rKhdExeaEl zqPfdr@kAN_;#k(%l}JV-$wU{x&Nh6(+#2cbVxLlFUN z_+C%xHtc%kJ)B&5l~u@vB`d*(ms+_FWx=pBXKek+#pmK)kvkjWaT~jm$?v7vi8k8J zx+jyriphB`SAOFC2THdwx$0y~6E0Vkd|a-w`S>JG;P&MzoG&j|-+cTtxq#qoY{3Ma zy0M*|8_iEp0TkeVh#V<$dM3-0$(IQE{=9zjIe{~|u!Ez;`m zW?MKuvE2>eWJm6Y!16#RXv!r0=NaIWm7m_tCc}RToclqPpTFAx`<=i$#r+e$2>>>g zf6gLJVJdq(ugRt=55y-e&&S-(#Bt<;qm)U;m&Wml{n^9uiQ{ECclbp6^}^0Rx0{MK z7VhHs#Br41_(VIYDR6p^6|Nk!O_3h~cjq7$b94&%w>XaeAch!9#>4x-C%aE|X$m{V z*rUv46X&r792Z8j$Y*WVeX9I>S<9!8UpocfF$MkraCcUIGWty}Kk**dZv>uykLx>A z*r!i8bib?5otw+kQ^+3$KG}V+QzGB|`(|%VVP~fBTmC&Rx4`r7mHnBwS0T&4S4Q74 z={{Tjy|2~4b8BRIF3jbn7su%~;ETrDvDhPAev)-Fcf*srWkFhZvL_b9#%}z2St!{N ziqHkKZrp`xPJ~+86I;UVp=Ru~xj^A|45lzHm?Frl)6Ge;lbR(|_&lmT%%0tLM6pQ2XmblS1 zhE%|E-5tLA+EDFUKV6~=2~(ka!YDl5<{F1$J@JUdn!6LB)^NO;F4Xz&TI;J1)IbvZ z*-))6Eb9I{0O9VPw7ojGuG$w2t*fhBU)vaJ^i>CI^K!&hHGcbStQ3@`0O_Kg*DG(5 z(fzScy5%?V@*dsDgFapAqbq6Fg*SP_OZO?nM*ykI%E8?(wb#bv|1 zn}QRh#N9*QIFnwp)J7XeLnS+rHg;&`_y$E!v; ze?cj0Yi68l4R^P)^5$)EXj(LhS00kPC)(BBmWbytA*6Lh+ry+FVx8?tR?c%TN3^^p z0X9jOJIi^0D37*^q1oCDWyx78))=)UWgM2n9eDX7OJos+qDz$JxI@Xx(H%Q5_<=>Y z^t3=F9B;v_BG|TgEFt6JEnB*x_ec&eT2UE?Lu8b$sR~_TX>9iUpGRotN=GS}{&|q^ zO_FK?mn8ojr5p+PikXo5^8HHEW}%4bnFo2kBUt>U1LIpysW0EdBpnn9lrNT+v2!fB zbN2!x+frY?uSqJ;tsue!*~kt$7&=3e`trR_(v?D3mS4(Ax&i5QCM9|K{wHat(5Lc~ zOn&{zKQfI(<$I!}{RAS&{H4CEe;jF~Urj;?@_kZLc@9VVwAael-wKR;BG3Kgy>ChL z-<&9~?9dG+TVKAPN;;G+jJQ0fl$4+Q03XX=yr0PTSy^6bM)H#0kA&>}<@=DNc6w3D z3Nl{_r4h;2m*=dK=D%*|*Z&(rU$(z|f0lH+%t+9D{qN=J%lB+aJEdWvFY7JM{u3f} 
z#3lQOd>@y8-;!TnsW0iX&?Og1eR*!|5U-+A4kA2AUed!rsqLh`e7~1}KQu_rMUeHD zda_(!M;eJpeR&Qrw2-tIbMxQvAE__tuaK0jKXzUq6nBlQ@|e_&tV6U2aQJ z%30j__>uhUh>(mdzkL7N=e8vB^(n9HFa(CMifj;Zsb89hOMOX)kuKxeqEEXmF5hmx z?8Pbc{~+|gY?&GR&DXaB$j3L=f^!Fy& zL9&vTA|m72avviTUNHyZw10Tyg#qOTRi#A&+JM1#?@JGXMYp literal 0 HcmV?d00001 diff --git a/test/kernels/ttm_ttm/gemm.c b/test/kernels/ttm_ttm/gemm.c new file mode 100644 index 000000000..ee2b24e99 --- /dev/null +++ b/test/kernels/ttm_ttm/gemm.c @@ -0,0 +1,181 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + 
else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +int assemble(taco_tensor_t *A2886, taco_tensor_t *C, taco_tensor_t *D) { + int A28861_dimension = (int)(A2886->dimensions[0]); + int A28862_dimension = (int)(A2886->dimensions[1]); + double* restrict A2886_vals = (double*)(A2886->vals); + + A2886_vals = (double*)malloc(sizeof(double) 
* (A28861_dimension * A28862_dimension)); + + A2886->vals = (uint8_t*)A2886_vals; + return 0; +} + +int compute(taco_tensor_t *A2886, taco_tensor_t *C, taco_tensor_t *D) { + int A28861_dimension = (int)(A2886->dimensions[0]); + int A28862_dimension = (int)(A2886->dimensions[1]); + double* restrict A2886_vals = (double*)(A2886->vals); + int C1_dimension = (int)(C->dimensions[0]); + int C2_dimension = (int)(C->dimensions[1]); + double* restrict C_vals = (double*)(C->vals); + int D1_dimension = (int)(D->dimensions[0]); + int D2_dimension = (int)(D->dimensions[1]); + double* restrict D_vals = (double*)(D->vals); + + #pragma omp parallel for schedule(static) + for (int32_t pA2886 = 0; pA2886 < (A28861_dimension * A28862_dimension); pA2886++) { + A2886_vals[pA2886] = 0.0; + } + + #pragma omp parallel for schedule(runtime) + for (int32_t i1551 = 0; i1551 < ((C1_dimension + 31) / 32); i1551++) { + for (int32_t i1553 = 0; i1553 < ((D1_dimension + 31) / 32); i1553++) { + for (int32_t i1555 = 0; i1555 < ((D2_dimension + 31) / 32); i1555++) { + for (int32_t i1552 = 0; i1552 < 32; i1552++) { + int32_t i1544 = i1551 * 32 + i1552; + if (i1544 >= C1_dimension) + continue; + + for (int32_t i1554 = 0; i1554 < 32; i1554++) { + int32_t i1545 = i1553 * 32 + i1554; + int32_t i1545C = i1544 * C2_dimension + i1545; + if (i1545 >= D1_dimension) + continue; + + for (int32_t i1556 = 0; i1556 < 32; i1556++) { + int32_t i1546 = i1555 * 32 + i1556; + int32_t i1546D = i1545 * D2_dimension + i1546; + int32_t i1546A2886 = i1544 * A28862_dimension + i1546; + if (i1546 >= D2_dimension) + continue; + + A2886_vals[i1546A2886] = A2886_vals[i1546A2886] + C_vals[i1545C] * D_vals[i1546D]; + } + } + } + } + } + } + return 0; +} +#include "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/ttm_ttm/gemm.h" +int _shim_assemble(void** parameterPack) { + return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2])); +} +int _shim_compute(void** 
parameterPack) { + return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2])); +} diff --git a/test/kernels/ttm_ttm/gemm.h b/test/kernels/ttm_ttm/gemm.h new file mode 100644 index 000000000..20cd2db53 --- /dev/null +++ b/test/kernels/ttm_ttm/gemm.h @@ -0,0 +1,125 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if 
(array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +#ifndef TACO_GENERATED_assemble +#define TACO_GENERATED_assemble +int assemble(taco_tensor_t *A2886, taco_tensor_t *C, taco_tensor_t *D); +#endif + +#ifndef TACO_GENERATED_compute +#define TACO_GENERATED_compute +int compute(taco_tensor_t *A2886, taco_tensor_t *C, taco_tensor_t *D); +#endif diff --git a/test/kernels/ttm_ttm/ttm1_1.c b/test/kernels/ttm_ttm/ttm1_1.c new file mode 100644 index 000000000..e016491a2 --- 
/dev/null +++ b/test/kernels/ttm_ttm/ttm1_1.c @@ -0,0 +1,219 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue 
< target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +int assemble(taco_tensor_t *A2398, taco_tensor_t *B, taco_tensor_t *C) { + int A23981_dimension = (int)(A2398->dimensions[0]); + int A23983_dimension = (int)(A2398->dimensions[2]); + int* restrict A23982_pos = (int*)(A2398->indices[1][0]); + int* restrict A23982_crd = (int*)(A2398->indices[1][1]); + double* restrict A2398_vals = (double*)(A2398->vals); + int B1_dimension = (int)(B->dimensions[0]); + int* restrict B2_pos = (int*)(B->indices[1][0]); + int* restrict B2_crd = (int*)(B->indices[1][1]); + + A23982_pos = (int32_t*)malloc(sizeof(int32_t) * (A23981_dimension + 1)); + A23982_pos[0] = 0; + for (int32_t pA23982 = 1; pA23982 < (A23981_dimension + 1); pA23982++) { + A23982_pos[pA23982] = 0; + } 
+ int32_t A23982_crd_size = 1048576; + A23982_crd = (int32_t*)malloc(sizeof(int32_t) * A23982_crd_size); + int32_t i1543A2398 = 0; + + for (int32_t i1547 = 0; i1547 < ((B1_dimension + 15) / 16); i1547++) { + for (int32_t i1548 = 0; i1548 < 16; i1548++) { + int32_t i1542 = i1547 * 16 + i1548; + if (i1542 >= B1_dimension) + continue; + + int32_t pA23982_begin = i1543A2398; + + for (int32_t i1543B = B2_pos[i1542]; i1543B < B2_pos[(i1542 + 1)]; i1543B++) { + int32_t i1543 = B2_crd[i1543B]; + if (A23982_crd_size <= i1543A2398) { + A23982_crd = (int32_t*)realloc(A23982_crd, sizeof(int32_t) * (A23982_crd_size * 2)); + A23982_crd_size *= 2; + } + A23982_crd[i1543A2398] = i1543; + i1543A2398++; + } + + A23982_pos[i1542 + 1] = i1543A2398 - pA23982_begin; + } + } + + int32_t csA23982 = 0; + for (int32_t pA239820 = 1; pA239820 < (A23981_dimension + 1); pA239820++) { + csA23982 += A23982_pos[pA239820]; + A23982_pos[pA239820] = csA23982; + } + + A2398_vals = (double*)malloc(sizeof(double) * (i1543A2398 * A23983_dimension)); + + A2398->indices[1][0] = (uint8_t*)(A23982_pos); + A2398->indices[1][1] = (uint8_t*)(A23982_crd); + A2398->vals = (uint8_t*)A2398_vals; + return 0; +} + +int compute(taco_tensor_t *A2398, taco_tensor_t *B, taco_tensor_t *C) { + int A23981_dimension = (int)(A2398->dimensions[0]); + int A23983_dimension = (int)(A2398->dimensions[2]); + double* restrict A2398_vals = (double*)(A2398->vals); + int B1_dimension = (int)(B->dimensions[0]); + int* restrict B2_pos = (int*)(B->indices[1][0]); + int* restrict B2_crd = (int*)(B->indices[1][1]); + int* restrict B3_pos = (int*)(B->indices[2][0]); + int* restrict B3_crd = (int*)(B->indices[2][1]); + double* restrict B_vals = (double*)(B->vals); + int C1_dimension = (int)(C->dimensions[0]); + int C2_dimension = (int)(C->dimensions[1]); + double* restrict C_vals = (double*)(C->vals); + + // int32_t i1543A2398 = 0; + + #pragma omp parallel for schedule(runtime) + for (int32_t i1547 = 0; i1547 < ((B1_dimension + 15) / 16); 
i1547++) { + for (int32_t i1548 = 0; i1548 < 16; i1548++) { + int32_t i1542 = i1547 * 16 + i1548; + if (i1542 >= B1_dimension) + continue; + + for (int32_t i1543B = B2_pos[i1542]; i1543B < B2_pos[(i1542 + 1)]; i1543B++) { + for (int32_t i1545 = 0; i1545 < C2_dimension; i1545++) { + // int32_t i1545A2398 = i1543A2398 * A23983_dimension + i1545; + int32_t i1545A2398 = i1543B * A23983_dimension + i1545; + double ti1544A2398_val = 0.0; + for (int32_t i1544B = B3_pos[i1543B]; i1544B < B3_pos[(i1543B + 1)]; i1544B++) { + int32_t i1544 = B3_crd[i1544B]; + int32_t i1545C = i1544 * C2_dimension + i1545; + ti1544A2398_val += B_vals[i1544B] * C_vals[i1545C]; + } + A2398_vals[i1545A2398] = ti1544A2398_val; + } + // i1543A2398++; + } + } + } + return 0; +} +#include "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/ttm_ttm/ttm1_1.h" +int _shim_assemble(void** parameterPack) { + return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2])); +} +int _shim_compute(void** parameterPack) { + return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2])); +} diff --git a/test/kernels/ttm_ttm/ttm1_1.h b/test/kernels/ttm_ttm/ttm1_1.h new file mode 100644 index 000000000..4c631f227 --- /dev/null +++ b/test/kernels/ttm_ttm/ttm1_1.h @@ -0,0 +1,125 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? 
(_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = 
(taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +#ifndef TACO_GENERATED_assemble +#define TACO_GENERATED_assemble +int assemble(taco_tensor_t *A2398, taco_tensor_t *B, taco_tensor_t *C); +#endif + +#ifndef TACO_GENERATED_compute +#define TACO_GENERATED_compute +int compute(taco_tensor_t *A2398, taco_tensor_t *B, taco_tensor_t *C); +#endif diff --git a/test/kernels/ttm_ttm/ttm1_1.so b/test/kernels/ttm_ttm/ttm1_1.so new file mode 100755 index 0000000000000000000000000000000000000000..911c44fa1340a6a884b424eac6af27f998d5bbb9 GIT binary patch literal 14400 zcmeHOeRNw#m7ixRjhd*E(xj@0cMYzshEm=@qW|;1-vGB(fWj$g^&v%Vg?z={p z8K#q}D@@7^MIUmV|ZKK$grbw>R+{^HFIUp~L2;geJam3wfJnZ>vc z>6+`}V9ipuvsrs=k*-nMrMPNwHMjoo^`}4k^cSD~<=^edySoSX_B^)qD-TZ``tieC zuR3(*BR$Or)8GGA@eNPM(l5O5oA3YLGy0uRhMJClya%carbTEl*|(>w0O)oQ{B~UP z$^Xj&_>u+eA6P)1{HNoxr)dE5)w^r~`-uhQ8-eS%?CFXH?A!;z)plXEU#5|41%9+_|Y` zAe%{dCb|dGR@wX|@#JWN3?v5nKb#gD@7dH5AIN5h;=}n&u75CXRW%QuNss0#;B+Px z&txA>P;*Ep`Js5S??dsPME`(D4h{(**?2dq8Q#7*ofuB`MS9T4xo~s3Cp(-LgNcEG 
zY*M7s)WNt^&ZRRW+2MFj^bDsVkRBXK=frT@5|r&8N(_Tb4~Y0kU;kh{F*1@Kl)aa1 zGCMew&!xpce|Imkfst$=L`;%$>qI=AO5_q~eD}zR>U12Pvbwl75VSHO%Ea26TjI9_ z*3RT^3EV~lM3dNoJ*`IWS2c<*0@eP)Kho476P0sQh#>gM>u$i7yhw~FY_g7chu8t0 z63O4I%7|al-;bH%6#EpGe^)>5gf-#`CC~RKp01NiNDE2bhZyJkr^|*PQ1U~%F4b6` z`KeVFPWKMAl%-s$0;heO)2Rx4DTAPAD{wXSWofOdmu#pmNnEeM=^nwUt^((K8D`ilmUtST=^@E~jd*g+beH76 zL_E1%jEs)HvhRjG5@*nZXvv1?ew3U=m~gN?X?P538T2- zVJK`dii`R|ji$4P`C9G@xaRrGEZ1D}V0mg*<-rN^$}A845a|_fyHT2{KM#H5@Og(} zo-z)X?liQc#!KgOwJ^9x8FZ6DZi3z3?hXG9F_B+mjBR*;a$Bf9Zlkb)+Ecn3T`U>+ zIlAau;5AZ5$Aoxu0w$xD_=D3ukg)j31CjfmrO}FPiF9t+Z0vdci(rhxs#xLCx|C6H zUSCJNx!3;fd;R1tRj$haueRoJ+;ICKqi{;_edxBXP8eE7W6NS-NOf4*GaJMP{4?`WElMB9J= zvT=9{PK)|$BgKtn&>e-(Sgk1Rm90QsqefwyzYAp@ho_t%y74Ka=o}-hI-@`pkBvKt z+X`k!tk|cuhF-P&06>pwQ%z`!cmJEzDIj&tg%yXI3(-T)rsuuIp8;*%SN87z0WyUg zdWPB>jpU1CB`s#A{J~fu>;j%sq;LN7;}eiAib=<(LQ_&0&s`4uTR4@A6A z9YZ~zIvw%)oJUW4-|zg9G4{M>G@Ur@-FiI#%IOyGGWU1d#|~?on!cBR#u)pyGv?Ul zuQi$m{4Vdl?U)SI^kDng5v_gfoX@-OcX0n0+^FG4_rVh#26A;_xM$S+#4(f_aKMmJ z*x~Ox?OoRCcat0x~cnMt=3e^Uux_*LUZhc4@R~{9*jH` ziBFVHyv_5mu=u8nCJxN@&zv-pCz{+(?pS>?N-HZ`bYIkJ9*-O@t!mYdM_xMHo}9eh z{p8NeqvlaK?DW{5wOv1zligavQFDC9uPb=-6jnsV`ot$PZ6_BtLeEgdWj`v zey`m;f&H}6%=vGo_2Dbsfc~~8e-t$v4Raz^yjdPS471Z;8#8}ja-kHICMuRmi>vh3 zpC!<49syfhDuFc$5Nwk!6S^X~4`Qi4lW6oI94u5T~t)<|+gC?h|*>pBxHEa+q zrgJ^54_|Y^*|=}K#RQgr<~bKxd7reB^OKdGGgk7?;`V{g{rnZ!~GJ0u% zGUm^zn&~d&<#fVebgLqUH|&3ZsCHIxP3Dx>7<`b(9<#K8CJK7he+@JLZje+m9VS>ulY324(BL z`+tXBEMgvu6x-nbozBo1qqss2Lg}5eqrbJ4w~Ddu0{ z-TyflYc1S;uC>YI-QOiy|^KEQn`>_RW zGJkgRN}lgy6Yv~%SL`Xp6<0)>R_<(wH2u7_kZKUSe%Y$QoySbKzdiYuarjp?r905w zg4=%*+n8D})ZSY2KW~cp-Ppi!AM5n%&1URN&UTD```A&?cd#LCz#fGis8OSRC>TbQ z8jrs61Q*cU4-Mk%tyHoItBaTSJ~$OpYEu>~8Lt}Z-$Tm_Hg1|0UT z!~zbz5PF&BiVJhyFp4?9&$|*_6arCieJ9xZm|yqS!^NeEGvtMse=Bxww|of5FGkbQ z>xt64QKsgVQFs9N^+(Z&JEwK2GRqGUPtTE9z!UJecBSL?Ay9hWKJv$M`Ek&_koh|3 zLC_SQ5)Z;<-v#f-HgXk)X)S0F^dKmmTV4g-3c3RKmVXA#fzrFI6>z8y{U3m?2YnT^ z19Sy!QPMtqpU_5q+Li9bt^<(w1EY5;L9}BWRCs)Cp4z*-OCNRZ7I(hu_8V{YuO^uE 
zqqvSkj`lZBF=V#k8b`Z7j(SA1$M-}{v!`~yW1~keI-5Opk1vXNf*-vo;%OXPywNkI zIZ`!CJdMC2p1Nj_4q50ldt4DZm(V)@3NC?dfX@YljpI6pG34_)Vf6fA(B=b5iD(Je zUSRV5!)#kxXWG&>)0VqDLk>;zG)CvN1UBk$t%ZO3;O8#nsb7!RL_D=0bwoV+n6swS zQyY=Z#ydY0cR}_ukd^OMX6r@&=J+7$Iixx6As@8Nsd*#$pcdEDu=hH~RKEX+dWLGA zS|U9}{r)b>4!lz?Q=SshDPWJ=us;B+`$M^Gdj%qOF8o~Zzy%Ln@W2HRT=2jJ4_xrT zLLT7v*8HBD-%I1sLMD0^rNsQbD&w}`neR3HZkfNwT*2j6>oRwx;`y28U5e-T;`EG4 z$z>7o$MfYZWlyR4_*wBRo#~PI-SJj>CPm_Rz@Jv~{0^L+wJ6QJ%hq+tD>dO#^8EbF z@6<1sQm9`6j0aVQ+c~b}_}w(uha-?||Dy?}5&mv& zuki-ev2}{JDSE%6J&Hb}=tmX(2Sxu?(QhbvR8juUv9YD)y?Wi2ZhRZd>+cEN76>-n znwMN?XQ(j{Tw8CU;%2nWa-QXSp0Eo&{C)uPmNTs(cJBr7HGB8S=qdbYo9*_r&)g5( zv6BiPqO43#=i>8}(Uxh~dzLSCc&_wpxqO-C!*r<$Xb#VNm)^eg-B&ErE?pe1eX{uq z4Xtym6&BvUyar-UzgW8T4rszk5*I#Gkv;8g&n5WPXgcE30)aKw|B>P38v4MNOs+|0 zGd=yiYx-|(TnF9ZTu)*oH<-xv1(MeCytsxt;rPDL980thyjX+pWE$cC^p;S_a-`lc zh_9Hv4XJD%U-B9ng8*{*OnO6aI+GsmPlBb7i5o^6*EOtLt2gxW|9fuH8~Ez!&i4Zp$u`K3lnq0}*<3oA%MJq^>Q9Qn(d5Xm_$&QcQ{A04(*7EBy5T(yt%DuW zV25Jl6j8-!Vtn0NG$E58ZRpM9m7fGYW8yRENG?v!5&6tWe{Uw8()%+x@tT@`zg6^N z@rm3)=aTU5Yd^=(o2sY=MHq~##QtTOs=~j@I512cZ&E4jz$D;(t{Qg;-ruTmr{I0C z8eb%MKdHtq61)#p$BTmg0FZP;}x5sn61AIQeU5*35eDh}w z@YURz3i>`NKhOQH`az}oQ3GWoxqMF#9>`|H#^Slbc#;CgMi2^>%Eo&KvfYV+cnVKQ zBk@FjR8SbzKsuLB1>Uo+sd0WG8ITrF3=b!^BZewByd5!H3B;16@`Hohp;E!g2(~KG zY){WXHj(0{<#L0ecnED&@mqm78W8WvAP_4aZ@VwDsWskuca&n`;z~3ANCIu8*sWQp zC!a|&mKw>%`x2QH#l=P6e|Kb4dkZA-*cop%ltm+YA0UL#vF(kqdzvG$_&sfHn_D~M zoss5PtF1-^smWNgnO0Dh3Z#g&aG2x9C|c|<1?$a?!J}9^=u>PS1)y2sZ5+?WfooO} zpe^=pw$`;OWNsFrcFfi_8^BjtQ+2o@1@c*udDT`_6kau08A&&99N%o+Rk4N@)h`e< zXbbO~Bc);prH|(gdXzPDw4kbwLj2-Y!H~A@$WTiW7}-9Ekp|5TTeOdJ8Jr{nc(4!1 z<3K}yN=U9RG14ajsqGnPS~Q0P2FX2=9vW~!Jt%7)Ve2~tqGp@EzT$l3$Q270q# za}<3UkV76w_o)Tdmx3~LmWs8qEJ@D8a$*oi9kN6g(I|?13?T4P1Td=xv1EazyYszJ zNo0Dl7bvzr)05?VqPu%I{Rne7)}%QPhj5mHIF+ulRIzLQ-?Q|jNUt`qeagh=6sCTK zGqH}=%n1=d7M}>Stk35erYR+gZB-_If1+6YC4mt1&W82*+{5&cDxi9COPG02#81j$ zU}T&1`FzBb-wQ!RCbChP3^4SLhV}Uz#gy7diQCU|Ot+wn-uW=k=PjmPq>M!ECmDPH 
z$v<3%?9920&u8NVB60n!&;8G!jP#pH2#McYF?Fd9lRlotX6k4CQD7v(@6|XCo2mVz zLUmQ95iphdd>&*vUMY+`-`|)B4?h3$ z-;db)%lb^e30-<*Wqm#;yVTK- +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if 
(midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +int assemble(taco_tensor_t *A3056, taco_tensor_t *B, taco_tensor_t *A2886) { + int A30561_dimension = (int)(A3056->dimensions[0]); + int A30563_dimension = (int)(A3056->dimensions[2]); + int* restrict A30562_pos = (int*)(A3056->indices[1][0]); + int* restrict A30562_crd = (int*)(A3056->indices[1][1]); + double* restrict A3056_vals = (double*)(A3056->vals); + int B1_dimension = (int)(B->dimensions[0]); + int* restrict B2_pos = (int*)(B->indices[1][0]); + int* restrict B2_crd = (int*)(B->indices[1][1]); + + A30562_pos = (int32_t*)malloc(sizeof(int32_t) * (A30561_dimension + 1)); + A30562_pos[0] = 0; + for (int32_t pA30562 = 1; pA30562 < (A30561_dimension + 1); pA30562++) { + 
A30562_pos[pA30562] = 0; + } + int32_t A30562_crd_size = 1048576; + A30562_crd = (int32_t*)malloc(sizeof(int32_t) * A30562_crd_size); + int32_t i1543A3056 = 0; + + for (int32_t i1547 = 0; i1547 < ((B1_dimension + 15) / 16); i1547++) { + for (int32_t i1548 = 0; i1548 < 16; i1548++) { + int32_t i1542 = i1547 * 16 + i1548; + if (i1542 >= B1_dimension) + continue; + + int32_t pA30562_begin = i1543A3056; + + for (int32_t i1543B = B2_pos[i1542]; i1543B < B2_pos[(i1542 + 1)]; i1543B++) { + int32_t i1543 = B2_crd[i1543B]; + if (A30562_crd_size <= i1543A3056) { + A30562_crd = (int32_t*)realloc(A30562_crd, sizeof(int32_t) * (A30562_crd_size * 2)); + A30562_crd_size *= 2; + } + A30562_crd[i1543A3056] = i1543; + i1543A3056++; + } + + A30562_pos[i1542 + 1] = i1543A3056 - pA30562_begin; + } + } + + int32_t csA30562 = 0; + for (int32_t pA305620 = 1; pA305620 < (A30561_dimension + 1); pA305620++) { + csA30562 += A30562_pos[pA305620]; + A30562_pos[pA305620] = csA30562; + } + + A3056_vals = (double*)malloc(sizeof(double) * (i1543A3056 * A30563_dimension)); + + A3056->indices[1][0] = (uint8_t*)(A30562_pos); + A3056->indices[1][1] = (uint8_t*)(A30562_crd); + A3056->vals = (uint8_t*)A3056_vals; + return 0; +} + +int compute(taco_tensor_t *A3056, taco_tensor_t *B, taco_tensor_t *A2886) { + int A30561_dimension = (int)(A3056->dimensions[0]); + int A30563_dimension = (int)(A3056->dimensions[2]); + double* restrict A3056_vals = (double*)(A3056->vals); + int B1_dimension = (int)(B->dimensions[0]); + int* restrict B2_pos = (int*)(B->indices[1][0]); + int* restrict B2_crd = (int*)(B->indices[1][1]); + int* restrict B3_pos = (int*)(B->indices[2][0]); + int* restrict B3_crd = (int*)(B->indices[2][1]); + double* restrict B_vals = (double*)(B->vals); + int A28861_dimension = (int)(A2886->dimensions[0]); + int A28862_dimension = (int)(A2886->dimensions[1]); + double* restrict A2886_vals = (double*)(A2886->vals); + + // int32_t i1543A3056 = 0; + + #pragma omp parallel for schedule(runtime) + for 
(int32_t i1547 = 0; i1547 < ((B1_dimension + 15) / 16); i1547++) { + for (int32_t i1548 = 0; i1548 < 16; i1548++) { + int32_t i1542 = i1547 * 16 + i1548; + if (i1542 >= B1_dimension) + continue; + + for (int32_t i1543B = B2_pos[i1542]; i1543B < B2_pos[(i1542 + 1)]; i1543B++) { + for (int32_t i1546 = 0; i1546 < A28862_dimension; i1546++) { + // int32_t i1546A3056 = i1543A3056 * A30563_dimension + i1546; + int32_t i1546A3056 = i1543B * A30563_dimension + i1546; + double ti1544A3056_val = 0.0; + for (int32_t i1544B = B3_pos[i1543B]; i1544B < B3_pos[(i1543B + 1)]; i1544B++) { + int32_t i1544 = B3_crd[i1544B]; + int32_t i1546A2886 = i1544 * A28862_dimension + i1546; + ti1544A3056_val += B_vals[i1544B] * A2886_vals[i1546A2886]; + } + A3056_vals[i1546A3056] = ti1544A3056_val; + } + // i1543A3056++; + } + } + } + return 0; +} +#include "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/ttm_ttm/ttm1_2.h" +int _shim_assemble(void** parameterPack) { + return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2])); +} +int _shim_compute(void** parameterPack) { + return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2])); +} diff --git a/test/kernels/ttm_ttm/ttm1_2.h b/test/kernels/ttm_ttm/ttm1_2.h new file mode 100644 index 000000000..86ebdb633 --- /dev/null +++ b/test/kernels/ttm_ttm/ttm1_2.h @@ -0,0 +1,125 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? 
(_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = 
(taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +#ifndef TACO_GENERATED_assemble +#define TACO_GENERATED_assemble +int assemble(taco_tensor_t *A3056, taco_tensor_t *B, taco_tensor_t *A2886); +#endif + +#ifndef TACO_GENERATED_compute +#define TACO_GENERATED_compute +int compute(taco_tensor_t *A3056, taco_tensor_t *B, taco_tensor_t *A2886); +#endif diff --git a/test/kernels/ttm_ttm/ttm1_2.so b/test/kernels/ttm_ttm/ttm1_2.so new file mode 100755 index 0000000000000000000000000000000000000000..c698ec99101bea4a4b7945cab953850f3e96d6ca GIT binary patch literal 14400 zcmeHOeRNw#m7ixRjhd*E(xj@0cMYzshEm=@qW|;1-vGB(fWj$g^&v%Vg?z={p z8K#q}D@@7^MIUmV+VKJ?_jbw&I){^HFIUp~L2;geJam3wfJnZ>vc z>6+`}V9ipuvqgJsk*-nMrMPNwwY2^4^`}4k^cSD~<=^edyL*QA_CB`sD-TZ``tieC zuR3(*BfTvLQ{Vqq@eNPs(l5O5oA3YLGy0uRHZ~vscrR2JOpDN9vTsjU0nqIr`0cpn zlmC|m@Ffe_Kd^v2`A^4XPtySAt9RJ~_Tvl4Hv!ji+0zvZ*trjatL?(zKL~lBxJ-=d zdR1wIwBr`zUJL%TnjdK?zf$bB$)AGQrNS+`m3+6d_6>y(Ut$5grcWz8ND~9;CfKFa zqwL$;^KRf|Usvme=kDtsf zm`SI);yr^Yt8D&~SYk9z2I7MQA5Mvl_iXBn4Q4XKv5|Z_H!zg4s+tE+r$%!Xa4MaQ 
zr8AGlsX3&R_)sj-|DjlKd|*%{hK2=@OsoghjBMYWijO4v!@X$aT(~9Gn;A)oq4?ln zCLxk3>R?PN=ThlxW+avqy(1|Iq=vGooES-2g0kJi@ey#TK@rRL4-CcP*=%Y^_Fl4y z%+PQ?mlA^mJ$=XqvY9|5F$v186R}h>o{OXLJ=v`4bPS!cy0|tFv@(s9iFUNK#%>9$ zoypx2xQzygCb0v1T8-SVY7|`rs{Mt3q^UtBBIl+MLGY8;-GD85k;p1+vW|F%*a4mr z$=|EWh+i=HR1^+&-W*uu9Hhh3rXFF80Y(^%Z49N^253=)mWbS zsZ|zE_YSp`rCg~3r+u5#sS124gP>J-u0_S@hl?5ws zwUtPfrV5<*5Gn{&;Fb49qXK7NkbGwat{z1Q7F#QDc0J+U6*%n?ocb$pH-n(Y*y}E% zu;{gwx-j;Q=W5E6#@G?pgec#38c_NAKj7E9Duf*I{iHcPRYto0H^ghiPfGq5#M4ks zk4ye1#FGoBpO*Z~#FOi$4@v$-;>k7B2PFSI@#K=}-I9Nncp9GRVab1ucyi5jx8%P> zJh^1LQ}SORo?I~od1_YW!3pxpED!xg(ktF}qcl^09{R@N^A5v2 zWgITuX=q1{m(J&EVQ`N!=q7{Q1iQW68~z(&BEQBM+wcJ8worTAMqvZBr*t*CSTgW) zbkVoKYov~j3GwCxOhzp62d8@>Ve#Py!uLN*qZQr~?%J~1*z@`q!5D>A(ZZv3Nu%Jr zzK(cvum4k^jpIgPqaH2p^^*gQ;^Y3uWa&1)QHTX2grH@#X@R8isZp-H8^jdXw0#+#1)+_!b^Sf7?V` z;|oS1>h~41(J9ji}9Xk|r;?u**Yyp!iuO^pIC*7)6MR0?z;A5Gubb$^@Yt- zWIn8&lIBm{?*8V^TG%m9)E^x?ukCsT=4Q5ti23}EUzw9IYSl;UDQuQ$H9Z$XFR`S| z?{%0bu%9-WIseVHK76Gc(BF>4k0NH1VNOJgH_M}kVRrdzqvr2RE|h}OM8z^`b(P-w zvjjTKBVcPwC9p;T!X4(f0q90?{SQ%P4oA!;jn!xhUJm(@S?~_(?ezLN2#!CKK+&<9RA6JaJoY%DJ6c!ui+xy0M zUWe7#_*-i|8im%nv2Sa471ryRmafvlKb6bPH|;7Jg^$#gJ_ZIES81n}sV(KLj9wa~ zjQMk_X1W`BIUd&5HuieIwejrmcSZ`qLdXRdYy(smF7DJK7@e&~LORYc3hgfA@ULqk z1?*Mr*gHb5h`GbR!YE*)4#~Z%t`wJ59VJJGj-l()#n*(#j(Nl0j$?<qrOVPciIqF~G z-TyflYb)G+uC3YQ-QOYi*qq_RW zGJkgRN}lgy6Yv~%SL`Xp6<36tSMF>GH~+k?kZcgUe%YqMoySbKza#OMarjp?r905w zg4=%*+n8D})ZSY2KW~cq-Ppi!AM5h#EoSsf&JK)z$JkNOcd#LCz#fGis7a%JC>TPM znvn7Nh}A>4=iueXHONF@&J7z|;m3&gsX_m?j>J0{NyomIcdmxkR+`8mx~quBmubB_ zR_rlX?7@9OYd-onr=w_V$$X1uSI%!l(Nv>wzl#|7_!gp_QS&a0e|r$~xxaC|Vme#% zw(*7OCbYvE(>q}a4Z#qt-#6}kq8^)KWBKGiHI}CzhK!v5*cbXMZnL;G$$PBpP4CVJ zg7|3<77R3?-4!Vi!BiXt4f&#l#VAFV*cKOUE3U$@6ZxQbCAMG#)76ECkE`Hu+<-&g zl~}-yFEqYPbH#*3wkT;IzE5bQKJ7~PV%Gu4`+?Crl_1(N4k|ppc2DhH-ldPac8fdTb^DFC`d1T7 z`Vm~mAxHZgrzkSpaE+tgA4ff+#p8RTro~gc-?7o77o9Dhy2lrVJ;9G&6!tWYE#Byv z(j3W}C7vc=VNYF)M~5tQT0E{Wol9t)e+8GoHo)fs!p3o(!x-{;oiKX-Flh4ur9`xZ 
zYcDYQ{$aK)Z8L3YpJ~foo?(Zkd72_~S^^t&xYoiy{qS=)^3<=#Yr>w|k2=C0eau%$R9wtrGJ6p!hW=Xy^m{-7$yBdsh~-BVWo{|J9K zx7T=s>exC(+ZDZE(OyL#QS_sV{)3|bs^~WqJ*p^w=h)cV`d+PLdDH!ndLmo^*mu0diebShN6Y*>d?Z&4=ky6VM!<_b$DC>ASC3rd_%?RQqJh z6&hOSSSu{NV|fk4oPM!%=^fC7l>{z)rXqXV+n!7CtI>4Cr3C_OtpC}O#2Wg*mPo8g zWYWC@eQO48ZCVH2kz8*)n;VMf`U44Td0t$@op5|#XpSY?2VShfcQOre0D5aLGz9_V^6AuuzEnCjGLQgE9}_o>Hmz$|w^ncHEqBro>+mjy{ zOb)<|)ik}KH=SvaA1ND#M>4roB9|EfI6ROLL!*i8i1;i0SySDeHPZeXbh_a^4Q)f6 zkzl7{#-+$ne; zti~4!-cPFWiv;gO)%aq;`&l*an%VcN@wwmUsoa6x*QdNzEx%Mu&WpQe;)<%}=YG$( z?0aV7ZK~zHVtk(U@4$(fzt2|7FB5#fsm3ow9FblL$-F}>7yKQzvd}?qOVoX&8ox~N zzE+K|oD;7>Q>0pWgS)hkqBi^n+@@e0K(HAwT>V$~55>M=nFAA}v%e ziOg5;>%cWcc6GAjl`R^m`S9g*vq3zOectL4pSv#VrDNx=M~dqqJN7srL*jGS%R>^M z+s=^%@Q(pkAw#N17WqfudKF?Z&nzH+LgH>=k0*K+_TJn_^cBO*VVk`bHXdq&xljA#f!bBjt%FlDZtA0?ae$+r&Hka@1#RJ)F*jOw#6iZOxSQepB$xN(oFw+wsj3x1e zl#Rvnqk_Vy22;6IGVq>t%}w(Q$$+$2d}JiP9Whk7k?n}tiX)aRnI9V34wVW{MzB?h zW_o)EGw~!hEteZ=M6gvtR*d+qKnx9t^`;Ss6^pgs7v9tsYr8u_v2Zb^8G9s-wo>fY zEYzD%Cm2g+GqL`7I!SSHk@w#n-qg_wNj!GO+6-mUh};JVA#`keWAvVua5Q#Ld;8|L zu2@&NCE8}I5kYD))@-H~RHXtbVl5QnxG{ z6$EIDy_>CdtqPf&MW`LKbZ1_9SXD5jtvfQ*QUtQwhcMEhxe<%@b1sdOL;w%= z0eKu~7)T1q^~baQB9PpkhNeYxIAD<6BdL+>Kqg(q#85Vp8jO>I$_)?ZL_pRaKsL~q z0h^=f%YYp6K&oFYsQx6BnX^=^m1RkC9+u-nIO>okvWP}e;ImmCq;U-f$dW!KBqAC zE1ZdSv}R6-0J8W*m|=ZB&oE6YQEaO+@%t0S;x7q=pm#Q`&*vVdhg1R8i(A6Xdm?^P z4gn+Etk35oru<$AA~KPU%4C3{cQmZe=P0JsK1$qvmSegFW%SO6c|LD3?IvX;YCp-? 
z`%nJiGGu4YWqdvxClHD2XMOH}8fB#4LPALV-ioP9b(r+=G&WN|>yH8>5q_`6ao9}l zCl#uzGG)P3>hpP!>3F3u@_c`1D&MmJ=lc2F$dvobW|(LCF%(qR&*u-O-L`Wm*UR>b zHhq4N$<(GUea*T=|6b|y_}l+ZBv&bnyxo7Fx9N|o=OCv1cOGnp`_259k)c;gJU{sS z%YQ#&?=R~!{U&tjk(KrNoa|CZKbC`tOw2Q#07_%W`h1?|zt5SZ!9n7FvmUqWZ&5}f ztk37}sa2#Ugf4$)e^{R>owq9WXU^wJad=i$=2(yE4{iE<9vW8qEXQW-{7+Fp7`LC# zL8A-kD|fItF@+pq!5PHLuztM_XMLvADCc~o=;w8dv)i@Ho`sklBh8m|`W50Ki_HAm zWt{-*7zwYr)>3Jo(X=vl*-IDD&nx}DzoLJQ(zo~X6{Wuv<&}w9rgX05e5Ks4S6kej zl_2tVS&AjGW2AD!b(W%Tk(pn+Eb+W|%m^W4DE%qAIV0(~nsM>` +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + 
} + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +int assemble(taco_tensor_t *A2593, taco_tensor_t *A2398, taco_tensor_t *D) { + int A25931_dimension = (int)(A2593->dimensions[0]); + int A25933_dimension = (int)(A2593->dimensions[2]); + int* restrict A25932_pos = (int*)(A2593->indices[1][0]); + int* restrict A25932_crd = (int*)(A2593->indices[1][1]); + 
double* restrict A2593_vals = (double*)(A2593->vals); + int A23981_dimension = (int)(A2398->dimensions[0]); + int* restrict A23982_pos = (int*)(A2398->indices[1][0]); + int* restrict A23982_crd = (int*)(A2398->indices[1][1]); + + A25932_pos = (int32_t*)malloc(sizeof(int32_t) * (A25931_dimension + 1)); + A25932_pos[0] = 0; + for (int32_t pA25932 = 1; pA25932 < (A25931_dimension + 1); pA25932++) { + A25932_pos[pA25932] = 0; + } + int32_t A25932_crd_size = 1048576; + A25932_crd = (int32_t*)malloc(sizeof(int32_t) * A25932_crd_size); + int32_t i1543A2593 = 0; + + for (int32_t i1547 = 0; i1547 < ((A23981_dimension + 15) / 16); i1547++) { + for (int32_t i1548 = 0; i1548 < 16; i1548++) { + int32_t i1542 = i1547 * 16 + i1548; + if (i1542 >= A23981_dimension) + continue; + + int32_t pA25932_begin = i1543A2593; + + for (int32_t i1543A2398 = A23982_pos[i1542]; i1543A2398 < A23982_pos[(i1542 + 1)]; i1543A2398++) { + int32_t i1543 = A23982_crd[i1543A2398]; + if (A25932_crd_size <= i1543A2593) { + A25932_crd = (int32_t*)realloc(A25932_crd, sizeof(int32_t) * (A25932_crd_size * 2)); + A25932_crd_size *= 2; + } + A25932_crd[i1543A2593] = i1543; + i1543A2593++; + } + + A25932_pos[i1542 + 1] = i1543A2593 - pA25932_begin; + } + } + + int32_t csA25932 = 0; + for (int32_t pA259320 = 1; pA259320 < (A25931_dimension + 1); pA259320++) { + csA25932 += A25932_pos[pA259320]; + A25932_pos[pA259320] = csA25932; + } + + A2593_vals = (double*)malloc(sizeof(double) * (i1543A2593 * A25933_dimension)); + + A2593->indices[1][0] = (uint8_t*)(A25932_pos); + A2593->indices[1][1] = (uint8_t*)(A25932_crd); + A2593->vals = (uint8_t*)A2593_vals; + return 0; +} + +int compute(taco_tensor_t *A2593, taco_tensor_t *A2398, taco_tensor_t *D) { + int A25931_dimension = (int)(A2593->dimensions[0]); + int A25933_dimension = (int)(A2593->dimensions[2]); + double* restrict A2593_vals = (double*)(A2593->vals); + int A23981_dimension = (int)(A2398->dimensions[0]); + int A23983_dimension = (int)(A2398->dimensions[2]); + 
int* restrict A23982_pos = (int*)(A2398->indices[1][0]); + int* restrict A23982_crd = (int*)(A2398->indices[1][1]); + double* restrict A2398_vals = (double*)(A2398->vals); + int D1_dimension = (int)(D->dimensions[0]); + int D2_dimension = (int)(D->dimensions[1]); + double* restrict D_vals = (double*)(D->vals); + +// int32_t i1543A2593 = 0; + + #pragma omp parallel for schedule(runtime) + for (int32_t i1547 = 0; i1547 < ((A23981_dimension + 15) / 16); i1547++) { + for (int32_t i1548 = 0; i1548 < 16; i1548++) { + int32_t i1542 = i1547 * 16 + i1548; + if (i1542 >= A23981_dimension) + continue; + + for (int32_t i1543A2398 = A23982_pos[i1542]; i1543A2398 < A23982_pos[(i1542 + 1)]; i1543A2398++) { + for (int32_t i1546 = 0; i1546 < D2_dimension; i1546++) { + // int32_t i1546A2593 = i1543A2593 * A25933_dimension + i1546; + int32_t i1546A2593 = i1543A2398 * A25933_dimension + i1546; + double ti1545A2593_val = 0.0; + for (int32_t i1545 = 0; i1545 < D1_dimension; i1545++) { + int32_t i1545A2398 = i1543A2398 * A23983_dimension + i1545; + int32_t i1546D = i1545 * D2_dimension + i1546; + ti1545A2593_val += A2398_vals[i1545A2398] * D_vals[i1546D]; + } + A2593_vals[i1546A2593] = ti1545A2593_val; + } + // i1543A2593++; + } + } + } + return 0; +} +#include "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/ttm_ttm/ttm2.h" +int _shim_assemble(void** parameterPack) { + return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2])); +} +int _shim_compute(void** parameterPack) { + return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2])); +} diff --git a/test/kernels/ttm_ttm/ttm2.h b/test/kernels/ttm_ttm/ttm2.h new file mode 100644 index 000000000..40f1400d1 --- /dev/null +++ b/test/kernels/ttm_ttm/ttm2.h @@ -0,0 +1,125 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if 
_OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, 
int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +#ifndef TACO_GENERATED_assemble +#define TACO_GENERATED_assemble +int assemble(taco_tensor_t *A2593, taco_tensor_t *A2398, taco_tensor_t *D); +#endif + +#ifndef TACO_GENERATED_compute +#define TACO_GENERATED_compute +int compute(taco_tensor_t *A2593, taco_tensor_t *A2398, taco_tensor_t *D); +#endif diff --git a/test/kernels/ttm_ttm/ttm2.so b/test/kernels/ttm_ttm/ttm2.so new file mode 100755 index 0000000000000000000000000000000000000000..16a3d2542141b1b755548f437f4ba19dc7f1e9d0 GIT binary patch literal 14400 zcmeHOeQ;aVmA}tYh=_%r0!c6>5D^}&16Yy4q;`{lWlQ!GBNIa$8rp!O$dc`?Eg4&e zgnIL_ido(&3fWy+nC|fmE>D=?X=bZcT?)N+16%03(DGHOp#%^T9ojTt{>P5ovY7qgcXSHl0 zepj+9xa@+Nq*<}Ci%m#U#ylLLJkG;&_A<_bdU!6Bq+6*b%sGNUka0|iPM1J$N zi$k=Bq@p^!ZcAXphNtJ>c>S02!8a~^e8&@yHJ^AfMOjd~3lEuDi04s{ z;yB+|R_?SnC=V>~D3o>~o(en-!57{-^e>0L`@}2%FgW1c*0VS9K>1^Lk3M?x?#;^| 
zJ@e&6!$a{CKh6EY)mr||<8PjL$F==GzEWK`_IC-W&Knk>z+~SVmILruLGU~9%q9Pw zdGJN^*gr6jJhh((k2OpHm@D7K^VpBhBVP;LgU1>!na9p;5L{^$2LDCKyV+8<-Qy`q zt>Si^Y<#f^e@*m{B$r>tc3b2pA$B2ivR)y-RapD6z;~-AAjkBS!0SZ^y#{tEY!mja z<@pS7vhNY&MfU4;0`H|xig3^*s1wPb&(6w$D)2Qz-eIE*u*%KUhvIC<0 z8%4si0-qGPE)^n?j-FI9(w~m@r6Um*=}LB`StJ3%BB8dHNPE05-qF>cj`y{-GNT$1b;$~KJ;K}&*bP*g+wnvhw zd!tkw(usX366^d@BoXcEX0e`L1|${PhHUzFY>Y?yVx9g33OO5Yh$m8gan=*m5;mYZFvOm=qNwY*>90Kv4{&<@8#Z5t8?%rr0xOg{<^mlgkM56uu@g81#&c;$b zy#whu>+agtfvC4X<*glfH~CGj>k2%ZAT z-zw6GU((fe0j(560vq^L7goYD_MniL>yzxR3 zwZg<{?GQtm%jJu3nzv<`EW*nr2zs^%M}?+Cg~*p|h$`_kPZ3UQgbbBMxLn&P%~ynr zp~Y2di*PxIP(pnXUc4^qMYwDWl5Z`-#a4u1wz&wGO;7mNBAn(389Iw_rvyRu%v%mU zyWq8D9;Ocsr^^cCdgh2@lof860#vx_J^ZRG>JcNplQbtz6%ej^lX!*ranApaczRV6 z!<>JScxu9lL!5t(cxt+dM>+pA@zgXE2RMJ2cxsY~-JE}dczQh(y`296@zgXETRHzd z;;Bg{S~>qs;;AVn>N&p>eBuCj9@Ot|(LUM});_A=&6xWAHtnO)gjZd$*Gyc&^xXQp zp|DBME$9N(>(1)NYw1hSG>4a(O>@DWg;O&scaBo4%<#~!CcWGZS93F~&Ou)vIcL+2 zNqr=Lldc@qpFNkZfWbY&ppy*BB3SM1UjJK&v4PckX8q?Wwu#E))U)fUJo(E|#k`K6 zqYHitULkcsk9% zmBbr+wV!}i59`@xMmWHkNAv z<7e6fBmnpkP~F(4(JL?lS~%OR<$5nitT!{Ps-em1!pJ$h9y^o2;#{GycUax~Ep(Pp z>{vLvTr(#0kyEyW`p!!_6gKA?3m`r{_Egw-J7gRQ=W3pURhSQGt)Y=u?SA7IVdGSb zG1X$c95P0;`^++D_wwSR5c^0P@NQ5jrAJQLLE804^qhSUnd$WBf(1P@tPtY}8z)I`wQ?tIw`GlnorR*Bvtk zH8QtV%{4<{=p}XkO`s?hwUM4hi>e2YsRswOFfYi6ZrC;BWB&g1*b5}k8ZcfD&*QUf@Jf#B%2+5DcnZ@3`Wb{+=CBgV9Care)Ma*eSQ$(B)jOId>wc9`H;>)>-w8D} zl5NwL*R^TxgxY`7LGYBi`Du0UAJGg*sxeLma)V2mdhl>dtT5H6y6qnZay7n|*cmW) z1m0(XT%QuGdki$t^DJT8P?~xwJtvin^ybF;sQ~Qp<$81{otMdulW(n`?DYELD zwrZa(7~q1L!wWi!-F025(QMsXcrA6Kq1Qhypp&B8K>)vc=B!&Cx*H&{ zw1S^@H%17O(thLD0pk^t?X0ic%lm(e@r?T5HDn?CLvwUyf5t~AO0h+AW9Wp9HtPFM zEq~en(3qs57{&{7;4MQ!9s1Tsn0EX~)6t1(qG;EzH)oE4YJJa_1G`#oDUDKJRP9%7RWiz z3mRkok^G9FGUk8wY$!Iq*7@+xivz|{O96UgDFVX%7Qr@Qotgm_%`k=4;H# zv8g)en44ZQ0>*Wnkv|Dm$LpvF8BYQ5=(%+-kb(S@NI-|VHaJw^ z10RW%M!#F#zYD4UTyr57K#|Zp=>5v5$Es2TQ_U@M$X!ULuZh-PTl`|EVzE-m_OHq=HF?rKBMQB@RyK(`%IzmBzA)=Z-X_w 
zw~#TS?tcmLdaluq6^Gso1Eq5_QNCyML&eQkJ)b%Lk&P#8jG1y%Z2{n7X^oLjy>U>t8Tetr5<+21pxXgRG?&pc(%ExE*Bw`^yXzwS4|Y*8V^Uv*mdyYBJ*sSiU%4YFpcjmZtO8 zk#`FP+I;PW%nv~y0&T~R;~_NBGvI5mm@LO@34r=QzYR*~*7ra+gKAiDz74t;l>6MI(D<0 zKDGAh>$Q~xlRjPx8-pCpXEKBl*@9;n<$lDDNQ2A$U|EB!V!y4~<;mF_T$TG4_+7sH z&-1%#GYgwtyUT1F?TcKs!2GVt2A2o2&}ncvf^;sWas3!ktOW90NZ2qQCop+mL)i0p zHUrZ^DG<%$c?cN4f0`*vaJno_(`C8Y)oW7}S8ZTcNnoQAPYv2gTo7DyZ*Q1JULmAqjaaPXF)CLuJ4#D0> zc&GgSE8yB%_MJs8hrgV+&of9nfPD(J0R^I`fbFwj?*XfvDiki_QlI>M(gL5fz$Y#6 zNeg_^0)Kf6$a`yfPc84Iv9$0&dl3qfzg0w>COrMUM&2#U?=hFi^vgUvcB$ayn)WHd z%X@L!qfl^|guQ>RkfP|M$WQJS&e9$PLEas2#@>g5yaWE4ke7GhwANCfi&!2!9!`WB za|n64f0B3V7n{;-QUDV7iHI!cu#l5?(=tD7d3pKAMM1Gm#Mm7BKV+t`5T zv)&uLzN+g7I9I*1y4LHfS!JT^T9nLeJhSO#hn;Wp^99J8jcLAO=UxCmuvU+ZPU1(| zXtk&O^X!Na->r&UIix(?ybeA%(Vsm}2d~Nw>E?KNx zxUjzB;f6~Tl+ISeOgwZ^8N}=wD=)thny?bXgU?iCPx;Vw0e%(gj(8NWceVMyzc03$ zKCs1Nt7EBTqN`(d*Y&k)pxc*DMElb{(R8ObW)9Eu%VZ^NCzxVeq95#^b@9Id#+dW;$ zGzA zZewzOE5+?h&V!}+0w(8^Qv5t7=b=)3A(Qi2Dejn__e$~E-{&dahS}FGTCG&RoQ={(T<#=Q-|VR)3=30iVmSYJ$tpo|oR4 z$Nr1hqmbW|)eozj?icx5{mAH^Y%V)L7xGsBtmg&p5&c%46JF%)3_mq=Aqodg z!d?;bR)4M!fqR5ZF{1AFfdCsYpk6YFo5B(vBTcei2p2W(X*OL7%2I&juU20#y~fB_>`%Gs&Gx_} zPdn(7XCAqqna*v}n@zf{nJ$19&%2piYlOqx3_|6Y$!o@auQ;btXG3!1GyU>Ptq32y zQn1*MZcZ=0nY>Fp4U4j$$7Rss+&4=~co6b7p3~)#=PZ2%MSkSq7b$Utv{Z*XS~73{ zjvl;e&~%?kJ7p}1bA%V$dM`f=RCTp8&UHrnJDInAM-rMQP2+e$a`(jh`nyue5+;JQ zzIb<(6hy4IJI%a2cQ2yejuhB5`CfYYEBD4b#c=9uhqB~M6?0sfk}?j<(HU z(j0Tw{{^C*B3*4@`s6{LQzX>{E{S=xmKw%GNsV~n*lFEA_i10u*ih~Y@?r5aGJV!}N<)a|W zFXbfNgfzPIk-R)_NxGGk5vcqmW34~6k4!^!`dlW@XTt;{$o!?gtbY<|q~Ab72=d-a zQu7kT)TgD?s=pl=wTZk}lYZEeTDGM;FNjZve#DCP<#|xj;b}=TBG-3G`8^AgWd8Ep zD5)&3G$VOQ{}u_w`OEW%q+8{MpfDiwmG*KLeR+>5sr493ZCxDx5e(%o-@n`+OPVeg zM%=3Z4U7J;*at}}|IS02k@c4Rzac_bO0s{*^RN8-5o>*=zNC*sm$sBrU!Id4;^-&k zAi{&>B^?Dy?@sE=^R)c?oN;O#1X*vXC(HFWNFx!cFVEelR*)8B9{xM?BlRVH21&*G z)8})c*gK=jV^UAj7cBbn9N#PSrJOWlRC!^~Oq*yQvsN+$u}H1XhexK7FOB=rPIk zuT_?KId@D8A)*WYQ?xiEc<|KWk^N83|1>|zzK7^^sJzCEZm=X;^QS)pvWRfSdud@u KXs{?)SoXht2qxzM literal 0 
HcmV?d00001 diff --git a/test/kernels/ttm_ttm/ttm_original copy 2.c b/test/kernels/ttm_ttm/ttm_original copy 2.c new file mode 100644 index 000000000..cb21b209f --- /dev/null +++ b/test/kernels/ttm_ttm/ttm_original copy 2.c @@ -0,0 +1,242 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= 
target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +int assemble(taco_tensor_t *A1537, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D) { + int A15371_dimension = (int)(A1537->dimensions[0]); + int A15373_dimension = (int)(A1537->dimensions[2]); + int* restrict A15372_pos = (int*)(A1537->indices[1][0]); + int* restrict A15372_crd = (int*)(A1537->indices[1][1]); + double* restrict A1537_vals = (double*)(A1537->vals); + int B1_dimension = (int)(B->dimensions[0]); + int* restrict B2_pos = (int*)(B->indices[1][0]); + int* restrict B2_crd = (int*)(B->indices[1][1]); + + 
A15372_pos = (int32_t*)malloc(sizeof(int32_t) * (A15371_dimension + 1)); + A15372_pos[0] = 0; + for (int32_t pA15372 = 1; pA15372 < (A15371_dimension + 1); pA15372++) { + A15372_pos[pA15372] = 0; + } + int32_t A15372_crd_size = 1048576; + A15372_crd = (int32_t*)malloc(sizeof(int32_t) * A15372_crd_size); + int32_t i1543A1537 = 0; + + for (int32_t i1547 = 0; i1547 < ((B1_dimension + 15) / 16); i1547++) { + for (int32_t i1548 = 0; i1548 < 16; i1548++) { + int32_t i1542 = i1547 * 16 + i1548; + if (i1542 >= B1_dimension) + continue; + + int32_t pA15372_begin = i1543A1537; + + for (int32_t i1543B = B2_pos[i1542]; i1543B < B2_pos[(i1542 + 1)]; i1543B++) { + int32_t i1543 = B2_crd[i1543B]; + if (A15372_crd_size <= i1543A1537) { + A15372_crd = (int32_t*)realloc(A15372_crd, sizeof(int32_t) * (A15372_crd_size * 2)); + A15372_crd_size *= 2; + } + A15372_crd[i1543A1537] = i1543; + i1543A1537++; + } + + A15372_pos[i1542 + 1] = i1543A1537 - pA15372_begin; + } + } + + int32_t csA15372 = 0; + for (int32_t pA153720 = 1; pA153720 < (A15371_dimension + 1); pA153720++) { + csA15372 += A15372_pos[pA153720]; + A15372_pos[pA153720] = csA15372; + } + + A1537_vals = (double*)malloc(sizeof(double) * (i1543A1537 * A15373_dimension)); + + A1537->indices[1][0] = (uint8_t*)(A15372_pos); + A1537->indices[1][1] = (uint8_t*)(A15372_crd); + A1537->vals = (uint8_t*)A1537_vals; + return 0; +} + +int compute(taco_tensor_t *A1537, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D) { + int A15371_dimension = (int)(A1537->dimensions[0]); + int A15373_dimension = (int)(A1537->dimensions[2]); + int* restrict A15372_pos = (int*)(A1537->indices[1][0]); + double* restrict A1537_vals = (double*)(A1537->vals); + int B1_dimension = (int)(B->dimensions[0]); + int* restrict B2_pos = (int*)(B->indices[1][0]); + int* restrict B2_crd = (int*)(B->indices[1][1]); + int* restrict B3_pos = (int*)(B->indices[2][0]); + int* restrict B3_crd = (int*)(B->indices[2][1]); + double* restrict B_vals = (double*)(B->vals); + int 
C1_dimension = (int)(C->dimensions[0]); + int C2_dimension = (int)(C->dimensions[1]); + double* restrict C_vals = (double*)(C->vals); + int D1_dimension = (int)(D->dimensions[0]); + int D2_dimension = (int)(D->dimensions[1]); + double* restrict D_vals = (double*)(D->vals); + + // int32_t i1543A1537 = 0; + + #pragma omp parallel for schedule(static) + for (int32_t pA1537 = 0; pA1537 < (A15372_pos[A15371_dimension] * A15373_dimension); pA1537++) { + A1537_vals[pA1537] = 0.0; + } + + #pragma omp parallel for schedule(runtime) + for (int32_t i1547 = 0; i1547 < ((B1_dimension + 15) / 16); i1547++) { + for (int32_t i1548 = 0; i1548 < 16; i1548++) { + int32_t i1542 = i1547 * 16 + i1548; + if (i1542 >= B1_dimension) + continue; + + for (int32_t i1543B = B2_pos[i1542]; i1543B < B2_pos[(i1542 + 1)]; i1543B++) { + for (int32_t i1544B = B3_pos[i1543B]; i1544B < B3_pos[(i1543B + 1)]; i1544B++) { + int32_t i1544 = B3_crd[i1544B]; + for (int32_t i1553 = 0; i1553 < ((D1_dimension + 31) / 32); i1553++) { + for (int32_t i1555 = 0; i1555 < ((D2_dimension + 31) / 32); i1555++) { + for (int32_t i1554 = 0; i1554 < 32; i1554++) { + int32_t i1545 = i1553 * 32 + i1554; + int32_t i1545C = i1544 * C2_dimension + i1545; + if (i1545 >= D1_dimension) + continue; + + for (int32_t i1556 = 0; i1556 < 32; i1556++) { + int32_t i1546 = i1555 * 32 + i1556; + // int32_t i1546A1537 = i1543A1537 * A15373_dimension + i1546; + int32_t i1546A1537 = i1544B * A15373_dimension + i1546; + int32_t i1546D = i1545 * D2_dimension + i1546; + if (i1546 >= D2_dimension) + continue; + + A1537_vals[i1546A1537] = A1537_vals[i1546A1537] + (B_vals[i1544B] * C_vals[i1545C]) * D_vals[i1546D]; + } + } + } + } + } + + // i1543A1537++; + } + } + } + return 0; +} +#include "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/ttm_ttm/ttm_original.h" +int _shim_assemble(void** parameterPack) { + return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), 
(taco_tensor_t*)(parameterPack[3])); +} +int _shim_compute(void** parameterPack) { + return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3])); +} diff --git a/test/kernels/ttm_ttm/ttm_original copy.c b/test/kernels/ttm_ttm/ttm_original copy.c new file mode 100644 index 000000000..2db396c0a --- /dev/null +++ b/test/kernels/ttm_ttm/ttm_original copy.c @@ -0,0 +1,225 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + 
return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +int assemble(taco_tensor_t *A1537, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D) { + int A15371_dimension = (int)(A1537->dimensions[0]); + int A15373_dimension = (int)(A1537->dimensions[2]); + int* restrict A15372_pos = (int*)(A1537->indices[1][0]); + int* restrict A15372_crd = 
(int*)(A1537->indices[1][1]); + double* restrict A1537_vals = (double*)(A1537->vals); + int B1_dimension = (int)(B->dimensions[0]); + int* restrict B2_pos = (int*)(B->indices[1][0]); + int* restrict B2_crd = (int*)(B->indices[1][1]); + + A15372_pos = (int32_t*)malloc(sizeof(int32_t) * (A15371_dimension + 1)); + A15372_pos[0] = 0; + for (int32_t pA15372 = 1; pA15372 < (A15371_dimension + 1); pA15372++) { + A15372_pos[pA15372] = 0; + } + int32_t A15372_crd_size = 1048576; + A15372_crd = (int32_t*)malloc(sizeof(int32_t) * A15372_crd_size); + int32_t i1543A1537 = 0; + + for (int32_t i1547 = 0; i1547 < ((B1_dimension + 15) / 16); i1547++) { + for (int32_t i1548 = 0; i1548 < 16; i1548++) { + int32_t i1542 = i1547 * 16 + i1548; + if (i1542 >= B1_dimension) + continue; + + int32_t pA15372_begin = i1543A1537; + + for (int32_t i1543B = B2_pos[i1542]; i1543B < B2_pos[(i1542 + 1)]; i1543B++) { + int32_t i1543 = B2_crd[i1543B]; + if (A15372_crd_size <= i1543A1537) { + A15372_crd = (int32_t*)realloc(A15372_crd, sizeof(int32_t) * (A15372_crd_size * 2)); + A15372_crd_size *= 2; + } + A15372_crd[i1543A1537] = i1543; + i1543A1537++; + } + + A15372_pos[i1542 + 1] = i1543A1537 - pA15372_begin; + } + } + + int32_t csA15372 = 0; + for (int32_t pA153720 = 1; pA153720 < (A15371_dimension + 1); pA153720++) { + csA15372 += A15372_pos[pA153720]; + A15372_pos[pA153720] = csA15372; + } + + A1537_vals = (double*)malloc(sizeof(double) * (i1543A1537 * A15373_dimension)); + + A1537->indices[1][0] = (uint8_t*)(A15372_pos); + A1537->indices[1][1] = (uint8_t*)(A15372_crd); + A1537->vals = (uint8_t*)A1537_vals; + return 0; +} + +int compute(taco_tensor_t *A1537, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D) { + int A15371_dimension = (int)(A1537->dimensions[0]); + int A15373_dimension = (int)(A1537->dimensions[2]); + double* restrict A1537_vals = (double*)(A1537->vals); + int B1_dimension = (int)(B->dimensions[0]); + int* restrict B2_pos = (int*)(B->indices[1][0]); + int* restrict B2_crd = 
(int*)(B->indices[1][1]); + int* restrict B3_pos = (int*)(B->indices[2][0]); + int* restrict B3_crd = (int*)(B->indices[2][1]); + double* restrict B_vals = (double*)(B->vals); + int C1_dimension = (int)(C->dimensions[0]); + int C2_dimension = (int)(C->dimensions[1]); + double* restrict C_vals = (double*)(C->vals); + int D1_dimension = (int)(D->dimensions[0]); + int D2_dimension = (int)(D->dimensions[1]); + double* restrict D_vals = (double*)(D->vals); + + // int32_t i1543A1537 = 0; + + #pragma omp parallel for schedule(runtime) + for (int32_t i1547 = 0; i1547 < ((B1_dimension + 15) / 16); i1547++) { + for (int32_t i1548 = 0; i1548 < 16; i1548++) { + int32_t i1542 = i1547 * 16 + i1548; + if (i1542 >= B1_dimension) + continue; + + for (int32_t i1543B = B2_pos[i1542]; i1543B < B2_pos[(i1542 + 1)]; i1543B++) { + for (int32_t i1546 = 0; i1546 < D2_dimension; i1546++) { + // int32_t i1546A1537 = i1543A1537 * A15373_dimension + i1546; + int32_t i1546A1537 = i1543B * A15373_dimension + i1546; + double ti1544A1537_val = 0.0; + for (int32_t i1544B = B3_pos[i1543B]; i1544B < B3_pos[(i1543B + 1)]; i1544B++) { + int32_t i1544 = B3_crd[i1544B]; + for (int32_t i1545 = 0; i1545 < D1_dimension; i1545++) { + int32_t i1545C = i1544 * C2_dimension + i1545; + int32_t i1546D = i1545 * D2_dimension + i1546; + ti1544A1537_val += (B_vals[i1544B] * C_vals[i1545C]) * D_vals[i1546D]; + } + } + A1537_vals[i1546A1537] = ti1544A1537_val; + } + // i1543A1537++; + } + } + } + return 0; +} +#include "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/ttm_ttm/ttm_original.h" +int _shim_assemble(void** parameterPack) { + return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3])); +} +int _shim_compute(void** parameterPack) { + return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3])); +} diff 
--git a/test/kernels/ttm_ttm/ttm_original.c b/test/kernels/ttm_ttm/ttm_original.c new file mode 100644 index 000000000..ac2674239 --- /dev/null +++ b/test/kernels/ttm_ttm/ttm_original.c @@ -0,0 +1,226 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // 
always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +int assemble(taco_tensor_t *A1537, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D) { + int A15371_dimension = (int)(A1537->dimensions[0]); + int A15373_dimension = (int)(A1537->dimensions[2]); + int* restrict A15372_pos = (int*)(A1537->indices[1][0]); + int* restrict A15372_crd = (int*)(A1537->indices[1][1]); + double* restrict A1537_vals = (double*)(A1537->vals); + int B1_dimension = (int)(B->dimensions[0]); + int* restrict B2_pos = (int*)(B->indices[1][0]); + int* restrict B2_crd = (int*)(B->indices[1][1]); + + A15372_pos = 
(int32_t*)malloc(sizeof(int32_t) * (A15371_dimension + 1)); + A15372_pos[0] = 0; + for (int32_t pA15372 = 1; pA15372 < (A15371_dimension + 1); pA15372++) { + A15372_pos[pA15372] = 0; + } + int32_t A15372_crd_size = 1048576; + A15372_crd = (int32_t*)malloc(sizeof(int32_t) * A15372_crd_size); + int32_t i1543A1537 = 0; + + for (int32_t i1547 = 0; i1547 < ((B1_dimension + 15) / 16); i1547++) { + for (int32_t i1548 = 0; i1548 < 16; i1548++) { + int32_t i1542 = i1547 * 16 + i1548; + if (i1542 >= B1_dimension) + continue; + + int32_t pA15372_begin = i1543A1537; + + for (int32_t i1543B = B2_pos[i1542]; i1543B < B2_pos[(i1542 + 1)]; i1543B++) { + int32_t i1543 = B2_crd[i1543B]; + if (A15372_crd_size <= i1543A1537) { + A15372_crd = (int32_t*)realloc(A15372_crd, sizeof(int32_t) * (A15372_crd_size * 2)); + A15372_crd_size *= 2; + } + A15372_crd[i1543A1537] = i1543; + i1543A1537++; + } + + A15372_pos[i1542 + 1] = i1543A1537 - pA15372_begin; + } + } + + int32_t csA15372 = 0; + for (int32_t pA153720 = 1; pA153720 < (A15371_dimension + 1); pA153720++) { + csA15372 += A15372_pos[pA153720]; + A15372_pos[pA153720] = csA15372; + } + + A1537_vals = (double*)malloc(sizeof(double) * (i1543A1537 * A15373_dimension)); + + A1537->indices[1][0] = (uint8_t*)(A15372_pos); + A1537->indices[1][1] = (uint8_t*)(A15372_crd); + A1537->vals = (uint8_t*)A1537_vals; + return 0; +} + +int compute(taco_tensor_t *A1537, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D) { + int A15371_dimension = (int)(A1537->dimensions[0]); + int A15373_dimension = (int)(A1537->dimensions[2]); + double* restrict A1537_vals = (double*)(A1537->vals); + int B1_dimension = (int)(B->dimensions[0]); + int* restrict B2_pos = (int*)(B->indices[1][0]); + int* restrict B2_crd = (int*)(B->indices[1][1]); + int* restrict B3_pos = (int*)(B->indices[2][0]); + int* restrict B3_crd = (int*)(B->indices[2][1]); + double* restrict B_vals = (double*)(B->vals); + int C1_dimension = (int)(C->dimensions[0]); + int C2_dimension = 
(int)(C->dimensions[1]); + double* restrict C_vals = (double*)(C->vals); + int D1_dimension = (int)(D->dimensions[0]); + int D2_dimension = (int)(D->dimensions[1]); + double* restrict D_vals = (double*)(D->vals); + + // int32_t i1543A1537 = 0; + + #pragma omp parallel for schedule(runtime) + for (int32_t i1547 = 0; i1547 < ((B1_dimension + 15) / 16); i1547++) { + for (int32_t i1548 = 0; i1548 < 16; i1548++) { + int32_t i1542 = i1547 * 16 + i1548; + if (i1542 >= B1_dimension) + continue; + + for (int32_t i1543B = B2_pos[i1542]; i1543B < B2_pos[(i1542 + 1)]; i1543B++) { + for (int32_t i1546 = 0; i1546 < D2_dimension; i1546++) { + // int32_t i1546A1537 = i1543A1537 * A15373_dimension + i1546; + int32_t i1546A1537 = i1543B * A15373_dimension + i1546; + double ti1544A1537_val = 0.0; + for (int32_t i1544B = B3_pos[i1543B]; i1544B < B3_pos[(i1543B + 1)]; i1544B++) { + int32_t i1544 = B3_crd[i1544B]; + for (int32_t i1545 = 0; i1545 < D1_dimension; i1545++) { + int32_t i1545C = i1544 * C2_dimension + i1545; + int32_t i1546D = i1545 * D2_dimension + i1546; + ti1544A1537_val += (B_vals[i1544B] * C_vals[i1545C]) * D_vals[i1546D]; + } + } + A1537_vals[i1546A1537] = ti1544A1537_val; + } + // i1543A1537++; + } + } + } + return 0; +} + +#include "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/ttm_ttm/ttm_original.h" +int _shim_assemble(void** parameterPack) { + return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3])); +} +int _shim_compute(void** parameterPack) { + return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3])); +} diff --git a/test/kernels/ttm_ttm/ttm_original.h b/test/kernels/ttm_ttm/ttm_original.h new file mode 100644 index 000000000..a27841047 --- /dev/null +++ b/test/kernels/ttm_ttm/ttm_original.h @@ -0,0 +1,125 @@ +#ifndef TACO_C_HEADERS +#define 
TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + 
else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +#ifndef TACO_GENERATED_assemble +#define TACO_GENERATED_assemble +int assemble(taco_tensor_t *A1537, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D); +#endif + +#ifndef TACO_GENERATED_compute +#define TACO_GENERATED_compute +int compute(taco_tensor_t *A1537, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D); +#endif diff --git a/test/kernels/ttm_ttm/ttm_original.so b/test/kernels/ttm_ttm/ttm_original.so new file mode 100755 index 0000000000000000000000000000000000000000..fa04aed35a00dd5622e9b53f76c56020c2b923bb GIT binary patch literal 14408 zcmeHOeQ;aVmA}tYh!YzEg4$| zho(uWVwSfLg>>64%$82mZg-dIc1o6^rDeOtjzc~u%$o4gY+>7MTR?L+e)pVnzrWr)-q{##Dp3?BgM(eih&wUQLi{4($Vw3b@v~Ys z7r)EcC0usSRMNDVztVyvWlZA$<#7(4iHkT3s_}@VyGktlenF)kv83m|*h+U_tnrAX 
zqoOL3mRe%W2};iqk#46RAtx!xQfPTS-A3{K=W%u#OQluU$pdJ-hKQl$rNQl=`K8EW-gw6 zno>HirlhRgS+CqbM^h+mAs*_s`o@>v-v7=0k3RL=zuDej-rBu8aevwO?;P28{Lalw z_MQ1+qW){~7k`p{$JJ8youhBP_y^a(&%f@gJMz~FsLmSZpul9`9+m*m>>&7UcxICS zhgtCXv)JD|i#)ZThQ}Vp0L+x{f?4cGXOXW3uHmtV#k1JC1%k`$!r(s#c{e+c4QN_X zYBjf0K3Ujri1Cr+^5?T%Hu-UAFJ$Fx-vZ0bR$=W&0^hG%fSl8(1>PeD=#{Wbp;g$o zm*)!LWWPtu7aEfk)(U)|01jF(>O}JM*o2&@0$(lU_uA~7&*jf$-xatJXHRjwoDGTc zZx9JD2wW58*QG)v(%zj)M*7mx-gG3wBAv<3G>araSR}M@eWWek8*lIIOUHXRu5ai{ zCF2{TtzB^|ZRV6nY#>SoqFtT$#98xA>sumSsZ>v-w?CQg?2cPm&44H41L*=do@|RG zQ+G$HIHVK%LL}Dlg-9aW*~MbrJq$=H(u!<)w{3_=dt)7e1PVDFu8${Dy>ZqZ?dnR! zSX-Pb7~#t4c(O0m8%eW7ZyW;g?!I`M^~NnhUhbY~FSvLYi}ZDLc1NOpeerHyd(OsE z-97#3IP2sZ!lC+x$W`7| zld-G3*H8!1Aa-C)E8*)^iJ)^pmA~+hG!@7M`PgKv2K?xymt#qu!}*##aLJ<3q#O3~3YQy&m`5sNmUwEKvAvu>Ks+_c*e=dLMLc!SSP$nPC!U&SY%Ax# zM?5vjSPSRBLp(Lbn4j~@z$f;C=Ry6!7SD(GhCLtpcQK~^V58^5k%U)Wy4y-z%Jl5I zJE5>i&(7%t)$1m7X~)7QEU^Hr(8GJQF(F~p^7;j zKZoZ01iV7(s2F4Kj=*Hl62D_C0SSu_+!na?DeA4jrohHc8}xhM&VkX*Yt4w>rkl>o zs)#jqdya$phRkMv(7d}goZamih`1@I`zgh2E~Pm2li9pB+jB9PePA*}su~*i z<%iEY_1Kx*M<3<$yNA@>FJeH2Vu$qM6KL3=rz6{#2dUCyBjN0-XQ0PI#$ig)Lc?!5 z!^Ze}$eI5vwxhZeEFm`iC>dBc2~aNk$Czw>ywrXO&*v^K;Au9U1L{lx}YEw3rnS zo56>jb))K@$B<@*Ju3!}s(b#P%RRyy19tV;v&%#;AXY|aF0C=I!V-+wUqT%vj=T>7(lI_U6L`C$D8|1db3(~%_f$F6`@V_tW2L#^bU<}Ge?>Og zY4!GlcuG#In~w*Kw~PaUfHyvw)5%sYH3AN&st~?ufAs5P)zvMfnE5}1w z=OI25)v9@!1B#v*b|=)^pFgd-S)yJ&-&qw_CIWS%z=jj*=Fz)BRBt*8VF>~`lX*;|#mm#M(0 zKa`X|qqlgP+xb!;Tj3yu0^t%%ct`T%1RX-Kkpwe`&~wM?-m{89`4^d`>~}0t&QjKA zg356!ndguC!R=f_OfSF#3g#H-b~mOT1P1hsHSGPE z3^cja2Y>bztv+q~)z)&`JX#1N5yfKaywP1b6DjqvsSN`PAOHS62 z{p>1tz!)d<0cD(<%F55;~FG-`DNE8L2&LlTpJqo*=#{2_M)sh~7_ zv4}R5=AN{GkZ}-fW$sb1x(Q*deE>A95HFK~+=ECkkxAoVo|H8tRvCS6bZBvyF*&x&^bQ2rBA11Ij91e>;=UKaJBP%e{yg`YmJ(t9xFByq;}vVq>D7 zVW72w@-tolDsGGyOnT}DKgG1LaRehfY<78S8hvNN#+miT??c9W;{1RWz+Im$Us)IS zRH%Et1#6AwjUP7Fxzs%&$}N`ml!szR!$y|}yZQV2@^|#?iXT#@I!07gVELbn55n0c zdxFN%^~QgmyiksJEH^;|8+GOxXSQN-pziz~)q%QSH=1qLZ0Bzq6*T8zqudjUy{`|y zQIb3R9tLQ+=j2gzvNd0*yp_g(Tp9M1Af#$@_n;E| 
zOo8=#2|rHu&y$rO$-VY2PND_twQk;ovfSp@&AU;E>&Kq^nDlb8PInbd@)7ika-m~Q z8kFvITcJy*Hw`kh;naW*;9OOMvWi+ z?$TZC`cJL-^wpka1e1Oc50#nLK^ek`Y{9b`Wq+R5-Flb%!IFActpP&v7iXb3iE&HiYLT zVDh~XVK3p)XafeNKs1ME3o!nkYpN`blVxd|EXxh99*3g1YJ<~C0vlC$sLlOo>#c}W zy&fnDxGL{+1YBCiS+dbp8Q{gnX&T8}A)AIQe}6ZXFX}g~4T7$HisPmdv_ZqPoSUf) zDDVBScM$!05OIc#?xVajj-YFoQu4%nS7n3C9iS$oc6bf49XQv>J|=9O%45Tv7z_K~ z$>$f5%}>H7E$~STe9{7+w7~z%7Lf0)<$G%RUK-0j5A+V5g5+-&k#Y;3{9Ys9Ez9pQ zi)H#n8joElc-m7ae99tBz89yn1BFtHu=md9`FBd%e<;X3e1gsf2=d+WW^pc&?|{D{ zwl`a`N4@%nwH>Uj9*0P#mE+ zFY_H2{MSS}jv72&R8Ls_za#Ryxjo0rMa5PN+9c?$f+hsLOVIlS{aZo*QP3X=dPq?D zouj#-;WJv*rdE6#>({RJUgNE)zPg`tz8$_=Z_TRZ7RpwjWLD!@O)m%RY=@A~Lf&dj ztBcEb19;qCJu*6uA7z8xp7O}8z#Tg%@$(er!RefPmLkdm!1lMF+BK8MfQ{rT<79f zq2Y)}@p@NU|NDAlE9nDUEVeS1N+vqnS9V@qyBfN^=|r?I-5pJLcw^S|oL3?%;dqfL zj`_-GpDn?6G6jABT7%DLHKbPEjjx#P)orPMe95b>tpSkkPsZ1^$CL5i&KOwwn7D4B zc6IgYRa$ks{D0F`TD5!&+rBl5CdpcvRdmz@= z%RZ()D~qeMlG|U2N>^W7-Pqj{tZ5MpA0i?dMU1Rog(4*T2ddkX{i2;1K4aoDXJV;Kj! z>4;cw3WXgQ1ah4##vM$qZ^gKi$#t+8pTp#OQjDL&EGum z-NEKFw`jFu`7$;-BVIn~S5z#2?zHdumVMWxzfG~c%Bl)K@`*$TZhG?jY_a@;$^E7n zUkE=Utq|gI2V2DCci6&22i;<|ndHx7a$PHyKYyCP2BkZg++_--375Nw9hbYR9hbYQ z9hbYV9hbYU9lxj;5S)WuJPoH|>}2oc?FlM?68L?{nIe~GvOJSK-BC#%L#QZxTHt;g zzE0rsoFbJrAfm9Tc9zqumE+U<-4J%<`A6EJH_XZmdUe599^I|1$*y;A` zg`ELS%Gx=4$ z!{w*1OYhHO|2H_JkbPNL>4&w5UlZ{{auM*E%5x3yWrczwelwS!VZSRrs1zSHkk*&(Pb6?4n{peAq`Mz`@E(H8pK$+!)yys1G;Va)hfI_c)s@1!XBf@>}!!rS};5jQu5- zz3HBKnh86Zz}I1kHdoOXK@|0Ir&YK5}t&-jc0Uy9L6ILh`bCOdB5l>- z4wuZ^x2+pJ4Vvz?Xorj?ai8$wc<<$xg6hsT#<`AYUkCHHZA(JaqG?<+Nbat9Z(nCB zS;Rz;)*J7Nl7fi!bfuY>=k7(++nxfOCLc^M?{aUvLrklVHYiKZQnBWjB`M>u9PP%1 zhb)mr6pH*Gy>NeIUW}@4Ok7~`*8X;=M3e1U5d_UuaWOU*2mZZ4+`>UU`u3Q3Q*> zC}51<-AH|T|B-Z`NT7VNHB7!QlE)@LFtRQ6G;2tC9NhJ|!uY zkAf_}l#_H5(&*ifWE!HA_cM7<8zK-v<}dYS{gX%|{dy8Y zkoP!At(PH|K5eOX{Q+RqCi4B7^v9Oeeq*7$3PT^5LVbBJlys<27;*VNP*VQB1-Q&# z-XA5E<&|b6FX>+)p)h}W50Uhsd@(2t$b6-}tW96O&y>`Dowe8ht3qG)zdVOanwA*} zYS;g^O<&$`C9RZ(g}$t}H2cqp(5odmKIA=E{vC_)Hw*U-cnDN>&r+Z5vecl-zS!m7GoO!o%)gbl0Ju|LjB2m 
zx=`$yQsptJCn-4y73$0Te2>tVa?*^Qe-#OY$@0tl(7-JElviOm0fw-eLJ)DOzubmP zeM!fVF5`uw-_|V7Zr3h50Wmv9nlEVd%h=~FGWlzlbpo(sB)swxOQmU2(~8(-7tW&J zFZA0#M*m`=Z?ETTLca{@g+a2CE=NSh3+3KiW^vaSf{5E?>07@YBbCc9wG=grO#a$s ziMLI0E~^Xu6XN@W2DA>39Dj2Cr}asWJwzu%)s +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while 
(upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +int assemble(taco_tensor_t *A1542, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D) { + int A15421_dimension = (int)(A1542->dimensions[0]); + int A15423_dimension = (int)(A1542->dimensions[2]); + int* restrict A15422_pos = (int*)(A1542->indices[1][0]); + int* restrict A15422_crd = (int*)(A1542->indices[1][1]); + double* restrict A1542_vals = (double*)(A1542->vals); + int B1_dimension = (int)(B->dimensions[0]); + int* restrict B2_pos = (int*)(B->indices[1][0]); + int* restrict B2_crd = (int*)(B->indices[1][1]); + + A15422_pos = (int32_t*)malloc(sizeof(int32_t) * 
(A15421_dimension + 1)); + A15422_pos[0] = 0; + for (int32_t pA15422 = 1; pA15422 < (A15421_dimension + 1); pA15422++) { + A15422_pos[pA15422] = 0; + } + int32_t A15422_crd_size = 1048576; + A15422_crd = (int32_t*)malloc(sizeof(int32_t) * A15422_crd_size); + int32_t i1548A1542 = 0; + + for (int32_t i1552 = 0; i1552 < ((B1_dimension + 15) / 16); i1552++) { + for (int32_t i1553 = 0; i1553 < 16; i1553++) { + int32_t i1547 = i1552 * 16 + i1553; + if (i1547 >= B1_dimension) + continue; + + int32_t pA15422_begin = i1548A1542; + + for (int32_t i1548B = B2_pos[i1547]; i1548B < B2_pos[(i1547 + 1)]; i1548B++) { + int32_t i1548 = B2_crd[i1548B]; + if (A15422_crd_size <= i1548A1542) { + A15422_crd = (int32_t*)realloc(A15422_crd, sizeof(int32_t) * (A15422_crd_size * 2)); + A15422_crd_size *= 2; + } + A15422_crd[i1548A1542] = i1548; + i1548A1542++; + } + + A15422_pos[i1547 + 1] = i1548A1542 - pA15422_begin; + } + } + + int32_t csA15422 = 0; + for (int32_t pA154220 = 1; pA154220 < (A15421_dimension + 1); pA154220++) { + csA15422 += A15422_pos[pA154220]; + A15422_pos[pA154220] = csA15422; + } + + A1542_vals = (double*)malloc(sizeof(double) * (i1548A1542 * A15423_dimension)); + + A1542->indices[1][0] = (uint8_t*)(A15422_pos); + A1542->indices[1][1] = (uint8_t*)(A15422_crd); + A1542->vals = (uint8_t*)A1542_vals; + return 0; +} + +int compute(taco_tensor_t *A1542, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D) { + int A15421_dimension = (int)(A1542->dimensions[0]); + int A15423_dimension = (int)(A1542->dimensions[2]); + int* restrict A15422_pos = (int*)(A1542->indices[1][0]); + double* restrict A1542_vals = (double*)(A1542->vals); + int B1_dimension = (int)(B->dimensions[0]); + int* restrict B2_pos = (int*)(B->indices[1][0]); + int* restrict B2_crd = (int*)(B->indices[1][1]); + int* restrict B3_pos = (int*)(B->indices[2][0]); + int* restrict B3_crd = (int*)(B->indices[2][1]); + double* restrict B_vals = (double*)(B->vals); + int C1_dimension = (int)(C->dimensions[0]); + int 
C2_dimension = (int)(C->dimensions[1]); + double* restrict C_vals = (double*)(C->vals); + int D1_dimension = (int)(D->dimensions[0]); + int D2_dimension = (int)(D->dimensions[1]); + double* restrict D_vals = (double*)(D->vals); + +// int32_t i1548A1542 = 0; + + #pragma omp parallel for schedule(static) + for (int32_t pA1542 = 0; pA1542 < (A15422_pos[A15421_dimension] * A15423_dimension); pA1542++) { + A1542_vals[pA1542] = 0.0; + } + + #pragma omp parallel for schedule(runtime) + for (int32_t i1552 = 0; i1552 < ((B1_dimension + 15) / 16); i1552++) { + for (int32_t i1553 = 0; i1553 < 16; i1553++) { + int32_t i1547 = i1552 * 16 + i1553; + if (i1547 >= B1_dimension) + continue; + + for (int32_t i1548B = B2_pos[i1547]; i1548B < B2_pos[(i1547 + 1)]; i1548B++) { + for (int32_t i1549B = B3_pos[i1548B]; i1549B < B3_pos[(i1548B + 1)]; i1549B++) { + int32_t i1549 = B3_crd[i1549B]; + for (int32_t i1550 = 0; i1550 < D1_dimension; i1550++) { + int32_t i1550C = i1549 * C2_dimension + i1550; + for (int32_t i1551 = 0; i1551 < D2_dimension; i1551++) { + // int32_t i1551A1542 = i1548A1542 * A15423_dimension + i1551; + int32_t i1551A1542 = i1548B * A15423_dimension + i1551; + int32_t i1551D = i1550 * D2_dimension + i1551; + A1542_vals[i1551A1542] = A1542_vals[i1551A1542] + (B_vals[i1549B] * C_vals[i1550C]) * D_vals[i1551D]; + } + } + } + // i1548A1542++; + } + } + } + return 0; +} +#include "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/ttm_ttm/ttm_original2.h" +int _shim_assemble(void** parameterPack) { + return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3])); +} +int _shim_compute(void** parameterPack) { + return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3])); +} diff --git a/test/kernels/ttm_ttm/ttm_original2.h b/test/kernels/ttm_ttm/ttm_original2.h new file mode 
100644 index 000000000..8a08b4548 --- /dev/null +++ b/test/kernels/ttm_ttm/ttm_original2.h @@ -0,0 +1,125 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 
2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +#ifndef TACO_GENERATED_assemble +#define TACO_GENERATED_assemble +int assemble(taco_tensor_t *A1542, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D); +#endif + +#ifndef TACO_GENERATED_compute +#define TACO_GENERATED_compute +int compute(taco_tensor_t *A1542, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D); +#endif diff --git a/test/kernels/ttm_ttm/ttm_original2.so b/test/kernels/ttm_ttm/ttm_original2.so new file mode 100755 index 0000000000000000000000000000000000000000..6466a2af25f27b4b4cc11283e998dc09686efe76 GIT binary patch literal 14568 zcmeHOYjjiBl|EMzSb(D|h)bM=28G^87LwKc+Ws=o2)3j61YMRP{HV&Bpd7=6C 
zx#!4|Y&SE1X8!cR)js<>d+)Q)KCk;Y-d7u_vndLb$tp}t)jlU z;^GwXkW@5>x2*s+?6~Uh-OhfT`oZ@$>}tRAWWz&u|9Isqk9_je6JMt)sGRB|GkLg< zc@*b@GTWkjN40Wbo=2fFvU3xz>e^piJpQHQ-}vG0J~!B(-`qJIJ+SDTpE!B!g-`5Q zcI@&eqt%Z@o_jL!nyYcqSI@oj+^epEpFCP#b@p$fP@OZ~0f(2&Qo0P7XBK=9u-W9F znFD`(4*WNB;D_hHzc&X?@%G@#O*hQZuF5&^5Dd)Lu1esZT;t%qbJ*F6vK6z)e;V>` z_C9vNeKkVwKHa5)2Q7$Uvy0{hx&671@JFhuT+b9cwnk#XuKvzotgSZ^YUxX37EeX2 zGt$`?iLr2J4}*Spuo+JE?%5Uz^@iJgQ8Z~LTpfva_eNM}sH3Ah%vvH8jv!Z#MY{UB zdxJ3+?Tthj>f{o}zZ942JspBAq-)oDIX<{#b-{v^TdRTiVxMT24%u za%)*I(h`b=(8lJzJ`tNB+HOUnqO{D)lv5_qP+b#TQ(BSEttq_|{YXRIj+JsAazqKC z9PKDJru>8D-`w89wVB>OFbDlMP8<&44isKrm_rNt}#?rnSqNX zh^u%r@XY;J&%ntinHn>286zs&k%5a_A+EAJ1D9Pw1#KBPR+V(>$-pz0pn(kBm6m2q zk6(1^=Df{IJxm`NjoDJ;di-hU$+Sf3&aEg)-QI*>b-5Qg;@hZvaw3Iv`zGQQ;>S5( zO+0n+r;ts0INuCDO4~9|>bG`k@9Ym~?|6q8 zQ@_=uy>l{JsxBY43YRlIv2hFv+x5h}Cqeb9tGe;~*o_FGjaY>6x?QP>X_Z|kDQeR^ z^!Jls;?9S-nc{2E*T=5ebz@2&OWva^r}gvKVudg`R$v*-Cxg<3Tzf+s%VCH0uh!!m zAAi7{^A>9@ppO-3PM@xBlH?)xG2rf<8i zY5O*P|3$i^(9Pumb6;7DZaQu+LZNO9Yem4zN6mVFATg|YNGNeoD{)C$dDrQdu| z4?vLSBghfbGwX|tquNn!;-GewQ@_?`)_WmAod5|BSwC*&3n~8tEAOWKsFin8K8d_p zzai0c3z#V|@ljQ6m?}?=U32K+%gL#CQ>o!mb$A4$ts#6`ADcjw{hG&@s7rxX=n2Ou z8b}20_8U{1jcY#RjCs^*jXBI)19$(6*LjrJhnmiU9h-0<_fsl30yA-PXPk=(7vrN2 z!kiQjSmMozk8-ysAP0!z5l<>4HH55wW>`B0N@=t41_a2u-2$tu_Nj-qVj__}tFEJ5 z!z?&r`j0rOPN<1Hc=1syO5tZZF+f)k6@;o2YF*g&Z&n+z}AkW)I)>FkY(O?;b(O7 zM^+3H6+eSfYB)S2x8R)_(9q92wMGEz^u*vgglSOQK{^xULBer{^4uGARgWGX=AH)< z`KQR3o+!v`Fiz`<4d?GoI2P!}1}$Km*pwJt;MT`pL7xw5MaD1Ve^BBn#dr%`<&Bqh zWjub`7DyC~!Tc}CiV@Sw{KmE7YYoa6jR>#Lcpcb=_{-OnE57p=8kBQj$yh+eI$#cJ zjR?kJXsS=-5!*&Xq2i2=J{tlWSnsDc3~B@F=TGQW%QR|7z<8BxyVWm@mj7|Vi33fV zYRx;{cr%cQIs?U%dR2v{e&NLCstNV;qYF;>;;B6KvC$QNO&va=4qv1Jm8f?&RE=`8 zXc}xeQSvNG%qFd(s!1zTKjWZ&@oSY>kd|Ij-FcVP`{(1bU-GFt7=9Krpo?^SK=YQ5 zmOqOkS3sGJss-mPf1;|}p~O!pQT6_@D0Cb)pIYotFc93ns^=b@K%ePV0c}Gpf?5Kw zW%nft=JT$ks`T))dLsXPqG=u`rQ)kP0|2$&@Banr-Jx7k7b|;LN-ZUc$k8B3YZjd6 z!ly{s24S0SAJja4<=w#g7R?+`;NnHQxUpljCT9 zTE2XAG6+5ygB!FHVM#aMrcFQjRUpQxzOm%;TIHr 
z|NXs%uwy)5d^&zj+4pOhOKkKbNJ$k1CMv;-$~P_)78!kR_0T@p_a*96wdE?xyMHtTr4~)uw+Vt1%oKbj)LSwduf$QDXq> z|L#*xvSM;8F^#P3OzAvze1eIaAas&qd=Hw|VU#@NW{-rH)+Un>8jwVvo^O;P-m zvdP@wX~57(e&>x;s%quFq;7t)DEW0T$T*Y#U}XxEk6IZ|@*ri5-%>S`L&)>~ux?G_ z7u#^R!1szqzgcE_o#>05bc^9j>{a~eogI3Z2Ts?`I;TGN2bL3 zo`PGH{0gt?G%?;YXgUw^Exz~})u%R`8N2GJ9l4CM4eFskVY1>*f$k&p#9l}FWy}ZO z3CR-JcnWI*OKw9B{nlWNsfS*Kyq>6WoI!}mE@-8obR)`qP;v7fETeAl^3DzzXECw^ zW`|Z*TYfoUT*lJVV7wvj->|Q^s}uRFu?82YhrSFWwdTF=)KF#@m6!vO|93+-Bo97jBm0J$@3A(~9)?DMzBBbW7|~^Dv&h^m+sa2XhO;$p%g;?#!VjxY_rVezK^JWp8C5@9 zjD@v4b>WNUsmV`3#v1>$UdUVLBXBEV_GaHVA0S^x`N&P6Z~Z1Sv2$U7nhc{05vwZ- zm^u)zJ7BtLcy2a}X+&VC)|CaYsNhBzQPRz7CvL)!#d!8PO;{Rm@FmF%mFdQxbo0X~ zs4v44N8Vd-^qg*PMN1vp>*`(`g0Gb`>77*SkCkN42Zy;5?tJ?BV%7S7kh}-Zf>jN9QecRRZ(5 zimF{6$U>*u<=jLM&tYx(QtG zK!;qQL>S#IT?HoJff7bfAbz?PL7o!PWx$4j@prn@{@14cuS@&C$<8%4Q%e=R=$ z@_9l<^j(pXd|yp#DkV&Hp5C}-EoW1rKDka6iHv-wy90BHl6*INLdescq*Q1T{&t^d zJswVk8l&esN^;K1cfvPVFxw#0Me%F?omVhd%f;Hec%t1ILHo@-J42;{Bc(w4?p z{xE*NO}U(8n*yM0%eAK*y$`s3FBN`_vOGB)dDkeTELLuJUBAfgy4kh;hQ*2-!^c*t z*j;xmTEFOnH!fC|*98>HI zm5)4Q!`CeZ=K?%6<>gi&Jtdv^O4(Y{(%p|wa3z&x0Al@Jk&Ug9u1Ift7%YAM+c;3U zwq$LEr=(T>-?GM2BClw3e|twuJIq+Fc}k*P-6i}ZWJynNcPtW)b@u}7X%Dl`fpA|h zdyn?4&Th_XZhtizU9zsEwzJV+)+iW0L_{%M46dz!3tjyKC9PfkB2En7H1QdP*T_SUXQi>JLS#vT{L?_UI6!2UM&h-0C5=+-Yw=-)(<4MpgTtR&+rWwHw2ld+?l z&WiOXQ`n9{AlJQY+|J~BmyJ7^To<$Pc}%V^+4y`W*QIPckID5d8+WGH!EAiy_jf9{ zvxV%qSRb)K@w}Z~&*b;g%tAZu9-i6c-^b*-mMuT?`~#KSncQ_Ur3sh2N-jQ2W4Rr< zyXMNvT{st)yKgT3{%k;Sc6Q4QoQAQ3y_U)?PyyI*K19wGxjd8Q+2o6a{AkWN`H;Yw z+}OcW+K7wn%lj4%;%6H!CF7iloMz1&pV{vQaIz!shrsegZw!@L#?N~SA4FZOkpY;%<0Ox{DJta{>(!_lv(Cc;T-rXj?Wy= zRUDr=U#fxA(|hhb-7f4L@OVfdokAR+IgcLX_)I(I9QZNd;>b?gHf6pI+>?b^%+KeL zf0g6;EcYDIyTE5VPetE#W~CU=diO_#7%Yr&3{To0&f&c zC`K4hZ;1ZQJy*6)$nVZM_q7GMN9bmfb}pZZSo!_HSImUjm$>{a`)2kHPxdVf%KBpc z(I{^0rjM5eW1Yb;9Wd*|S*VuoU~5NrbEqTOf;+9gV5on9(K)J)NG#G)x^8V%bBK31)F@;f!drJac+&D zLQ8u=RSJ*}v3b4nL>ZkU`*WxLW**|B!+OxC!+mr#%{urdPr8XCZ4@Uur8j*d&+APq 
zSSJi~4)#rZR3Q%UO(W#vv>(%_12epk6=oa@%&0N@pd%d$v`+bDS0N7eWrLZA^JYC0 zIPF;0DaQ=o<~V?ub7F9YlsJ#bJ7?D8mb_+p08!LOrv`&r$1ro6!w-3~(!M>N7$Tst zUW>NLTo;~kO7X~1${$op+FKas+CqJ8th8lM7c?yz!=n$$JrL>bYwzyLVuC2^jdX-a zLF9TmVyu+cUW#mKYd6>!o%<~1l9YK^4t3%oj4Y8wI7Npk zOL3Nxm0~z{Vi5oqY3^@@N~o(9tBYXUyQ1APA8KyyjXWSZJa9#19s!YAIuI&!g{4{6 z!~ZQrw>|Vq1=|2m^7%(eMB5vecV2TUv{Eyg_jclt-_OZrn3W$LHj8wkar zX;q$+dXmxsluUj3eqqRCDM~qun;w6Xe-Rmyk^al)umc`TB3Ga4%1jer2rJ73k(c_# zIk?o9bQ0whivXT}fBl8(z?8*v@yC)MwK3BG44*j*aS&E+b&?kF2 zQ=Dt;5c)-80Fj-$aLMr`*Z&2;sNpeKE;%%!fee1o0m8aia G{r?8d(), {3}); @@ -84,4 +87,193 @@ TEST(indexstmt, spmm) { } +TEST(indexstmt, sddmm) { + Type t(type(), {3,3}); + TensorVar A("A", t, {Sparse, Dense}); + TensorVar B("B", t, {Sparse, Dense}); + TensorVar C("C", t, {Dense, Dense}); + TensorVar w("w", Type(type(),{3}), Dense); + + // the below expression is the concrete index notation + // where (consumer, producer) + IndexStmt spmm = forall(i, + forall(k, + where(forall(j, A(i,j) = w(j)), + forall(j, w(j) += B(i,k)*C(k,j)) + ) + ) + ); + + // after adding scheduling transformations to this concrete-topologically sorted index stmt + // + + std::cout << spmm << std::endl; + spmm = reorderLoopsTopologically(spmm); + std::cout << "topologically reordered loops statement: " << spmm << std::endl; + + Kernel kernel = compile(spmm); + kernel.compute(); +} + +TEST(indexstmt, sddmmPlusSpmm) { + + // Y(i,l) = B(i,j)*C(i,k)*D(k,j) * F(j,l); + // indexstmt order i, j, k, l + //topologically reordered loops statement: forall(i, forall(k, forall(j, forall(l, Y(i,l) += B(i,j) * C(i,k) * D(k,j) * F(j,l), NotParallel, IgnoreRaces), NotParallel, IgnoreRaces), NotParallel, IgnoreRaces), NotParallel, IgnoreRaces) + + Type t(type(), {3,3}); + TensorVar Y("Y", t, {Dense, Dense}); + TensorVar B("B", t, {Dense, Sparse}); + TensorVar C("C", t, {Dense, Dense}); + TensorVar D("D", t, {Dense, Dense}); + TensorVar E("E", t, {Dense, Dense}); + + // TensorVar A("A", Type(type(),{3}), ); + TensorVar A("A", Type()); 
+ + IndexStmt fused1 = + forall(i, + forall(j, + forall(k, + forall(l, Y(i,l) += B(i,j) * C(i,k) * D(j,k) * E(j,l)) + ) + ) + ); + + std::cout << "before topological sort" << fused1 << std::endl; + fused1 = reorderLoopsTopologically(fused1); + std::cout << "after topological sort" << fused1 << std::endl; + + Kernel kernel = compile(fused1); + + + IndexStmt fused2 = + forall(i, + forall(j, + where( + forall(l, Y(i,l) += A * E(j,l)), // consumer + forall(k, A += B(i,j)*C(i,k)*D(j,k)) // producer + ) + ) + ); + + Kernel kernel2 = compile(fused2); + +} + + + +TEST(indexstmt, mttkrpPlusSpmm) { + + // ./bin/taco "A(i,m)=B(i,k,l)*C(k,j)*D(l,j)*E(j,m)" -f=A:dd:0,1 -f=B:sss:0,1,2 -f=C:dd:0,1 -f=D:dd:0,1 -f=E:dd:0,1 + + // i = 11, k = 5, l = 7, j = 8; + long unsigned int idim = 11, kdim = 5, ldim = 7, jdim = 8, mdim = 6; + + Type atype(type(), {idim, mdim}); + Type btype(type(), {idim, kdim, ldim}); + Type ctype(type(), {kdim, jdim}); + Type dtype(type(), {ldim, jdim}); + Type etype(type(), {jdim, mdim}); + + TensorVar A("A", atype, {Dense, Dense}); + TensorVar B("B", btype, {Sparse, Sparse, Sparse}); + TensorVar C("C", ctype, {Dense, Dense}); + TensorVar D("D", dtype, {Dense, Dense}); + TensorVar E("E", etype, {Dense, Dense}); + + TensorVar ws("ws", Type(type(), {jdim}) ); + + IndexStmt fused1 = + forall(i, + forall(k, + forall(l, + forall(j, + forall(m, A(i,m) += B(i,k,l) * C(k,j) * D(l,j) * E(j,m)) + ) + ) + ) + ); + + std::cout << "before topological sort" << fused1 << std::endl; + fused1 = reorderLoopsTopologically(fused1); + std::cout << "after topological sort" << fused1 << std::endl; + + Kernel kernel = compile(fused1); + + IndexStmt fused2 = + forall(i, + where( + forall(j, + forall(m, + A(i,m) += ws(j) * E(j,m) + ) + ) + , + forall(k, + forall(l, + forall(j, + ws(j) += B(i,k,l) * C(k,j) * D(l,j) + ) + ) + ) + ) + ); + + Kernel kernel2 = compile(fused2); + +} + +// ./bin/taco "y(i)=A(i,j)*B(j,k)*v(k)" -f=y:d:0 -f=A:dd:0,1 -f=B:dd:0,1 -f=v:d:0 +TEST(indexstmt, 
mmPlusSpmv) { + + // + + long unsigned int idim = 11, jdim = 8, kdim = 5; + + Type ytype(type(), {idim}); + Type atype(type(), {idim, jdim}); + Type btype(type(), {jdim, kdim}); + Type vtype(type(), {kdim}); + + TensorVar y("y", ytype, {Dense}); + TensorVar A("A", atype, {Dense, Dense}); + TensorVar B("B", btype, {Dense, Dense}); + TensorVar v("v", vtype, {Dense}); + + TensorVar ws("ws", Type(type(), {jdim}) ); + + IndexStmt fused1 = + forall(i, + forall(j, + forall(k, + forall(m, y(i) += A(i,j) * B(j,k) * v(k)) + ) + ) + ); + + std::cout << "before topological sort" << fused1 << std::endl; + fused1 = reorderLoopsTopologically(fused1); + std::cout << "after topological sort" << fused1 << std::endl; + + Kernel kernel = compile(fused1); + + IndexStmt fused2 = + where( + forall(i, + forall(j, + y(i) += A(i,j) * ws(j) + ) + ) + , + forall(j, + forall(k, + ws(j) += B(j,k) * v(k) + ) + ) + ); + + Kernel kernel2 = compile(fused2); +} + diff --git a/test/tests-scheduling-eval.cpp b/test/tests-scheduling-eval.cpp index 59debc88e..29a7e512e 100644 --- a/test/tests-scheduling-eval.cpp +++ b/test/tests-scheduling-eval.cpp @@ -1,88 +1,8 @@ -#include -#include -#include -#include -#include -#include -#include -#include "taco/cuda.h" -#include "test.h" -#include "test_tensors.h" -#include "taco/tensor.h" -#include "taco/index_notation/index_notation.h" -#include "taco/index_notation/transformations.h" -#include "codegen/codegen.h" -#include "taco/lower/lower.h" -#include "taco/util/timers.h" - - -#define TOOL_BENCHMARK_TIMER(CODE,NAME,TIMER) { \ - if (time) { \ - taco::util::Timer timer; \ - timer.start(); \ - CODE; \ - timer.stop(); \ - taco::util::TimeResults result = timer.getResult(); \ - cout << NAME << " " << result << " ms" << endl; \ - TIMER=result; \ - } \ - else { \ - CODE; \ - } \ -} - -using namespace taco; +#include "util.h" + const IndexVar i("i"), j("j"), k("k"), l("l"), m("m"), n("n"); int WARP_SIZE = 32; -void printToCout(IndexStmt stmt) { - std::shared_ptr 
codegen = ir::CodeGen::init_default(cout, ir::CodeGen::ImplementationGen); - ir::Stmt compute = lower(stmt, "compute", false, true); - codegen->compile(compute, true); -} - -void printToFile(string filename, IndexStmt stmt) { - stringstream source; - - string file_path = "eval_generated/"; - mkdir(file_path.c_str(), 0777); - - std::shared_ptr codegen = ir::CodeGen::init_default(source, ir::CodeGen::ImplementationGen); - ir::Stmt compute = lower(stmt, "compute", false, true); - codegen->compile(compute, true); - - ofstream source_file; - string file_ending = should_use_CUDA_codegen() ? ".cu" : ".c"; - source_file.open(file_path + filename + file_ending); - source_file << source.str(); - source_file.close(); -} - -void printToFile(string filename, string additional_filename, IndexStmt stmt) { - stringstream source1; - stringstream source2; - - string file_path = "eval_generated/"; - mkdir(file_path.c_str(), 0777); - - std::shared_ptr codegen = ir::CodeGen::init_default(source1, source2, ir::CodeGen::ImplementationGen); - ir::Stmt compute = lower(stmt, "compute", false, true); - codegen->compile(compute, true); - - ofstream source_file; - string file_ending = should_use_CUDA_codegen() ? 
".cu" : ".c"; - source_file.open(file_path+filename+file_ending); - source_file << source1.str(); - source_file.close(); - - ofstream additional_source_file; - string additional_file_ending = ".ispc"; - additional_source_file.open(file_path+additional_filename+additional_file_ending); - additional_source_file << source2.str(); - additional_source_file.close(); - -} - IndexStmt scheduleSpMVCPU(IndexStmt stmt, int CHUNK_SIZE=16) { IndexVar i0("i0"), i1("i1"), kpos("kpos"), kpos0("kpos0"), kpos1("kpos1"); return stmt.split(i, i0, i1, CHUNK_SIZE) @@ -909,7 +829,7 @@ TEST(scheduling_eval, spmmISPC) { expected.compute(); ASSERT_TENSOR_EQ(expected, C); - float ERROR_MARGIN = 0.01; + // float ERROR_MARGIN = 0.01; // ASSERT_TENSOR_VAL(expected, y); for (int i = 0; i < NUM_I; i++) { for (int k = 0; k < NUM_K; k++) { @@ -1172,6 +1092,67 @@ TEST(scheduling_eval, sddmmCPU) { ASSERT_TENSOR_EQ(expected, A); } +TEST(scheduling_eval, sddmmSPMMFusedCPU) { + if (should_use_CUDA_codegen() || should_use_ISPC_codegen()) { + return; + } + + int NUM_I = 1021/10; + int NUM_J = 1039/10; + int NUM_K = 1057/10; + float SPARSITY = .3; + Tensor A("A", {NUM_I, NUM_K}, {Dense, Dense}); + Tensor B("B", {NUM_I, NUM_K}, CSR); + Tensor C("C", {NUM_I, NUM_J}, {Dense, Dense}); + Tensor D("D", {NUM_J, NUM_K}, {Dense, Dense}); + + srand(268238); + for (int i = 0; i < NUM_I; i++) { + for (int j = 0; j < NUM_J; j++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + C.insert({i, j}, (double) ((int) (rand_float*3/SPARSITY))); + } + } + + for (int i = 0; i < NUM_I; i++) { + for (int k = 0; k < NUM_K; k++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + if (rand_float < SPARSITY) { + B.insert({i, k}, (double) ((int) (rand_float*3/SPARSITY))); + } + } + } + + for (int j = 0; j < NUM_J; j++) { + for (int k = 0; k < NUM_K; k++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + D.insert({j, k}, (double) ((int) (rand_float*3/SPARSITY))); + } + } + + B.pack(); + C.pack(); + D.pack(); + + 
A(i,k) = B(i,k) * C(i,j) * D(j,k); + + IndexStmt stmt = A.getAssignment().concretize(); + stmt = scheduleSDDMMCPU(stmt, B); + + printToFile("sddmm_cpu_ryan2", stmt); + + A.compile(stmt); + A.assemble(); + A.compute(); + + Tensor expected("expected", {NUM_I, NUM_K}, {Dense, Dense}); + expected(i,k) = B(i,k) * C(i,j) * D(j,k); + expected.compile(); + expected.assemble(); + expected.compute(); + ASSERT_TENSOR_EQ(expected, A); +} + TEST(scheduling_eval, sddmmcsrCPU) { if (should_use_CUDA_codegen()) { @@ -1246,8 +1227,8 @@ TEST(scheduling_eval, sddmm2CPU) { int NUM_J = 1021/10; int NUM_K = 18; float SPARSITY = .3; - Tensor Y("Y", {NUM_I, NUM_J}, CSR); - Tensor A("A", {NUM_I, NUM_J}, CSR); + Tensor Y("Y", {NUM_I, NUM_J}, {Dense, Compressed(ModeFormat::UNIQUE)}); + Tensor A("A", {NUM_I, NUM_J}, {Dense, Compressed(ModeFormat::UNIQUE)}); Tensor X("X", {NUM_I, NUM_K}, {Dense, Dense}); srand(268238); @@ -1271,23 +1252,23 @@ TEST(scheduling_eval, sddmm2CPU) { A.pack(); X.pack(); - Y(i,j) = A(i,j) * X(i,k) * X(j,k); + Y(i,j) = A(i,j) * X(i,k) * X(k,j); - IndexStmt stmt = A.getAssignment().concretize(); - // stmt = scheduleSDDMMCPU(stmt, B); + // IndexStmt stmt = A.getAssignment().concretize(); + // // stmt = scheduleSDDMMCPU(stmt, A); - //printToFile("sddmm_cpu", stmt); + // printToFile("sddmm2_cpu", stmt); - A.compile(stmt); - A.assemble(); - A.compute(); + // A.compile(stmt); + // A.assemble(); + // A.compute(); - Tensor expected("expected", {NUM_I, NUM_J}, {Dense, Dense}); - expected(i,j) = A(i,j) * X(i,k) * X(j,k); - expected.compile(); - expected.assemble(); - expected.compute(); - ASSERT_TENSOR_EQ(expected, A); + // Tensor expected("expected", {NUM_I, NUM_J}, {Dense, Dense}); + // expected(i,j) = A(i,j) * X(i,k) * X(j,k); + // expected.compile(); + // expected.assemble(); + // expected.compute(); + // ASSERT_TENSOR_EQ(expected, A); } @@ -1365,7 +1346,7 @@ TEST(scheduling_eval, sddmmISPC) { ASSERT_TENSOR_EQ(expected, A); - float ERROR_MARGIN = 0.01; + // float ERROR_MARGIN 
= 0.01; // ASSERT_TENSOR_VAL(expected, y); for (int i = 0; i < NUM_I; i++) { for (int k = 0; k < NUM_K; k++) { @@ -1447,7 +1428,7 @@ TEST(scheduling_eval, sddmm2ISPC) { ASSERT_TENSOR_EQ(expected, A); - float ERROR_MARGIN = 0.01; + // float ERROR_MARGIN = 0.01; // ASSERT_TENSOR_VAL(expected, y); for (int i = 0; i < NUM_I; i++) { for (int j = 0; j < NUM_J; j++) { @@ -1585,7 +1566,7 @@ TEST(scheduling_eval, spmvISPC) { ASSERT_TENSOR_EQ(expected, y); - float ERROR_MARGIN = 0.01; + // float ERROR_MARGIN = 0.01; // ASSERT_TENSOR_VAL(expected, y); for (int j = 0; j < NUM_J; j++) { if (expected(j) <= y(j) + ERROR_MARGIN && expected(j) >= y(j) - ERROR_MARGIN) { @@ -2015,6 +1996,64 @@ TEST(scheduling_eval, mttkrpCPU) { ASSERT_TENSOR_EQ(expected, A); } +TEST(scheduling_eval, temp) { + if (should_use_CUDA_codegen() || should_use_ISPC_codegen()) { + return; + } + std::default_random_engine gen(0); + std::uniform_real_distribution unif(0.0, 1.0); + // Predeclare the storage formats that the inputs and output will be stored as. + // To define a format, you must specify whether each dimension is dense or sparse + // and (optionally) the order in which dimensions should be stored. The formats + // declared below correspond to doubly compressed sparse row (dcsr), row-major + // dense (rm), and column-major dense (dm). + Format dcsr({Sparse,Sparse}); + Format rm({Dense,Dense}); + Format cm({Dense,Dense}, {1,0}); + + // Load a sparse matrix from file (stored in the Matrix Market format) and + // store it as a doubly compressed sparse row matrix. Matrices correspond to + // order-2 tensors in taco. The matrix in this example can be download from: + // https://www.cise.ufl.edu/research/sparse/MM/Williams/webbase-1M.tar.gz + Tensor B = read("/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx", dcsr); + // Generate a random dense matrix and store it in row-major (dense) format. 
+ Tensor C({B.getDimension(0), 1000}, rm); + for (int i = 0; i < C.getDimension(0); ++i) { + for (int j = 0; j < C.getDimension(1); ++j) { + C.insert({i,j}, unif(gen)); + } + } + C.pack(); + + // Generate another random dense matrix and store it in column-major format. + Tensor D({1000, B.getDimension(1)}, cm); + for (int i = 0; i < D.getDimension(0); ++i) { + for (int j = 0; j < D.getDimension(1); ++j) { + D.insert({i,j}, unif(gen)); + } + } + D.pack(); + + // Declare the output matrix to be a sparse matrix with the same dimensions as + // input matrix B, to be also stored as a doubly compressed sparse row matrix. + Tensor A(B.getDimensions(), dcsr); + + // Define the SDDMM computation using index notation. + IndexVar i, j, k; + A(i,j) = B(i,j) * C(i,k) * D(k,j); + + // At this point, we have defined how entries in the output matrix should be + // computed from entries in the input matrices but have not actually performed + // the computation yet. To do so, we must first tell taco to generate code that + // can be executed to compute the SDDMM operation. + A.compile(); + // We can now call the functions taco generated to assemble the indices of the + // output matrix and then actually compute the SDDMM. + A.assemble(); + A.compute(); + // Write the output of the computation to file (stored in the Matrix Market format). 
+ write("A.mtx", A); +} TEST(scheduling_eval, mttkrpISPC) { if (should_use_CUDA_codegen()) { diff --git a/test/tests-scheduling-fuse.cpp b/test/tests-scheduling-fuse.cpp new file mode 100644 index 000000000..bd77f1d64 --- /dev/null +++ b/test/tests-scheduling-fuse.cpp @@ -0,0 +1,2872 @@ +#include "taco/cuda.h" +#include "taco/tensor.h" +#include "test.h" +#include "util.h" +#include +#include "gtest/gtest.h" +#include +#include + +// #define NUM_THREADS_TO_USE 64 +#define NUM_THREADS_TO_USE 32 +// Reports a PAPI failure: prints the numeric code `retval` and the message from PAPI_strerror(retval) with printf, then terminates the process with exit status 1. +void handle_error (int retval) +{ + printf("PAPI error %d: %s\n", retval, PAPI_strerror(retval)); + exit(1); +} +// Builds small synthetic operands (B: 5x5, C: 5x8, v: length 8), schedules A(i) = B(i,j) * C(j,k) * v(k) with loopFusionOverFission and times A.compute(); every comparison against a reference result is commented out, so this test asserts nothing. +TEST(scheduling_eval, spmvFusedWithSyntheticData) { + if (should_use_CUDA_codegen() || should_use_ISPC_codegen()) { + return; + } + taco_set_num_threads(NUM_THREADS_TO_USE); + + std::default_random_engine gen(0); + std::uniform_real_distribution unif(0.0, 1.0); + + Format csr({dense, sparse}); + Format rm({dense}); + + // uncomment this for reading the csr matrix saved in mtx file + std::cout << "reading B mat mtx\n"; + + int NUM_I = 5; // 1021/10; + int NUM_J = 5; // 1039/10; + int NUM_K = 8; + float SPARSITY = .3; + Tensor B("B", {NUM_I, NUM_J}, csr); + srand(75883); + for (int i = 0; i < NUM_I; i++) { + for (int j = 0; j < NUM_J; j++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + if (rand_float < SPARSITY) { + B.insert({i, j}, (double) ((int) (rand_float*3/SPARSITY))); + } + } + } + B.pack(); + + + std::cout << "B dim0: " << B.getDimension(0) << ", dim1: " << B.getDimension(1) << std::endl; + std::cout << "adding c mat\n"; + Tensor C("C", {NUM_J, NUM_K}, csr); + for (int i = 0; i < C.getDimension(0); ++i) { + for (int j = 0; j < C.getDimension(1); ++j) { + C.insert({i,j}, unif(gen)); + } + } + std::cout << "packing C mat\n"; + C.pack(); + + Tensor v("v", {NUM_K}, rm); + for (int i = 0; i < v.getDimension(0); ++i) { + v.insert({i}, unif(gen)); + } + std::cout << "packing D mat\n"; + v.pack(); + + Tensor A("A", {NUM_I}, rm); + Tensor ref("ref", 
{NUM_I}, rm); + IndexVar i, j, k, l, m; + A(i) = B(i,j) * C(j,k) * v(k); + + // IndexStmt stmt = A.getAssignment().concretize(); + IndexStmt stmt = makeReductionNotation(A.getAssignment()); + stmt = makeConcreteNotation(stmt); + printToFile("SpMVfused", stmt); + stmt = reorderLoopsTopologically(stmt); + stmt = loopFusionOverFission(stmt, A.getAssignment(), "f", 1); + stmt = insertTemporaries(stmt); + stmt = parallelizeOuterLoop(stmt); + + A.compile(stmt); + // We can now call the functions taco generated to assemble the indices of the + // output vector; the fused kernel A(i) = B(i,j) * C(j,k) * v(k) is run by A.compute() below. + A.assemble(); + + + // ref(i) = B(i,j) * C(j,k) * v(k); + // IndexStmt refStmt = makeReductionNotation(ref.getAssignment()); + // refStmt = makeConcreteNotation(refStmt); + // refStmt = insertTemporaries(refStmt); + // refStmt = parallelizeOuterLoop(refStmt); + // ref.compile(refStmt); + // ref.assemble(); + + // Tensor ref1({NUM_J}, rm); + // Tensor ref2({NUM_I}, rm); + // ref1(j) = C(j,k) * v(k); + // ref2(i) = B(i,j) * ref1(j); + + // IndexStmt ref1Stmt = makeReductionNotation(ref1.getAssignment()); + // ref1Stmt = makeConcreteNotation(ref1Stmt); + // ref1Stmt = insertTemporaries(ref1Stmt); + // ref1Stmt = parallelizeOuterLoop(ref1Stmt); + // ref1.compile(ref1Stmt); + // ref1.assemble(); + + // IndexStmt ref2Stmt = makeReductionNotation(ref2.getAssignment()); + // ref2Stmt = makeConcreteNotation(ref2Stmt); + // ref2Stmt = insertTemporaries(ref2Stmt); + // ref2Stmt = parallelizeOuterLoop(ref2Stmt); + // ref2.compile(ref2Stmt); + // ref2.assemble(); + + std::cout << "compute start\n"; + taco::util::TimeResults timevalue; + bool time = true; + // TOOL_BENCHMARK_TIMER(ref.compute(), "\n\nReference Kernel: ", timevalue); + TOOL_BENCHMARK_TIMER(A.compute(), "\n\nFused Kernel: ", timevalue); + // ASSERT_TENSOR_EQ(ref, A); + + // // check results + // for (int q = 0; q < A.getDimension(0); ++q) { + // if ( abs(A(q) - ref(q))/abs(ref(q)) > ERROR_MARGIN) { + // std::cout << "error: 
results don't match A("<< q << "): " + // << A(q) << ", ref: " << ref(q) << std::endl; + // ASSERT_TRUE(false); + // } + // } + // // ASSERT_TENSOR_EQ(A, ref); + // TOOL_BENCHMARK_TIMER(ref1.compute(), "\n\nSDDMM Kernel: ", timevalue); + // TOOL_BENCHMARK_TIMER(ref2.compute(), "\n\nSpMM Kernel: ", timevalue); + // ASSERT_TENSOR_EQ(ref, ref2); + + // for (int q = 0; q < ref2.getDimension(0); ++q) { + // for (int w = 0; w < ref2.getDimension(1); ++w) { + // if ( abs(ref2(q,w) - ref(q,w))/abs(ref(q,w)) > ERROR_MARGIN) { + // std::cout << "error: results don't match A("<< q << "," << w << "): " + // << ref2(q,w) << ", ref: " << ref(q,w) << std::endl; + // ASSERT_TRUE(false); + // } + // } + // } + +} + +TEST(scheduling_eval, spmvFused) { + if (should_use_CUDA_codegen() || should_use_ISPC_codegen()) { + return; + } + + ofstream statfile; + statfile.open( + "/home/min/a/kadhitha/workspace/my_taco/taco/test/stats/spmv-spmv.txt", std::ios::app); + if (statfile.is_open()) { + statfile << "\nspmv-spmv execution\n"; + statfile << "\n-----------------------------------------\n"; + } + taco_set_num_threads(NUM_THREADS_TO_USE); + + std::default_random_engine gen(0); + std::uniform_real_distribution unif(0.0, 1.0); + + Format csr({dense, sparse}); + Format rm({dense}); + + + + int filenum = 1; + + std::vector matfiles = { + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/synthetic/synthetic.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rma10/rma10.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/cant/cant.mtx", // 5 + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/consph/consph.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx", + 
"/home/min/a/kadhitha/ispc-examples/data/suitesparse/shipsec1/shipsec1.mtx", // 8 + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/scircuit/scircuit.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx", // 10 + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/wtk/pwtk.mtx", + "/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx", // 12 + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/wiki-Talk/wiki-Talk.mtx", // 13 + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/com-Orkut/com-Orkut.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx", // 15 + "/home/min/a/kadhitha/workspace/my_taco/FusedMM/dataset/harvard.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/twitter7/twitter7.mtx" + }; + std::vector matfilesrw = { + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/synthetic.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/cage3.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/bcsstk17.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/pdb1HYS.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/rma10.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/cant.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/consph.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/cop20k_A.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/shipsec1.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/scircuit.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/wtk/pwtk.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/webbase-1M.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/wiki-Talk.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/com-Orkut.mtx", + 
"/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/circuit5M.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/harvard.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/twitter7.mtx" + }; + + // uncomment this for reading the csr matrix saved in mtx file + std::cout << "reading B mat mtx\n"; + + + int kDim = 8; + float SPARSITY = .3; + std::string matfile = matfiles[filenum]; + std::cout << "reading B mat mtx\n"; + Tensor B = read(matfile, csr, true); + B.setName("B"); + B.pack(); + + std::cout << "B dim0: " << B.getDimension(0) << ", dim1: " << B.getDimension(1) << std::endl; + std::cout << "adding c mat\n"; + + std::cout << "reading B mat mtx\n"; + Tensor C = read(matfile, csr, true); + C.setName("C"); + C.pack(); + + + Tensor v("v", {C.getDimension(1)}, rm); + for (int i = 0; i < v.getDimension(0); ++i) { + v.insert({i}, unif(gen)); + } + std::cout << "packing D mat\n"; + v.pack(); + + if (statfile.is_open()) { + statfile + << "A(i) = B(i,j) * C(j,k) * v(k);" << std::endl + << "B1_dimension: " << B.getDimension(0) << ", B2_dimension: " << B.getDimension(1) << ", vals: " << B.getStorage().getValues().getSize() << std::endl + << "C1_dimension: " << C.getDimension(0) << ", C2_dimension: " << C.getDimension(1) << ", vals: " << C.getStorage().getValues().getSize() << std::endl + << "D1_dimension: " << v.getDimension(0) << ", vals: " << v.getStorage().getValues().getSize() << std::endl + << std::endl; + } + + Tensor A("A", {B.getDimension(0)}, rm); + Tensor ref("ref", {B.getDimension(0)}, rm); + IndexVar i, j, k, l, m; + A(i) = B(i,j) * C(j,k) * v(k); + + ref(i) = B(i,j) * C(j,k) * v(k); + IndexStmt refStmt = makeReductionNotation(ref.getAssignment()); + refStmt = makeConcreteNotation(refStmt); + refStmt = insertTemporaries(refStmt); + refStmt = parallelizeOuterLoop(refStmt); + ref.compile(refStmt); + ref.assemble(); + + // IndexStmt stmt = A.getAssignment().concretize(); + IndexStmt stmt = makeReductionNotation(A.getAssignment()); + 
stmt = makeConcreteNotation(stmt); + printToFile("SpMVfused", stmt); + stmt = reorderLoopsTopologically(stmt); + stmt = loopFusionOverFission(stmt, A.getAssignment(), "f", 1); + stmt = insertTemporaries(stmt); + stmt = parallelizeOuterLoop(stmt); + A.compile(stmt); + A.assemble(); + + + // Tensor ref1({NUM_J}, rm); + // Tensor ref2({NUM_I}, rm); + // ref1(j) = C(j,k) * v(k); + // ref2(i) = B(i,j) * ref1(j); + + // IndexStmt ref1Stmt = makeReductionNotation(ref1.getAssignment()); + // ref1Stmt = makeConcreteNotation(ref1Stmt); + // ref1Stmt = insertTemporaries(ref1Stmt); + // ref1Stmt = parallelizeOuterLoop(ref1Stmt); + // ref1.compile(ref1Stmt); + // ref1.assemble(); + + // IndexStmt ref2Stmt = makeReductionNotation(ref2.getAssignment()); + // ref2Stmt = makeConcreteNotation(ref2Stmt); + // ref2Stmt = insertTemporaries(ref2Stmt); + // ref2Stmt = parallelizeOuterLoop(ref2Stmt); + // ref2.compile(ref2Stmt); + // ref2.assemble(); + + std::cout << "compute start\n"; + taco::util::TimeResults timevalue; + bool time = true; + std::string sofused = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/spmv_spmv/spmv_fused.so"; + + TOOL_BENCHMARK_TIMER(ref.compute(statfile, sofused), "\n\nReference Kernel: ", timevalue); + + + std::cout << "b1 dim: " << B.getTacoTensorT()->dimensions[1] << std::endl; + // TOOL_BENCHMARK_TIMER(ref.compute(statfile, sofused), "\n\nFused Kernel: ", timevalue); + // ASSERT_TENSOR_EQ(ref, A); + + // // check results + // for (int q = 0; q < A.getDimension(0); ++q) { + // if ( abs(A(q) - ref(q))/abs(ref(q)) > ERROR_MARGIN) { + // std::cout << "error: results don't match A("<< q << "): " + // << A(q) << ", ref: " << ref(q) << std::endl; + // ASSERT_TRUE(false); + // } + // } + // // ASSERT_TENSOR_EQ(A, ref); + // TOOL_BENCHMARK_TIMER(ref1.compute(), "\n\nSDDMM Kernel: ", timevalue); + // TOOL_BENCHMARK_TIMER(ref2.compute(), "\n\nSpMM Kernel: ", timevalue); + // ASSERT_TENSOR_EQ(ref, ref2); + + // for (int q = 0; q < ref2.getDimension(0); ++q) 
{ + // for (int w = 0; w < ref2.getDimension(1); ++w) { + // if ( abs(ref2(q,w) - ref(q,w))/abs(ref(q,w)) > ERROR_MARGIN) { + // std::cout << "error: results don't match A("<< q << "," << w << "): " + // << ref2(q,w) << ", ref: " << ref(q,w) << std::endl; + // ASSERT_TRUE(false); + // } + // } + // } + + if (statfile.is_open()) { + statfile.close(); + } + +} + +TEST(scheduling_eval, sddmmFusedWithSyntheticData) { + if (should_use_CUDA_codegen() || should_use_ISPC_codegen()) { + return; + } + + taco_set_num_threads(NUM_THREADS_TO_USE); + + std::default_random_engine gen(0); + std::uniform_real_distribution unif(0.0, 1.0); + + Format csr({dense, sparse}); + Format rm({dense, dense}); + int ldim = 4; + int kdim = 8; + + // uncomment this for reading the csr matrix saved in mtx file + std::cout << "reading B mat mtx\n"; + + int NUM_I = 1021/10; + int NUM_J = 1039/10; + float SPARSITY = .3; + Tensor B("B", {NUM_I, NUM_J}, csr); + srand(75883); + for (int i = 0; i < NUM_I; i++) { + for (int j = 0; j < NUM_J; j++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + if (rand_float < SPARSITY) { + B.insert({i, j}, (double) ((int) (rand_float*3/SPARSITY))); + } + } + } + B.pack(); + write("/home/min/a/kadhitha/ispc-examples/data/suitesparse/synthetic/synthetic.mtx", B); + + std::cout << "B dim0: " << B.getDimension(0) << ", dim1: " << B.getDimension(1) << std::endl; + std::cout << "adding c mat\n"; + Tensor C({B.getDimension(0), kdim}, rm); + for (int i = 0; i < C.getDimension(0); ++i) { + for (int j = 0; j < C.getDimension(1); ++j) { + C.insert({i,j}, unif(gen)); + } + } + std::cout << "packing C mat\n"; + C.pack(); + + Tensor D({B.getDimension(1), kdim}, rm); + for (int i = 0; i < D.getDimension(0); ++i) { + for (int j = 0; j < D.getDimension(1); ++j) { + D.insert({i,j}, unif(gen)); + } + } + std::cout << "packing D mat\n"; + D.pack(); + + Tensor F({B.getDimension(1), ldim}, rm); + for (int i = 0; i < F.getDimension(0); ++i) { + for (int j = 0; j < F.getDimension(1); 
++j) { + F.insert({i,j}, unif(gen)); + } + } + std::cout << "packing F mat\n"; + F.pack(); + + Tensor A({B.getDimension(0), ldim}, rm); + Tensor ref({B.getDimension(0), ldim}, rm); + IndexVar i, j, k, l; + A(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); + + // IndexStmt stmt = A.getAssignment().concretize(); + IndexStmt stmt = makeReductionNotation(A.getAssignment()); + stmt = makeConcreteNotation(stmt); + printToFile("fusedMMConcrete", stmt); + + stmt = reorderLoopsTopologically(stmt); + printToFile("fusedMMOrdered", stmt); + + stmt = loopFusionOverFission(stmt, A.getAssignment(), "b", 1); + printToFile("fusedMMFused", stmt); + + stmt = insertTemporaries(stmt); + printToFile("fusedMMWithTemps", stmt); + stmt = parallelizeOuterLoop(stmt); + printToFile("fusedMMFusedPar", stmt); + + A.compile(stmt); + // We can now call the functions taco generated to assemble the indices of the + // output matrix; the fused SDDMM+SpMM kernel A(i,l) = B(i,j)*C(i,k)*D(j,k)*F(j,l) is run by A.compute() below. + A.assemble(); + + + ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); + IndexStmt refStmt = makeReductionNotation(ref.getAssignment()); + refStmt = makeConcreteNotation(refStmt); + refStmt = insertTemporaries(refStmt); + refStmt = parallelizeOuterLoop(refStmt); + ref.compile(refStmt); + ref.assemble(); + + Tensor ref1({B.getDimension(0), B.getDimension(1)}, csr); + Tensor ref2({B.getDimension(0), ldim}, rm); + ref1(i,j)=B(i,j)*C(i,k)*D(j,k); + ref2(i,l)=ref1(i,j)*F(j,l); + + IndexStmt ref1Stmt = makeReductionNotation(ref1.getAssignment()); + ref1Stmt = makeConcreteNotation(ref1Stmt); + ref1Stmt = insertTemporaries(ref1Stmt); + ref1Stmt = parallelizeOuterLoop(ref1Stmt); + ref1.compile(ref1Stmt); + ref1.assemble(); + + IndexStmt ref2Stmt = makeReductionNotation(ref2.getAssignment()); + ref2Stmt = makeConcreteNotation(ref2Stmt); + ref2Stmt = insertTemporaries(ref2Stmt); + ref2Stmt = parallelizeOuterLoop(ref2Stmt); + ref2.compile(ref2Stmt); + ref2.assemble(); + + std::cout << "compute start\n"; + taco::util::TimeResults timevalue; + bool time = true; + 
TOOL_BENCHMARK_TIMER(ref.compute(), "\n\nReference Kernel: ", timevalue); + TOOL_BENCHMARK_TIMER(A.compute(), "\n\nFused Kernel: ", timevalue); + + // check results + for (int q = 0; q < A.getDimension(0); ++q) { + for (int w = 0; w < A.getDimension(1); ++w) { + if ( abs(A(q,w) - ref(q,w))/abs(ref(q,w)) > ERROR_MARGIN) { + std::cout << "error: results don't match A("<< q << "," << w << "): " + << A(q,w) << ", ref: " << ref(q,w) << std::endl; + ASSERT_TRUE(false); + } + } + } + // ASSERT_TENSOR_EQ(A, ref); + TOOL_BENCHMARK_TIMER(ref1.compute(), "\n\nSDDMM Kernel: ", timevalue); + TOOL_BENCHMARK_TIMER(ref2.compute(), "\n\nSpMM Kernel: ", timevalue); + + for (int q = 0; q < ref2.getDimension(0); ++q) { + for (int w = 0; w < ref2.getDimension(1); ++w) { + if ( abs(ref2(q,w) - ref(q,w))/abs(ref(q,w)) > ERROR_MARGIN) { + std::cout << "error: results don't match A("<< q << "," << w << "): " + << ref2(q,w) << ", ref: " << ref(q,w) << std::endl; + ASSERT_TRUE(false); + } + } + } + +} +// CPU schedule applied to `stmt`: splits i by CHUNK_SIZE into (i0, i1), iterates k by position in B(i,k) as kpos, splits kpos by UNROLL_FACTOR into (kpos0, kpos1), reorders the loops to {i0, i1, kpos0, j, kpos1}, then parallelizes i0 over CPU threads (NoRaces) and kpos1 as a CPU vector loop (ParallelReduction). Returns the scheduled statement. +// NOTE(review): i, j, k below are freshly default-constructed locals, not the index variables used to build the caller's `stmt` -- presumably split/pos/reorder cannot match them; confirm before enabling a call (the only call visible in this chunk is commented out). l and m are unused. +IndexStmt scheduleSDDMMCPU_forfuse(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { + IndexVar i, j, k, l, m; + IndexVar i0("i0"), i1("i1"), kpos("kpos"), kpos0("kpos0"), kpos1("kpos1"); + return stmt.split(i, i0, i1, CHUNK_SIZE) + .pos(k, kpos, B(i,k)) + .split(kpos, kpos0, kpos1, UNROLL_FACTOR) + .reorder({i0, i1, kpos0, j, kpos1}) + .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces) + .parallelize(kpos1, ParallelUnit::CPUVector, OutputRaceStrategy::ParallelReduction); +} + +TEST(scheduling_eval, sddmmFused) { + if (should_use_CUDA_codegen() || should_use_ISPC_codegen()) { + return; + } + + taco_set_num_threads(NUM_THREADS_TO_USE); + + ofstream statfile; + statfile.open( + "/home/min/a/kadhitha/workspace/my_taco/taco/test/stats/sddmm-spmm.txt", std::ios::app); + if (statfile.is_open()) { + statfile << "\nsddmm-spmm execution\n"; + statfile << "\n-----------------------------------------\n"; + } + + std::default_random_engine gen(0); + 
std::uniform_real_distribution unif(0.0, 1.0); + + Format csr({dense, sparse}); + Format rm({dense, dense}); + int ldim = 128; + int kdim = 128; + + // vector filenums = {2,3,4,5,6,7,8,9,10,12,15}; + + vector filenums = {1}; + + for (auto filenum : filenums) { + + // int filenum = 5; + + std::vector matfiles = { + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/synthetic/synthetic.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rma10/rma10.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/cant/cant.mtx", // 5 + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/consph/consph.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/shipsec1/shipsec1.mtx", // 8 + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/scircuit/scircuit.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx", // 10 + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/wtk/pwtk.mtx", + "/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx", // 12 + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/wiki-Talk/wiki-Talk.mtx", // 13 + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/com-Orkut/com-Orkut.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx", // 15 + "/home/min/a/kadhitha/workspace/my_taco/FusedMM/dataset/harvard.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/twitter7/twitter7.mtx" + }; + std::vector matfilesrw = { + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/synthetic.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/cage3.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/bcsstk17.mtx", + 
"/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/pdb1HYS.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/rma10.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/cant.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/consph.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/cop20k_A.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/shipsec1.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/scircuit.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/wtk/pwtk.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/webbase-1M.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/wiki-Talk.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/com-Orkut.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/circuit5M.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/harvard.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/twitter7.mtx" + }; + + std::string matfile = matfiles[filenum]; + std::cout << "reading B mat mtx\n"; + Tensor B = read(matfile, csr, true); + B.setName("B"); + B.pack(); + // write(matfilesrw[filenum], B); + + if (statfile.is_open()) { + statfile << matfile << std::endl; + } + + std::cout << "B dim0: " << B.getDimension(0) << ", dim1: " << B.getDimension(1) << std::endl; + std::cout << "adding c mat\n"; + Tensor C({B.getDimension(0), kdim}, rm); + for (int i = 0; i < C.getDimension(0); ++i) { + for (int j = 0; j < C.getDimension(1); ++j) { + C.insert({i,j}, unif(gen)); + } + } + std::cout << "packing C mat\n"; + C.pack(); + + Tensor D({B.getDimension(1), kdim}, rm); + for (int i = 0; i < D.getDimension(0); ++i) { + for (int j = 0; j < D.getDimension(1); ++j) { + D.insert({i,j}, unif(gen)); + } + } + std::cout << "packing D mat\n"; + D.pack(); + + Tensor F({B.getDimension(1), ldim}, rm); + for 
(int i = 0; i < F.getDimension(0); ++i) { + for (int j = 0; j < F.getDimension(1); ++j) { + F.insert({i,j}, unif(gen)); + } + } + std::cout << "packing F mat\n"; + F.pack(); + + Tensor A({B.getDimension(0), ldim}, rm); + Tensor ref({B.getDimension(0), ldim}, rm); + IndexVar i, j, k, l, m; + IndexVar i0("i0"), i1("i1"), jpos("jpos"), jpos0("jpos0"), jpos1("jpos1"), k0("k0"), k1("k1"); + A(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); + if (statfile.is_open()) { + statfile + << "ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);" << std::endl + << "B1_dimension: " << B.getDimension(0) << ", B2_dimension: " << B.getDimension(1) << ", vals: " << B.getStorage().getValues().getSize() << std::endl + << "C1_dimension: " << C.getDimension(0) << ", C2_dimension: " << C.getDimension(1) << ", vals: " << C.getStorage().getValues().getSize() << std::endl + << "D1_dimension: " << D.getDimension(0) << ", D2_dimension: " << D.getDimension(1) << ", vals: " << D.getStorage().getValues().getSize() << std::endl + << "E1_dimension: " << F.getDimension(0) << ", E2_dimension: " << F.getDimension(1) << ", vals: " << F.getStorage().getValues().getSize() << std::endl + << std::endl; + } + + // IndexStmt stmt = A.getAssignment().concretize(); + IndexStmt stmt = makeReductionNotation(A.getAssignment()); + stmt = makeConcreteNotation(stmt); + stmt = reorderLoopsTopologically(stmt); + stmt = loopFusionOverFission(stmt, A.getAssignment(), "b", 1); + stmt = stmt + .split(i, i0, i1, 16); + stmt = insertTemporaries(stmt); + stmt = parallelizeOuterLoop(stmt); + + A.compile(stmt); + A.assemble(); + + + ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); + IndexStmt refStmt = makeReductionNotation(ref.getAssignment()); + refStmt = makeConcreteNotation(refStmt); + refStmt = insertTemporaries(refStmt); + refStmt = refStmt + .split(i, i0, i1, 16) + .reorder({i0, i1, j, k, l}); + stmt = insertTemporaries(stmt); // NOTE(review): this reassigns `stmt`, which was already passed to A.compile() above and is not read again in the visible remainder of this loop body -- looks like a copy-paste slip for `refStmt`; confirm intent before changing. + refStmt = parallelizeOuterLoop(refStmt); + ref.compile(refStmt); + ref.assemble(); + + Tensor ref1({B.getDimension(0), 
B.getDimension(1)}, csr); + Tensor ref2({B.getDimension(0), ldim}, rm); + ref1(i,j)=B(i,j)*C(i,k)*D(j,k); + ref2(i,l)=ref1(i,j)*F(j,l); + + IndexStmt ref1Stmt = ref1.getAssignment().concretize(); // anyway Ryan's kernel is used here + + ref1Stmt = ref1Stmt.split(i, i0, i1, 16); + // .pos(j, jpos, B(i,j)); + // .split(k, k0, k1, 8); + // .reorder({i0, i1, jpos0, k, jpos1}); + // .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces) + // .parallelize(jpos1, ParallelUnit::CPUVector, OutputRaceStrategy::ParallelReduction); + // ref1Stmt.split(i, ); + // stmt = scheduleSDDMMCPU_forfuse(ref1Stmt, B); + // IndexStmt ref1Stmt = makeReductionNotation(ref1.getAssignment()); + // ref1Stmt = makeConcreteNotation(ref1Stmt); + ref1Stmt = insertTemporaries(ref1Stmt); + ref1Stmt = parallelizeOuterLoop(ref1Stmt); + ref1.compile(ref1Stmt); + ref1.assemble(); + + IndexStmt ref2Stmt = makeReductionNotation(ref2.getAssignment()); // Ryan's SpMM kernel is used here + ref2Stmt = makeConcreteNotation(ref2Stmt); + ref2Stmt = insertTemporaries(ref2Stmt); + ref2Stmt = parallelizeOuterLoop(ref2Stmt); + ref2.compile(ref2Stmt); + ref2.assemble(); + + std::cout << "compute start\n"; + taco::util::TimeResults timevalue; + bool time = true; + + std::string sofile_fused = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/fused_kernel.so"; + TOOL_BENCHMARK_TIMER(A.compute(statfile), "\n\nFused Kernel: ", timevalue); + if (statfile.is_open()) { + statfile << "fused time: "; + statfile << timevalue.mean << std::endl; + } else { std::cout << " stat file is not open\n"; } + + statfile << "\nseparate execution\n"; + + // // std::string sofile_sddmm = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/csr_dense_spmm.so"; + // std::string sofile_sddmm = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/csr_dense_dense_sddmm.so"; + // TOOL_BENCHMARK_TIMER(ref1.compute(statfile, sofile_sddmm), "\n\nSDDMM Kernel: ", timevalue); + // if 
(statfile.is_open()) { + // statfile << "sddmm time: "; + // statfile << timevalue.mean << std::endl; + // } else { std::cout << " stat file is not open\n"; } + + // std::string sofile_sddmm_ryan = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/sddmm_ryan.so"; + // TOOL_BENCHMARK_TIMER(ref1.compute(statfile, sofile_sddmm_ryan), "\n\nSDDMM Kernel: ", timevalue); + // if (statfile.is_open()) { + // statfile << "sddmm time: "; + // statfile << timevalue.mean << std::endl; + // } else { std::cout << " stat file is not open\n"; } + + // std::string sofile_spmm = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/csr_dense_spmm.so"; + // TOOL_BENCHMARK_TIMER(ref2.compute(statfile, sofile_spmm), "\n\nSpMM Kernel: ", timevalue); + // if (statfile.is_open()) { + // statfile << "spmm time: "; + // statfile << timevalue.mean << std::endl; + // } else { std::cout << " stat file is not open\n"; } + + // statfile << "\nreference execution \n"; + + // std::string sofile_original = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/taco_original.so"; + // TOOL_BENCHMARK_TIMER(ref.compute(statfile, sofile_original), "\n\nReference Kernel: ", timevalue); + // if (statfile.is_open()) { + // statfile << "taco reference time: "; + // statfile << timevalue << std::endl; + // } else { std::cout << " stat file is not open\n"; } + + // double* A_vals = (double*) (A.getTacoTensorT()->vals); + // double* ref_vals = (double*) (ref.getTacoTensorT()->vals); + // double* ref2_vals = (double*) (ref2.getTacoTensorT()->vals); + + // // int* A2_pos = (double*) (ref.getTacoTensorT()->vals); + + // // for (size_t q=0; q < B.getStorage().getValues().getSize(); q++) { + // // if ( abs(A_vals[q] - ref_vals[q])/abs(ref_vals[q]) > ERROR_MARGIN) { + // // std::cout << "error: results don't match i: " << q << ", avals: " << A_vals[q] << " " + // // << "refvals: " << ref_vals[q] << std::endl; + // // ASSERT_TRUE(false); + // // } + // // } + + // for 
(size_t q=0; q < A.getDimension(0)* A.getDimension(1); q++) { + // if ( abs(A_vals[q] - ref_vals[q])/abs(ref_vals[q]) > ERROR_MARGIN) { + // std::cout << "error: results don't match i: " << q << ", avals: " << A_vals[q] << " " + // << "refvals: " << ref_vals[q] << std::endl; + // ASSERT_TRUE(false); + // } + // } + // for (size_t q=0; q < A.getDimension(0)* A.getDimension(1); q++) { + // if ( abs(A_vals[q] - ref2_vals[q])/abs(ref2_vals[q]) > ERROR_MARGIN) { + // std::cout << "error: results don't match i: " << q << ", avals: " << A_vals[q] << " " + // << "refvals: " << ref2_vals[q] << std::endl; + // ASSERT_TRUE(false); + // } + // } + // for (int q= 0; q< A_vals + // for (int q = 0; q < A.getDimension(0); ++q) { + // for (int w = 0; w < A.getDimension(1); ++w) { + // if ( abs(A(q,w) - ref(q,w))/abs(ref(q,w)) > ERROR_MARGIN) { + // std::cout << "error: results don't match A("<< q << "," << w << "): " + // << A(q,w) << ", ref: " << ref(q,w) << std::endl; + // ASSERT_TRUE(false); + // } + // } + // } + // ASSERT_TENSOR_EQ(A, ref); + + } // end of for loop + + + if (statfile.is_open()) { + statfile.close(); + } +} + + + + +TEST(scheduling_eval, hadamardFused) { + if (should_use_CUDA_codegen() || should_use_ISPC_codegen()) { + return; + } + + ofstream statfile; + statfile.open( + "/home/min/a/kadhitha/workspace/my_taco/taco/test/stats/hadamard-gemm.txt", std::ios::app); + if (statfile.is_open()) { + statfile << "\nsddmm-spmm execution\n"; + statfile << "\n-----------------------------------------\n"; + } + + std::default_random_engine gen(0); + std::uniform_real_distribution unif(0.0, 1.0); + + Format csr({dense, sparse}); + Format rm({dense, dense}); + int kdim = 128; + int ldim = 128; + + vector filenums = {2,3,4,5,6,7,8,9,10,12,15}; + // vector filenums = {8,9,10,12}; + + for (auto filenum : filenums) { + + // int filenum = 15; + + std::vector matfiles = { + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/synthetic/synthetic.mtx", + 
"/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx", // 2 + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rma10/rma10.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/cant/cant.mtx", // 5 + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/consph/consph.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/shipsec1/shipsec1.mtx", // 8 + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/scircuit/scircuit.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx", // 10 + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/wtk/pwtk.mtx", + "/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx", // 12 + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/wiki-Talk/wiki-Talk.mtx", // 13 + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/com-Orkut/com-Orkut.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx", // 15 + "/home/min/a/kadhitha/workspace/my_taco/FusedMM/dataset/harvard.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/twitter7/twitter7.mtx" + }; + std::vector matfilesrw = { + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/synthetic.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/cage3.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/bcsstk17.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/pdb1HYS.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/rma10.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/cant.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/consph.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/cop20k_A.mtx", + 
"/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/shipsec1.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/scircuit.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/wtk/pwtk.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/webbase-1M.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/wiki-Talk.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/com-Orkut.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/circuit5M.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/harvard.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/twitter7.mtx" + }; + + std::string matfile = matfiles[filenum]; + std::cout << "reading B mat mtx\n"; + Tensor B = read(matfile, csr, true); + B.setName("B"); + B.pack(); + // write(matfilesrw[filenum], B); + + if (statfile.is_open()) { + statfile << matfile << std::endl; + } + + std::cout << "B dim0: " << B.getDimension(0) << ", dim1: " << B.getDimension(1) << std::endl; + std::cout << "adding c mat\n"; + Tensor C({B.getDimension(1), kdim}, rm); + for (int i = 0; i < C.getDimension(0); ++i) { + for (int j = 0; j < C.getDimension(1); ++j) { + C.insert({i,j}, unif(gen)); + } + } + std::cout << "packing C mat\n"; + C.pack(); + + Tensor D({B.getDimension(1), kdim}, rm); + for (int i = 0; i < D.getDimension(0); ++i) { + for (int j = 0; j < D.getDimension(1); ++j) { + D.insert({i,j}, unif(gen)); + } + } + std::cout << "packing D mat\n"; + D.pack(); + + Tensor F({kdim, ldim}, rm); + for (int i = 0; i < F.getDimension(0); ++i) { + for (int j = 0; j < F.getDimension(1); ++j) { + F.insert({i,j}, unif(gen)); + } + } + std::cout << "packing F mat\n"; + F.pack(); + + Tensor A({B.getDimension(0), ldim}, rm); + Tensor ref({B.getDimension(0), ldim}, rm); + IndexVar i, j, k, l, m; + IndexVar i0("i0"), i1("i1"), l0("l0"), l1("l1"), jpos("jpos"), jpos0("jpos0"), 
jpos1("jpos1"), k0("k0"), k1("k1"); + A(i,l)=B(i,j)*C(j,k)*D(j,k)*F(k,l); + if (statfile.is_open()) { + statfile + << "ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);" << std::endl + << "B1_dimension: " << B.getDimension(0) << ", B2_dimension: " << B.getDimension(1) << ", vals: " << B.getStorage().getValues().getSize() << std::endl + << "C1_dimension: " << C.getDimension(0) << ", C2_dimension: " << C.getDimension(1) << ", vals: " << C.getStorage().getValues().getSize() << std::endl + << "D1_dimension: " << D.getDimension(0) << ", D2_dimension: " << D.getDimension(1) << ", vals: " << D.getStorage().getValues().getSize() << std::endl + << "E1_dimension: " << F.getDimension(0) << ", E2_dimension: " << F.getDimension(1) << ", vals: " << F.getStorage().getValues().getSize() << std::endl + << std::endl; + } + + // IndexStmt stmt = A.getAssignment().concretize(); + IndexStmt stmt = makeReductionNotation(A.getAssignment()); + stmt = makeConcreteNotation(stmt); + stmt = reorderLoopsTopologically(stmt); + stmt = stmt.reorder({i, j, k, l}); + stmt = loopFusionOverFission(stmt, A.getAssignment(), "b", 1); + stmt = stmt + .split(i, i0, i1, 16); + stmt = insertTemporaries(stmt); + stmt = parallelizeOuterLoop(stmt); + printToFile("fusedMMFusedPar", stmt); + + A.compile(stmt); + A.assemble(); + + + ref(i,l)=B(i,j)*C(j,k)*D(j,k)*F(k,l); + IndexStmt refStmt = makeReductionNotation(ref.getAssignment()); + refStmt = makeConcreteNotation(refStmt); + refStmt = refStmt + .split(i, i0, i1, 16) + .reorder({i0, i1, j, k, l}); + refStmt = insertTemporaries(refStmt); + refStmt = parallelizeOuterLoop(refStmt); + ref.compile(refStmt); + ref.assemble(); + + Tensor ref1({B.getDimension(0), kdim}, rm); + Tensor ref2({B.getDimension(0), ldim}, rm); + ref1(i,k)=B(i,j)*C(j,k)*D(j,k); + ref2(i,l)=ref1(i,k)*F(k,l); + + // IndexStmt ref1Stmt = ref1.getAssignment().concretize(); + + // ref1Stmt = ref1Stmt.split(i, i0, i1, 16); + // // .pos(j, jpos, B(i,j)); + // // .split(k, k0, k1, 8); + // // .reorder({i0, i1, 
jpos0, k, jpos1}); + // // .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces) + // // .parallelize(jpos1, ParallelUnit::CPUVector, OutputRaceStrategy::ParallelReduction); + // // ref1Stmt.split(i, ); + // // stmt = scheduleSDDMMCPU_forfuse(ref1Stmt, B); + IndexStmt ref1Stmt = makeReductionNotation(ref1.getAssignment()); + ref1Stmt = makeConcreteNotation(ref1Stmt); + ref1Stmt = ref1Stmt + .split(i, i0, i1, 16) + .reorder({i0, i1, j, k}); + // .pos(j, jpos, B(i,j)) + // .split(jpos, jpos0, jpos1, 32) + // .split(k, k0, k1, 32) + // .reorder({i0, i1, jpos0, k0, jpos1, k1}); + ref1Stmt = insertTemporaries(ref1Stmt); + ref1Stmt = parallelizeOuterLoop(ref1Stmt); + ref1.compile(ref1Stmt); + ref1.assemble(); + + IndexStmt ref2Stmt = makeReductionNotation(ref2.getAssignment()); + ref2Stmt = makeConcreteNotation(ref2Stmt); + ref2Stmt = ref2Stmt + .split(i, i0, i1, 32) + .split(k, k0, k1, 32) + .split(l, l0, l1, 32) + .reorder({i0, k0, l0, i1, k1, l1}); + ref2Stmt = insertTemporaries(ref2Stmt); + ref2Stmt = parallelizeOuterLoop(ref2Stmt); + ref2.compile(ref2Stmt); + ref2.assemble(); + + std::cout << "compute start\n"; + taco::util::TimeResults timevalue; + bool time = true; + + TOOL_BENCHMARK_TIMER(A.compute(statfile), "\n\nFused Kernel: ", timevalue); + if (statfile.is_open()) { + statfile << "fused time: "; + statfile << timevalue.mean << std::endl; + } else { std::cout << " stat file is not open\n"; } + + // // std::string sofile_sddmm = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/csr_dense_spmm.so"; + // std::string sofile_sddmm = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/csr_dense_dense_sddmm.so"; + TOOL_BENCHMARK_TIMER(ref1.compute(statfile), "\n\nHadamard Kernel: ", timevalue); + if (statfile.is_open()) { + statfile << "hadamard time: "; + statfile << timevalue.mean << std::endl; + } else { std::cout << " stat file is not open\n"; } + + // std::string sofile_sddmm_ryan = 
"/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/sddmm_ryan.so"; + // TOOL_BENCHMARK_TIMER(ref1.compute(statfile, sofile_sddmm_ryan), "\n\nSDDMM Kernel: ", timevalue); + // if (statfile.is_open()) { + // statfile << "sddmm time: "; + // statfile << timevalue.mean << std::endl; + // } else { std::cout << " stat file is not open\n"; } + + // std::string sofile_spmm = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/csr_dense_spmm.so"; + TOOL_BENCHMARK_TIMER(ref2.compute(statfile), "\n\nGeMM Kernel: ", timevalue); + if (statfile.is_open()) { + statfile << "gemm time: "; + statfile << timevalue.mean << std::endl; + } else { std::cout << " stat file is not open\n"; } + + // std::string sofile_original = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/taco_original.so"; + TOOL_BENCHMARK_TIMER(ref.compute(statfile), "\n\nReference Kernel: ", timevalue); + if (statfile.is_open()) { + statfile << "taco reference time: "; + statfile << timevalue << std::endl; + } else { std::cout << " stat file is not open\n"; } + + double* A_vals = (double*) (A.getTacoTensorT()->vals); + double* ref_vals = (double*) (ref.getTacoTensorT()->vals); + double* ref2_vals = (double*) (ref2.getTacoTensorT()->vals); + + // // int* A2_pos = (double*) (ref.getTacoTensorT()->vals); + + for (int q=0; q < A.getDimension(0)* A.getDimension(1); q++) { + if ( abs(A_vals[q] - ref_vals[q])/abs(ref_vals[q]) > ERROR_MARGIN) { + std::cout << "error: results don't match i: " << q << ", avals: " << A_vals[q] << " " + << "refvals: " << ref_vals[q] << std::endl; + ASSERT_TRUE(false); + } + } + + for (int q=0; q < A.getDimension(0)* A.getDimension(1); q++) { + if ( abs(A_vals[q] - ref2_vals[q])/abs(ref2_vals[q]) > ERROR_MARGIN) { + std::cout << "error: results don't match i: " << q << ", avals: " << A_vals[q] << " " + << "refvals: " << ref2_vals[q] << std::endl; + ASSERT_TRUE(false); + } + } + + } // end of for loop + + if (statfile.is_open()) { + 
statfile.close(); + } + +} + + + + + + +TEST(scheduling_eval, mttkrpFusedWithSyntheticData) { + if (should_use_CUDA_codegen() || should_use_ISPC_codegen()) { + return; + } + taco_set_num_threads(NUM_THREADS_TO_USE); + + std::default_random_engine gen(0); + std::uniform_real_distribution unif(0.0, 1.0); + // Predeclare the storage formats that the inputs and output will be stored as. + // To define a format, you must specify whether each dimension is dense or + // sparse and (optionally) the order in which dimensions should be stored. The + // formats declared below correspond to compressed sparse fiber (csf) and + // row-major dense (rm). + Format csf({Sparse,Sparse,Sparse}); + Format rm({Dense,Dense}); + Format sd({Dense,Dense}); + + int NUM_I = 1021/20; + int NUM_J = 1039/20; + int NUM_K = 1057/20; + int NUM_L = 1232/20; + int NUM_M = 1231/20; + float SPARSITY = .1; + Tensor A("A", {NUM_I, NUM_M}, sd); + Tensor B("B", {NUM_I, NUM_K, NUM_L}, csf); + Tensor C("C", {NUM_K, NUM_J}, rm); + Tensor D("D", {NUM_L, NUM_J}, rm); + Tensor E("E", {NUM_J, NUM_M}, rm); + Tensor ref({NUM_I, NUM_M}, sd); + + srand(549694); + for (int i = 0; i < NUM_I; i++) { + for (int k = 0; k < NUM_K; k++) { + for (int l = 0; l < NUM_L; l++) { + float rand_float = (float) rand() / (float) (RAND_MAX); + if (rand_float < SPARSITY) { + B.insert({i, k, l}, (double) ((int) (rand_float * 3 / SPARSITY))); + } + } + } + } + B.pack(); + write("/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/synthetic.tns", B); + + // Generate a random dense matrix and store it in row-major (dense) format. + // Matrices correspond to order-2 tensors in taco. 
+ for (int k = 0; k < NUM_K; k++) { + for (int j = 0; j < NUM_J; j++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + C.insert({k, j}, (double) ((int) (rand_float*3))); + } + } + C.pack(); + + for (int l = 0; l < NUM_L; l++) { + for (int j = 0; j < NUM_J; j++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + D.insert({l, j}, (double) ((int) (rand_float*3))); + } + } + D.pack(); + + for (int i = 0; i < E.getDimension(0); ++i) { + for (int j = 0; j < E.getDimension(1); ++j) { + E.insert({i,j}, unif(gen)); + } + } + E.pack(); + + // Define the MTTKRP computation using index notation. + IndexVar i, k, l, j, m; + A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m); + + + IndexStmt stmt = makeReductionNotation(A.getAssignment()); + stmt = makeConcreteNotation(stmt); + printToFile("fusedMTTKRPConcrete", stmt); + + stmt = reorderLoopsTopologically(stmt); + printToFile("fusedMTTKRPOrdered", stmt); + + stmt = loopFusionOverFission(stmt, A.getAssignment(), "b", 1); + printToFile("fusedMTTKRPFused", stmt); + + stmt = insertTemporaries(stmt); + printToFile("fusedMTTKRPWithTemps", stmt); + stmt = parallelizeOuterLoop(stmt); + printToFile("fusedMTTKRPFusedPar", stmt); + + + // At this point, we have defined how entries in the output matrix should be + // computed from entries in the input tensor and matrices but have not actually + // performed the computation yet. To do so, we must first tell taco to generate + // code that can be executed to compute the MTTKRP operation. + A.compile(stmt); + // We can now call the functions taco generated to assemble the indices of the + // output matrix and then actually compute the MTTKRP. 
+ A.assemble(); + + + ref(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m); + IndexStmt refStmt = makeReductionNotation(ref.getAssignment()); + refStmt = makeConcreteNotation(refStmt); + refStmt = insertTemporaries(refStmt); + refStmt = parallelizeOuterLoop(refStmt); + ref.compile(refStmt); + ref.assemble(); + + // Tensor ref2({NUM_I, NUM_J}, sd); + // ref2(i,j) = B(i,k,l) * D(l,j) * C(k,j); + // IndexStmt ref2Stmt = makeReductionNotation(ref2.getAssignment()); + // ref2Stmt = makeConcreteNotation(ref2Stmt); + // ref2Stmt = insertTemporaries(ref2Stmt); + // ref2Stmt = parallelizeOuterLoop(ref2Stmt); + // ref2.compile(ref2Stmt); + // ref2.assemble(); + + // Tensor ref3({NUM_I, NUM_M}, sd); + // ref3(i,m) = ref2(i,j) * E(j,m); + // IndexStmt ref3Stmt = makeReductionNotation(ref3.getAssignment()); + // ref3Stmt = makeConcreteNotation(ref3Stmt); + // ref3Stmt = insertTemporaries(ref3Stmt); + // ref3Stmt = parallelizeOuterLoop(ref3Stmt); + // ref3.compile(ref3Stmt); + // ref3.assemble(); + + std::cout << "compute start\n"; + taco::util::TimeResults timevalue; + bool time = true; + // TOOL_BENCHMARK_TIMER(ref.compute(), "\n\nReference ISPC: ", timevalue); + TOOL_BENCHMARK_TIMER(A.compute(), "\n\nFused MTTKRP+SPMM: ", timevalue); + TOOL_BENCHMARK_TIMER(ref.compute(), "\n\nReference MTTKRP+SPMM: ", timevalue); + // TOOL_BENCHMARK_TIMER(ref2.compute(), "\n\nReference MTTKRP: ", timevalue); + // TOOL_BENCHMARK_TIMER(ref3.compute(), "\n\nReference SPMM: ", timevalue); + ASSERT_TENSOR_EQ(ref, A); + // ASSERT_TENSOR_EQ(ref, ref3); + +} + + +TEST(scheduling_eval, mttkrpFused) { + if (should_use_CUDA_codegen() || should_use_ISPC_codegen()) { + return; + } + + taco_set_num_threads(NUM_THREADS_TO_USE); + + ofstream statfile; + statfile.open( + "/home/min/a/kadhitha/workspace/my_taco/taco/test/stats/mttkrp-spmm.txt", std::ios::app); + if (statfile.is_open()) { + statfile << "\nmttkrp-spmm execution\n"; + statfile << "\n-----------------------------------------\n"; + } + + 
std::default_random_engine gen(0); + std::uniform_real_distribution unif(0.0, 1.0); + // Predeclare the storage formats that the inputs and output will be stored as. + // To define a format, you must specify whether each dimension is dense or + // sparse and (optionally) the order in which dimensions should be stored. The + // formats declared below correspond to compressed sparse fiber (csf) and + // row-major dense (rm). + Format csf({Dense,Sparse,Sparse}); + Format rm({Dense,Dense}); + Format sd({Dense,Dense}); + int jDim = 32; + int mDim = 64; + + int matfilenum = 3; + + // Load a sparse order-3 tensor from file (stored in the FROSTT format) and + // store it as a compressed sparse fiber tensor. The tensor in this example + // can be download from: http://frostt.io/tensors/nell-2/ + std::vector matfiles = { + "/home/min/a/kadhitha/ispc-examples/data/tns/matmul_5-5-5.tns", + "/home/min/a/kadhitha/ispc-examples/data/tns/delicious-3d.tns", + "/home/min/a/kadhitha/ispc-examples/data/tns/flickr-3d.tns", // 2 + "/home/min/a/kadhitha/ispc-examples/data/tns/nell-2.tns", // 3 + "/home/min/a/kadhitha/ispc-examples/data/tns/nell-1.tns", // 4 + "/home/min/a/kadhitha/ispc-examples/data/tns/vast-2015-mc1-3d.tns", // 5 + "/home/min/a/kadhitha/ispc-examples/data/tns/darpa1998.tns", // 6 + "/home/min/a/kadhitha/ispc-examples/data/tns/freebase_music.tns", + "/home/min/a/kadhitha/ispc-examples/data/tns/freebase_sampled.tns" // 8 + }; + std::vector matfilesrw = { + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/matmul_5-5-5.tns", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/delicious-3d.tns", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/flickr-3d.tns", // 2 + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/nell-2.tns", // 3 + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/nell-1.tns", // 4 + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/vast-2015-mc1-3d.tns", // 5 + 
"/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/darpa1998.tns", // 6 + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/freebase_music.tns", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/freebase_sampled.tns" + }; + std::string matfile = matfiles[matfilenum]; + Tensor B = read(matfile, csf, true); + // write(matfilesrw[matfilenum], B); + + // Generate a random dense matrix and store it in row-major (dense) format. + // Matrices correspond to order-2 tensors in taco. + Tensor C({B.getDimension(1), jDim}, rm); + for (int i = 0; i < C.getDimension(0); ++i) { + for (int j = 0; j < C.getDimension(1); ++j) { + C.insert({i,j}, unif(gen)); + } + } + C.pack(); + + // Generate another random dense matrix and store it in row-major format. + Tensor D({B.getDimension(2), jDim}, rm); + for (int i = 0; i < D.getDimension(0); ++i) { + for (int j = 0; j < D.getDimension(1); ++j) { + D.insert({i,j}, unif(gen)); + } + } + D.pack(); + + Tensor E({jDim, mDim}, rm); + for (int i = 0; i < E.getDimension(0); ++i) { + for (int j = 0; j < E.getDimension(1); ++j) { + E.insert({i,j}, unif(gen)); + } + } + E.pack(); + + if (statfile.is_open()) { + statfile + << matfile << std::endl + << "A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m)" << std::endl + << "B1_dimension: " << B.getDimension(0) << ", B2_dimension: " << B.getDimension(1) << ", B3_dimension: " << B.getDimension(0) << ", vals: " << B.getStorage().getValues().getSize() << std::endl + << "C1_dimension: " << C.getDimension(0) << ", C2_dimension: " << C.getDimension(1) << ", vals: " << C.getStorage().getValues().getSize() << std::endl + << "D1_dimension: " << D.getDimension(0) << ", D2_dimension: " << D.getDimension(1) << ", vals: " << D.getStorage().getValues().getSize() << std::endl + << "E1_dimension: " << E.getDimension(0) << ", E2_dimension: " << E.getDimension(1) << ", vals: " << E.getStorage().getValues().getSize() << std::endl + << std::endl; + } + + // Declare the output matrix to be a dense matrix with 
25 columns and the same + // number of rows as the number of slices along the first dimension of input + // tensor B, to be also stored as a row-major dense matrix. + Tensor A({B.getDimension(0), mDim}, sd); + Tensor ref({B.getDimension(0), mDim}, sd); + + // Define the MTTKRP computation using index notation. + IndexVar i, k, l, j, m; + IndexVar i1("i1"), i2("i2"), j1("j1"), j2("j2"), m1("m1"), m2("m2"); + + A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m); + + IndexStmt stmt = makeReductionNotation(A.getAssignment()); + stmt = makeConcreteNotation(stmt); + stmt = reorderLoopsTopologically(stmt); + // stmt = stmt.reorder({i,j,k,l,m}); + stmt = loopFusionOverFission(stmt, A.getAssignment(), "b", 1); + stmt = stmt.split(i, i1, i2, 16); + stmt = insertTemporaries(stmt); + stmt = parallelizeOuterLoop(stmt); + printToFile("fusedMTTKRPFusedPar", stmt); + A.compile(stmt); + A.assemble(); + + + ref(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m); + IndexStmt refStmt = makeReductionNotation(ref.getAssignment()); + refStmt = makeConcreteNotation(refStmt); + refStmt = refStmt + .split(i, i1, i2, 16); + refStmt = insertTemporaries(refStmt); + refStmt = parallelizeOuterLoop(refStmt); + ref.compile(refStmt); + ref.assemble(); + + Tensor ref2({B.getDimension(0), jDim}, sd); + ref2(i,j) = B(i,k,l) * D(l,j) * C(k,j); + IndexStmt ref2Stmt = makeReductionNotation(ref2.getAssignment()); + ref2Stmt = makeConcreteNotation(ref2Stmt); + ref2Stmt = ref2Stmt + .split(i, i1, i2, 16); + ref2Stmt = insertTemporaries(ref2Stmt); + ref2Stmt = parallelizeOuterLoop(ref2Stmt); + ref2.compile(ref2Stmt); + ref2.assemble(); + + Tensor ref2_ryan({B.getDimension(0), jDim}, sd); + ref2_ryan(i,j) = B(i,k,l) * D(l,j) * C(k,j); + + IndexStmt ref2RyanStmt = makeReductionNotation(ref2_ryan.getAssignment()); + ref2RyanStmt = makeConcreteNotation(ref2RyanStmt); + + IndexExpr precomputeExpr = ref2RyanStmt.as().getStmt().as().getStmt() + .as().getStmt().as().getStmt() + .as().getRhs().as().getA(); + TensorVar w("w", 
Type(Float64, {Dimension(j)}), taco::dense); + ref2RyanStmt = ref2RyanStmt.split(i, i1, i2, 16) + .reorder({i1, i2, k, l, j}) + .precompute(precomputeExpr, j, j, w) + .parallelize(i1, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces); + ref2RyanStmt = insertTemporaries(ref2RyanStmt); + // ref2RyanStmt = parallelizeOuterLoop(ref2RyanStmt); + ref2_ryan.compile(ref2RyanStmt); + ref2_ryan.assemble(); + + Tensor ref3({B.getDimension(0), mDim}, sd); + ref3(i,m) = ref2(i,j) * E(j,m); + IndexStmt ref3Stmt = makeReductionNotation(ref3.getAssignment()); + ref3Stmt = makeConcreteNotation(ref3Stmt); + ref3Stmt = ref3Stmt + .split(i, i1, i2, 16) + .split(j, j1, j2, 16) + .split(m, m1, m2, 16) + .reorder({i1, j1, m1, i2, j2, m2}) + .parallelize(i1, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces); + ref3Stmt = insertTemporaries(ref3Stmt); + ref3Stmt = parallelizeOuterLoop(ref3Stmt); + ref3.compile(ref3Stmt); + ref3.assemble(); + + + std::cout << "compute start\n"; + taco::util::TimeResults timevalue; + bool time = true; + + TOOL_BENCHMARK_TIMER(ref2.compute(statfile), "\n\nDefault MTTKRP: ", timevalue); + if (statfile.is_open()) { + statfile << "default mttkrp time: "; + statfile << timevalue.mean << std::endl; + } else { std::cout << " stat file is not open\n"; } + + TOOL_BENCHMARK_TIMER(ref2_ryan.compute(statfile), "\n\nRyan MTTKRP workspace: ", timevalue); + if (statfile.is_open()) { + statfile << "ryan mttkrp workspace time: "; + statfile << timevalue.mean << std::endl; + } else { std::cout << " stat file is not open\n"; } + + double* ref2_vals = (double*) (ref2.getTacoTensorT()->vals); + double* ref2_ryan_vals = (double*) (ref2_ryan.getTacoTensorT()->vals); + for (int q=0; q < B.getDimension(0)* jDim; q++) { + if ( abs(ref2_vals[q] - ref2_ryan_vals[q])/abs(ref2_ryan_vals[q]) > ERROR_MARGIN) { + std::cout << "error: results don't match i: " << q << ", avals: " << ref2_vals[q] << " " + << "refvals: " << ref2_ryan_vals[q] << std::endl; + ASSERT_TRUE(false); + } + } + 
+ TOOL_BENCHMARK_TIMER(ref3.compute(statfile), "\n\nGeMM time: ", timevalue); + if (statfile.is_open()) { + statfile << "GeMM time: "; + statfile << timevalue.mean << std::endl; + } else { std::cout << " stat file is not open\n"; } + + + TOOL_BENCHMARK_TIMER(ref.compute(statfile), "\n\nReference MTTKRP+GEMM: ", timevalue); + if (statfile.is_open()) { + statfile << "reference asymptotic blowup time: "; + statfile << timevalue.mean << std::endl; + } else { std::cout << " stat file is not open\n"; } + + double* ref3_vals = (double*) (ref3.getTacoTensorT()->vals); + double* ref_vals = (double*) (ref.getTacoTensorT()->vals); + for (int q=0; q < B.getDimension(0)* mDim; q++) { + if ( abs(ref3_vals[q] - ref_vals[q])/abs(ref_vals[q]) > ERROR_MARGIN) { + std::cout << "error: results don't match i: " << q << ", avals: " << ref3_vals[q] << " " + << "refvals: " << ref_vals[q] << std::endl; + ASSERT_TRUE(false); + } + } + + TOOL_BENCHMARK_TIMER(A.compute(statfile), "\n\nFused MTTKRP+GEMM: ", timevalue); + if (statfile.is_open()) { + statfile << "fused mttkrp+gemm time: "; + statfile << timevalue.mean << std::endl; + } else { std::cout << " stat file is not open\n"; } + + if (statfile.is_open()) { + statfile.close(); + } + + double* A_vals = (double*) (A.getTacoTensorT()->vals); + for (int q=0; q < B.getDimension(0)* mDim; q++) { + if ( abs(A_vals[q] - ref_vals[q])/abs(ref_vals[q]) > ERROR_MARGIN) { + std::cout << "error: results don't match i: " << q << ", avals: " << A_vals[q] << " " + << "refvals: " << ref_vals[q] << std::endl; + ASSERT_TRUE(false); + } + } + + +} + +TEST(scheduling_eval, ttmFusedWithSyntheticData) { + if (should_use_CUDA_codegen()) { + return; + } + + taco_set_num_threads(NUM_THREADS_TO_USE); + + std::default_random_engine gen(0); + std::uniform_real_distribution unif(0.0, 1.0); + Format csf({Sparse,Sparse,Sparse}); + Format custom({Sparse,Sparse,Dense}); + Format rm({Dense,Dense}); + + int NUM_I = 5; + int NUM_J = 5; + int NUM_K = 5; + int NUM_L = 64; + int 
NUM_M = 1024; + float SPARSITY = .1; + + Tensor B("B", {NUM_I, NUM_J, NUM_K}, csf); + srand(549694); + for (int i = 0; i < NUM_I; i++) { + for (int j = 0; j < NUM_J; j++) { + for (int k = 0; k < NUM_K; k++) { + float rand_float = (float) rand() / (float) (RAND_MAX); + if (rand_float < SPARSITY) { + B.insert({i, j, k}, (double) ((int) (rand_float * 3 / SPARSITY))); + } + } + } + } + B.pack(); + write("/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/synthetic.tns", B); + + // Generate a random dense matrix and store it in row-major (dense) format. + // Matrices correspond to order-2 tensors in taco. + Tensor C({B.getDimension(2), NUM_L}, rm); + for (int i = 0; i < C.getDimension(0); ++i) { + for (int j = 0; j < C.getDimension(1); ++j) { + C.insert({i,j}, unif(gen)); + } + } + C.pack(); + + // Generate another random dense matrix and store it in row-major format. + Tensor D({NUM_L, NUM_M}, rm); + for (int i = 0; i < D.getDimension(0); ++i) { + for (int j = 0; j < D.getDimension(1); ++j) { + D.insert({i,j}, unif(gen)); + } + } + D.pack(); + + Tensor A({B.getDimension(0), B.getDimension(1), NUM_M}, custom); + Tensor ref({B.getDimension(0), B.getDimension(1), NUM_M}, custom); + + // Define the MTTKRP computation using index notation. 
+ IndexVar i, j, k, l, m; + A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m); + + IndexStmt stmt = makeReductionNotation(A.getAssignment()); + stmt = makeConcreteNotation(stmt); + printToFile("fusedTTMTTKRPConcrete", stmt); + + stmt = reorderLoopsTopologically(stmt); + printToFile("fusedTTMOrdered", stmt); + + stmt = loopFusionOverFission(stmt, A.getAssignment(), "b", 1); + printToFile("fusedTTMFused", stmt); + + stmt = insertTemporaries(stmt); + printToFile("fusedTTMWithTemps", stmt); + stmt = parallelizeOuterLoop(stmt); + printToFile("fusedTTMFinal", stmt); + + + // At this point, we have defined how entries in the output matrix should be + // computed from entries in the input tensor and matrices but have not actually + // performed the computation yet. To do so, we must first tell taco to generate + // code that can be executed to compute the MTTKRP operation. + A.compile(stmt); + // We can now call the functions taco generated to assemble the indices of the + // output matrix and then actually compute the MTTKRP. 
+ A.assemble(); + + + ref(i,j,m) = B(i,j,k) * C(k,l) * D(l,m); + IndexStmt refStmt = makeReductionNotation(ref.getAssignment()); + refStmt = makeConcreteNotation(refStmt); + refStmt = insertTemporaries(refStmt); + refStmt = parallelizeOuterLoop(refStmt); + printToFile("tacoFusedTTM", refStmt); + ref.compile(refStmt); + ref.assemble(); + + Tensor ref1({B.getDimension(0), B.getDimension(1), NUM_L}, custom); + ref1(i,j,l) = B(i,j,k) * C(k,l); + IndexStmt ref1Stmt = makeReductionNotation(ref1.getAssignment()); + ref1Stmt = makeConcreteNotation(ref1Stmt); + ref1Stmt = insertTemporaries(ref1Stmt); + ref1Stmt = parallelizeOuterLoop(ref1Stmt); + ref1.compile(ref1Stmt); + ref1.assemble(); + + Tensor ref2({B.getDimension(0), B.getDimension(1), NUM_M}, custom); + ref2(i,j,m) = ref1(i,j,l) * D(l,m); + IndexStmt ref2Stmt = makeReductionNotation(ref2.getAssignment()); + ref2Stmt = makeConcreteNotation(ref2Stmt); + ref2Stmt = insertTemporaries(ref2Stmt); + ref2Stmt = parallelizeOuterLoop(ref2Stmt); + ref2.compile(ref2Stmt); + ref2.assemble(); + + Tensor ref3({B.getDimension(2), NUM_M}, rm); + ref3(k,m) = C(k,l) * D(l,m); + IndexStmt ref3Stmt = makeReductionNotation(ref3.getAssignment()); + ref3Stmt = makeConcreteNotation(ref3Stmt); + ref3Stmt = insertTemporaries(ref3Stmt); + ref3Stmt = parallelizeOuterLoop(ref3Stmt); + ref3.compile(ref3Stmt); + ref3.assemble(); + + Tensor ref4({B.getDimension(0), B.getDimension(1), NUM_M}, custom); + ref4(i,j,m) = B(i,j,k) * ref3(k,m); + IndexStmt ref4Stmt = makeReductionNotation(ref4.getAssignment()); + ref4Stmt = makeConcreteNotation(ref4Stmt); + ref4Stmt = insertTemporaries(ref4Stmt); + ref4Stmt = parallelizeOuterLoop(ref4Stmt); + ref4.compile(ref4Stmt); + ref4.assemble(); + + std::cout << "compute start\n"; + taco::util::TimeResults timevalue; + bool time = true; + // TOOL_BENCHMARK_TIMER(ref.compute(), "\n\nReference ISPC: ", timevalue); + TOOL_BENCHMARK_TIMER(A.compute(), "\n\nFused TTM->TTM: ", timevalue); + 
TOOL_BENCHMARK_TIMER(ref.compute(), "\n\nReference TTM->TTM: ", timevalue); + TOOL_BENCHMARK_TIMER(ref1.compute(), "\n\nTTM1: ", timevalue); + TOOL_BENCHMARK_TIMER(ref2.compute(), "\n\nTTM1: ", timevalue); + TOOL_BENCHMARK_TIMER(ref3.compute(), "\n\ndense: ", timevalue); + TOOL_BENCHMARK_TIMER(ref4.compute(), "\n\nTTM after dense: ", timevalue); + ASSERT_TENSOR_EQ(ref, A); + ASSERT_TENSOR_EQ(ref, ref2); + ASSERT_TENSOR_EQ(ref, ref4); + + for (int q = 0; q < A.getDimension(0); ++q) { + for (int w = 0; w < A.getDimension(1); ++w) { + for (int z = 0; z < A.getDimension(2); ++z) { + // std::cout << "(" << q << "," << w << "," << z << ")" + // << "a: " << A(q,w,z) << ", ref: " << ref(q,w,z) << std::endl; + if ( abs(A(q,w,z) - ref(q,w,z))/abs(ref(q,w,z)) > ERROR_MARGIN) { + std::cout << "error: results don't match A: " + << A(q,w,z) << ", ref: " << ref(q,w,z) << std::endl; + ASSERT_TRUE(false); + } + } + } + } + +} + +TEST(scheduling_eval, ttmFused) { + if (should_use_CUDA_codegen()) { + return; + } + + int retval, EventSet = PAPI_NULL; + retval = PAPI_hl_region_begin("dummy"); + if ( retval != PAPI_OK ) handle_error(1); + + retval = PAPI_hl_region_end("dummy"); + if ( retval != PAPI_OK ) handle_error(1); + + taco_set_num_threads(NUM_THREADS_TO_USE); + + ofstream statfile; + statfile.open( + "/home/min/a/kadhitha/workspace/my_taco/taco/test/stats/ttm-ttm.txt", std::ios::app); + if (statfile.is_open()) { + statfile << "\nttm-ttm execution\n"; + statfile << "\n-----------------------------------------\n"; + } + + std::default_random_engine gen(0); + std::uniform_real_distribution unif(0.0, 1.0); + Format csf({Dense,Sparse,Sparse}); + Format custom({Dense,Sparse,Dense}); + Format rm({Dense,Dense}); + int ldim = 32; + int mdim = 64; + + int64_t dummy_array_size = 2e6; + int64_t* dummy_array_to_flush_cache = (int64_t*) malloc(dummy_array_size*sizeof(int64_t)); + + vector matfilenums = {5}; + + for (auto matfilenum : matfilenums) { + + // int matfilenum = 0; + + + + // Load a 
sparse order-3 tensor from file (stored in the FROSTT format) and + // store it as a compressed sparse fiber tensor. The tensor in this example + // can be download from: http://frostt.io/tensors/nell-2/ + std::vector matfiles = { + "/home/min/a/kadhitha/ispc-examples/data/tns/matmul_5-5-5.tns", + "/home/min/a/kadhitha/ispc-examples/data/tns/delicious-3d.tns", + "/home/min/a/kadhitha/ispc-examples/data/tns/flickr-3d.tns", // 2 + "/home/min/a/kadhitha/ispc-examples/data/tns/nell-2.tns", // 3 + "/home/min/a/kadhitha/ispc-examples/data/tns/nell-1.tns", // 4 + "/home/min/a/kadhitha/workspace/my_taco/tns/vast-2015-mc1-3d.tns", // 5 + "/home/min/a/kadhitha/workspace/my_taco/tns/darpa1998.tns", // 6 + "/home/min/a/kadhitha/ispc-examples/data/tns/freebase_music.tns", + "/home/min/a/kadhitha/ispc-examples/data/tns/freebase_sampled.tns" + }; + std::vector matfilesrw = { + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/matmul_5-5-5.tns", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/delicious-3d.tns", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/flickr-3d.tns", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/nell-2.tns", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/nell-1.tns", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/vast-2015-mc1-3d.tns", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/darpa1998.tns", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/freebase_music.tns", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/freebase_sampled.tns" + }; + statfile << "\nfile: " << matfiles[matfilenum] << std::endl; + statfile << "----------------------------------------------------------------\n"; + + std::string matfile = matfiles[matfilenum]; + Tensor B = read(matfile, csf); + B.setName("B"); + B.pack(); + // write(matfilesrw[matfilenum], B); + + // Generate a random dense matrix and store it in row-major (dense) format. + // Matrices correspond to order-2 tensors in taco. 
+ Tensor C("C", {B.getDimension(2), ldim}, rm); + for (int i = 0; i < C.getDimension(0); ++i) { + for (int j = 0; j < C.getDimension(1); ++j) { + C.insert({i,j}, unif(gen)); + } + } + C.pack(); + + // Generate another random dense matrix and store it in row-major format. + Tensor D("D", {ldim, mdim}, rm); + for (int i = 0; i < D.getDimension(0); ++i) { + for (int j = 0; j < D.getDimension(1); ++j) { + D.insert({i,j}, unif(gen)); + } + } + D.pack(); + + if (statfile.is_open()) { + statfile + << matfile << std::endl + << "A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m)" << std::endl + << "B1_dimension: " << B.getDimension(0) << ", B2_dimension: " << B.getDimension(1) << ", B3_dimension: " << B.getDimension(2) << ", vals: " << B.getStorage().getValues().getSize() << std::endl + << "C1_dimension: " << C.getDimension(0) << ", C2_dimension: " << C.getDimension(1) << ", vals: " << C.getStorage().getValues().getSize() << std::endl + << "D1_dimension: " << D.getDimension(0) << ", D2_dimension: " << D.getDimension(1) << ", vals: " << D.getStorage().getValues().getSize() << std::endl + << std::endl; + } + + Tensor A({B.getDimension(0), B.getDimension(1), mdim}, custom); + Tensor ref({B.getDimension(0), B.getDimension(1), mdim}, custom); + Tensor refn({B.getDimension(0), B.getDimension(1), mdim}, custom); + + // Define the MTTKRP computation using index notation. 
+ IndexVar i, j, k, l, m; + IndexVar i0,i1, j0, j1, k0, k1, l0, l1, m0, m1; + A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m); + + + IndexStmt stmt = makeReductionNotation(A.getAssignment()); + stmt = makeConcreteNotation(stmt); + stmt = reorderLoopsTopologically(stmt); + stmt = loopFusionOverFission(stmt, A.getAssignment(), "b", 1); + stmt = stmt.split(i, i0, i1, 16); + stmt = insertTemporaries(stmt); + stmt = parallelizeOuterLoop(stmt); + printToFile("fusedTTMFinal", stmt); + + A.compile(stmt); + A.assemble(); + + + ref(i,j,m) = B(i,j,k) * C(k,l) * D(l,m); // TTM->TTM TACO + IndexStmt refStmt = makeReductionNotation(ref.getAssignment()); + refStmt = makeConcreteNotation(refStmt); + refStmt = refStmt + .split(i, i0, i1, 16); + refStmt = insertTemporaries(refStmt); + refStmt = parallelizeOuterLoop(refStmt); + printToFile("tacoFusedTTM", refStmt); + ref.compile(refStmt); + ref.assemble(); + + refn(i,j,m) = B(i,j,k) * C(k,l) * D(l,m); // TTM->TTM TACO + IndexStmt refnStmt = makeReductionNotation(refn.getAssignment()); + refnStmt = makeConcreteNotation(refnStmt); + refnStmt = refnStmt + .split(i, i0, i1, 16) + .reorder({i0, i1, j, k, l, m}); + refnStmt = insertTemporaries(refnStmt); + refnStmt = parallelizeOuterLoop(refnStmt); + printToFile("tacoFusedTTM", refnStmt); + refn.compile(refnStmt); + refn.assemble(); + + Tensor ref1({B.getDimension(0), B.getDimension(1), ldim}, custom); + ref1(i,j,l) = B(i,j,k) * C(k,l); // TTM1 + IndexStmt ref1Stmt = makeReductionNotation(ref1.getAssignment()); + ref1Stmt = makeConcreteNotation(ref1Stmt); + // ref1Stmt = ref1Stmt.split(i, i0, i1, 16); + ref1Stmt = insertTemporaries(ref1Stmt); + ref1Stmt = parallelizeOuterLoop(ref1Stmt); + ref1.compile(ref1Stmt); + ref1.assemble(); + + Tensor ref2({B.getDimension(0), B.getDimension(1), mdim}, custom); + ref2(i,j,m) = ref1(i,j,l) * D(l,m); // TTM2 + IndexStmt ref2Stmt = makeReductionNotation(ref2.getAssignment()); + ref2Stmt = makeConcreteNotation(ref2Stmt); + // ref2Stmt = ref2Stmt.split(i, i0, i1, 
16); + ref2Stmt = insertTemporaries(ref2Stmt); + ref2Stmt = parallelizeOuterLoop(ref2Stmt); + ref2.compile(ref2Stmt); + ref2.assemble(); + + Tensor ref3({B.getDimension(2), mdim}, rm); + ref3(k,m) = C(k,l) * D(l,m); // GeMM + IndexStmt ref3Stmt = makeReductionNotation(ref3.getAssignment()); + ref3Stmt = makeConcreteNotation(ref3Stmt); + ref3Stmt = ref3Stmt + .split(k, k0, k1, 32) + .split(l, l0, l1, 32) + .split(m, m0, m1, 32) + .reorder({k0, l0, m0, k1, l1, m1}); + ref3Stmt = insertTemporaries(ref3Stmt); + ref3Stmt = parallelizeOuterLoop(ref3Stmt); + ref3.compile(ref3Stmt); + ref3.assemble(); + + Tensor ref4({B.getDimension(0), B.getDimension(1), mdim}, custom); + ref4(i,j,m) = B(i,j,k) * ref3(k,m); // TTM1 + IndexStmt ref4Stmt = makeReductionNotation(ref4.getAssignment()); + ref4Stmt = makeConcreteNotation(ref4Stmt); + // ref4Stmt = ref4Stmt + // .split(i, i0, i1, 16); + // // .split(k, k0, k1, 16) + // .split(m, m0, m1, 16) + // .reorder({i0, i1, j, m0, k, m1}); + ref4Stmt = insertTemporaries(ref4Stmt); + ref4Stmt = parallelizeOuterLoop(ref4Stmt); + ref4.compile(ref4Stmt); + ref4.assemble(); + + std::cout << "compute start\n"; + taco::util::TimeResults timevalue; + bool time = true; + + int r = rand(); + for (int64_t i=0; iTTM: ", timevalue); + retval = PAPI_hl_region_end("fusedTTM"); if ( retval != PAPI_OK ) handle_error(1); + if (statfile.is_open()) { + statfile << "fused time: "; + statfile << timevalue.mean << std::endl; + } else { std::cout << " stat file is not open\n"; } + + r = rand(); + for (int64_t i=0; iTTM: ", timevalue); + retval = PAPI_hl_region_end("referenceTTM"); if ( retval != PAPI_OK ) handle_error(1); + if (statfile.is_open()) { + statfile << "reference time: "; + statfile << timevalue.mean << std::endl; + } else { std::cout << " stat file is not open\n"; } + + r = rand(); + for (int64_t i=0; iTTM: ", timevalue); + retval = PAPI_hl_region_end("ref2TTM"); if ( retval != PAPI_OK ) handle_error(1); + if (statfile.is_open()) { + statfile << 
"reference new time: "; + statfile << timevalue.mean << std::endl; + } else { std::cout << " stat file is not open\n"; } + + statfile << "\nschedule 1\n"; + + r = rand(); + for (int64_t i=0; ivals); + double* ref_vals = (double*) (ref.getTacoTensorT()->vals); + double* ref2_vals = (double*) (ref2.getTacoTensorT()->vals); + double* ref4_vals = (double*) (ref4.getTacoTensorT()->vals); + + // int* A2_pos = (double*) (ref.getTacoTensorT()->vals); + + // for (size_t q=0; q < B.getStorage().getValues().getSize(); q++) { + // if ( abs(A_vals[q] - ref_vals[q])/abs(ref_vals[q]) > ERROR_MARGIN) { + // std::cout << "error: results don't match i: " << q << ", avals: " << A_vals[q] << " " + // << "refvals: " << ref_vals[q] << std::endl; + // ASSERT_TRUE(false); + // } + // } + + // std::cout << "our fused vs taco original fused check\n"; + // for (size_t q=0; q < A.getStorage().getValues().getSize(); q++) { + // if ( abs(A_vals[q] - ref_vals[q])/abs(ref_vals[q]) > ERROR_MARGIN) { + // std::cout << "error: results don't match i: " << q << ", avals: " << A_vals[q] << " " + // << "refvals: " << ref_vals[q] << std::endl; + // ASSERT_TRUE(false); + // } + // } + // std::cout << "taco original fused vs TTM1, TTM2 check\n"; + // for (size_t q=0; q < A.getStorage().getValues().getSize(); q++) { + // if ( abs(ref_vals[q] - ref2_vals[q])/abs(ref2_vals[q]) > ERROR_MARGIN) { + // std::cout << "error: results don't match i: " << q << ", avals: " << ref_vals[q] << " " + // << "refvals: " << ref2_vals[q] << std::endl; + // ASSERT_TRUE(false); + // } + // } + // std::cout << "taco original fused vs GeMM, TTM1 check\n"; + // for (size_t q=0; q < A.getStorage().getValues().getSize(); q++) { + // if ( abs(ref_vals[q] - ref4_vals[q])/abs(ref4_vals[q]) > ERROR_MARGIN) { + // std::cout << "error: results don't match i: " << q << ", avals: " << ref_vals[q] << " " + // << "refvals: " << ref4_vals[q] << std::endl; + // ASSERT_TRUE(false); + // } + // } + + } // end of forloop + + if 
(statfile.is_open()) { + statfile.close(); + } + +} + + + + +TEST(scheduling_eval, spmmFusedWithSyntheticData) { + if (should_use_CUDA_codegen() || should_use_ISPC_codegen()) { + return; + } + + taco_set_num_threads(NUM_THREADS_TO_USE); + + std::default_random_engine gen(0); + std::uniform_real_distribution unif(0.0, 1.0); + + Format csr({dense, sparse}); + Format rm({dense, dense}); + int ldim = 32; + int kdim = 64; + + // uncomment this for reading the csr matrix saved in mtx file + std::cout << "reading B mat mtx\n"; + + int NUM_I = 128; + int NUM_J = 96; + int NUM_K = 64; + float SPARSITY = .3; + Tensor B("B", {NUM_I, NUM_J}, csr); + srand(75883); + for (int i = 0; i < NUM_I; i++) { + for (int j = 0; j < NUM_J; j++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + if (rand_float < SPARSITY) { + B.insert({i, j}, (double) ((int) (rand_float*3/SPARSITY))); + } + } + } + B.pack(); + + Tensor C("C", {NUM_J, NUM_K}, csr); + for (int j = 0; j < NUM_J; j++) { + for (int k = 0; k < NUM_K; k++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + if (rand_float < SPARSITY) { + B.insert({j, k}, (double) ((int) (rand_float*3/SPARSITY))); + } + } + } + C.pack(); + // write("/home/min/a/kadhitha/ispc-examples/data/suitesparse/synthetic/synthetic.mtx", B); + + std::cout << "B dim0: " << B.getDimension(0) << ", dim1: " << B.getDimension(1) << std::endl; + std::cout << "adding c mat\n"; + Tensor D({C.getDimension(1), ldim}, rm); + for (int i = 0; i < D.getDimension(0); ++i) { + for (int j = 0; j < D.getDimension(1); ++j) { + D.insert({i,j}, unif(gen)); + } + } + std::cout << "packing C mat\n"; + D.pack(); + + // Tensor E({B.getDimension(1), kdim}, rm); + // for (int i = 0; i < D.getDimension(0); ++i) { + // for (int j = 0; j < D.getDimension(1); ++j) { + // D.insert({i,j}, unif(gen)); + // } + // } + // std::cout << "packing D mat\n"; + // D.pack(); + + // Tensor F({B.getDimension(1), ldim}, rm); + // for (int i = 0; i < F.getDimension(0); ++i) { + // for (int j = 
0; j < F.getDimension(1); ++j) { + // F.insert({i,j}, unif(gen)); + // } + // } + // std::cout << "packing F mat\n"; + // F.pack(); + + Tensor A({B.getDimension(0), ldim}, rm); + Tensor ref({B.getDimension(0), ldim}, rm); + IndexVar i, j, k, l; + A(i,l)=B(i,j)*C(j,k)*D(k,l); + + // IndexStmt stmt = A.getAssignment().concretize(); + IndexStmt stmt = makeReductionNotation(A.getAssignment()); + stmt = makeConcreteNotation(stmt); + printToFile("fusedMMConcrete", stmt); + + stmt = reorderLoopsTopologically(stmt); + printToFile("fusedMMOrdered", stmt); + + stmt = loopFusionOverFission(stmt, A.getAssignment(), "b", 1); + printToFile("fusedMMFused", stmt); + + stmt = insertTemporaries(stmt); + printToFile("fusedMMWithTemps", stmt); + stmt = parallelizeOuterLoop(stmt); + printToFile("fusedMMFusedPar", stmt); + + A.compile(stmt); + // We can now call the functions taco generated to assemble the indices of the + // output matrix and then actually compute the MTTKRP. + A.assemble(); + + + ref(i,l)=B(i,j)*C(j,k)*D(k,l); + IndexStmt refStmt = makeReductionNotation(ref.getAssignment()); + refStmt = makeConcreteNotation(refStmt); + refStmt = insertTemporaries(refStmt); + refStmt = parallelizeOuterLoop(refStmt); + ref.compile(refStmt); + ref.assemble(); + + // Tensor ref1({B.getDimension(0), B.getDimension(1)}, csr); + // Tensor ref2({B.getDimension(0), ldim}, rm); + // ref1(i,j)=B(i,j)*C(i,k)*D(j,k); + // ref2(i,l)=ref1(i,j)*F(j,l); + + // IndexStmt ref1Stmt = makeReductionNotation(ref1.getAssignment()); + // ref1Stmt = makeConcreteNotation(ref1Stmt); + // ref1Stmt = insertTemporaries(ref1Stmt); + // ref1Stmt = parallelizeOuterLoop(ref1Stmt); + // ref1.compile(ref1Stmt); + // ref1.assemble(); + + // IndexStmt ref2Stmt = makeReductionNotation(ref2.getAssignment()); + // ref2Stmt = makeConcreteNotation(ref2Stmt); + // ref2Stmt = insertTemporaries(ref2Stmt); + // ref2Stmt = parallelizeOuterLoop(ref2Stmt); + // ref2.compile(ref2Stmt); + // ref2.assemble(); + + std::cout << "compute 
start\n"; + taco::util::TimeResults timevalue; + bool time = true; + TOOL_BENCHMARK_TIMER(ref.compute(), "\n\nReference Kernel: ", timevalue); + TOOL_BENCHMARK_TIMER(A.compute(), "\n\nFused Kernel: ", timevalue); + + // check results + for (int q = 0; q < A.getDimension(0); ++q) { + for (int w = 0; w < A.getDimension(1); ++w) { + if ( abs(A(q,w) - ref(q,w))/abs(ref(q,w)) > ERROR_MARGIN) { + std::cout << "error: results don't match A("<< q << "," << w << "): " + << A(q,w) << ", ref: " << ref(q,w) << std::endl; + ASSERT_TRUE(false); + } + } + } + // // ASSERT_TENSOR_EQ(A, ref); + // TOOL_BENCHMARK_TIMER(ref1.compute(), "\n\nSDDMM Kernel: ", timevalue); + // TOOL_BENCHMARK_TIMER(ref2.compute(), "\n\nSpMM Kernel: ", timevalue); + + // for (int q = 0; q < ref2.getDimension(0); ++q) { + // for (int w = 0; w < ref2.getDimension(1); ++w) { + // if ( abs(ref2(q,w) - ref(q,w))/abs(ref(q,w)) > ERROR_MARGIN) { + // std::cout << "error: results don't match A("<< q << "," << w << "): " + // << ref2(q,w) << ", ref: " << ref(q,w) << std::endl; + // ASSERT_TRUE(false); + // } + // } + // } + +} + + +TEST(scheduling_eval, spmmFused) { + if (should_use_CUDA_codegen() || should_use_ISPC_codegen()) { + return; + } + + int retval, EventSet = PAPI_NULL; + retval = PAPI_hl_region_begin("dummy"); + if ( retval != PAPI_OK ) handle_error(1); + + /* Do some computation */ + + retval = PAPI_hl_region_end("dummy"); + if ( retval != PAPI_OK ) handle_error(1); + + taco_set_num_threads(NUM_THREADS_TO_USE); + + ofstream statfile; + statfile.open( + "/home/min/a/kadhitha/workspace/my_taco/taco/test/stats/spmm-spmm.txt", std::ios::app); + if (statfile.is_open()) { + statfile << "\nspmm-spmm execution\n"; + statfile << "\n-----------------------------------------\n"; + } + + std::default_random_engine gen(0); + std::uniform_real_distribution unif(0.0, 1.0); + + Format csr({dense, sparse}); + Format rm({dense, dense}); + int kdim = 128; + int ldim = 64; + + // vector filenums = 
{2,3,4,5,6,7,8,9,10,12,15}; + vector filenums = {3}; + + for (auto filenum : filenums) { + + + statfile << "filenum: " << filenum << std::endl; + statfile << "---------------------------------\n"; + // int filenum = 7; + + std::vector matfiles = { + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/synthetic/synthetic.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx", // 2 + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rma10/rma10.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/cant/cant.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/consph/consph.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/shipsec1/shipsec1.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/scircuit/scircuit.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx", // 10 + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/wtk/pwtk.mtx", + "/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx", // 12 + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/wiki-Talk/wiki-Talk.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/com-Orkut/com-Orkut.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx", // 15 + "/home/min/a/kadhitha/workspace/my_taco/FusedMM/dataset/harvard.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/twitter7/twitter7.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k.mtx", + }; + std::vector matfilesrw = { + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/synthetic.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/cage3.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/bcsstk17.mtx", + 
"/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/pdb1HYS.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/rma10.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/cant.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/consph.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/cop20k_A.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/shipsec1.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/scircuit.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/wtk/pwtk.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/webbase-1M.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/wiki-Talk.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/com-Orkut.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/circuit5M.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/harvard.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/twitter7.mtx" + }; + + std::string matfile = matfiles[filenum]; + std::cout << "reading B mat mtx\n"; + Tensor B = read(matfile, csr); + B.pack(); + // write(matfilesrw[filenum], B); + + if (statfile.is_open()) { + statfile << matfile << std::endl; + } + + std::cout << "B dim0: " << B.getDimension(0) << ", dim1: " << B.getDimension(1) << std::endl; + std::cout << "adding c mat\n"; + // Tensor C = read(matfiles2[filenum], csr, true); + // std::cout << "packing C mat\n"; + + std::cout << "B dim0: " << B.getDimension(0) << ", dim1: " << B.getDimension(1) << std::endl; + std::cout << "adding c mat\n"; + Tensor C("C", {B.getDimension(1), kdim}, rm); + for (int i = 0; i < C.getDimension(0); ++i) { + for (int j = 0; j < C.getDimension(1); ++j) { + C.insert({i,j}, unif(gen)); + } + } + std::cout << "packing C mat\n"; + C.pack(); + + Tensor D({C.getDimension(1), ldim}, rm); + for (int i = 0; i < 
D.getDimension(0); ++i) { + for (int j = 0; j < D.getDimension(1); ++j) { + D.insert({i,j}, unif(gen)); + } + } + std::cout << "packing D mat\n"; + D.pack(); + + // Tensor F({B.getDimension(1), ldim}, rm); + // for (int i = 0; i < F.getDimension(0); ++i) { + // for (int j = 0; j < F.getDimension(1); ++j) { + // F.insert({i,j}, unif(gen)); + // } + // } + // std::cout << "packing F mat\n"; + // F.pack(); + + Tensor A({B.getDimension(0), ldim}, rm); + Tensor ref({B.getDimension(0), ldim}, rm); + Tensor refn({B.getDimension(0), ldim}, rm); + IndexVar i, j, k, l; + IndexVar i0, i1, j0, j1, k0, k1, l0, l1; + + A(i,l)=B(i,j)*C(j,k)*D(k,l); + if (statfile.is_open()) { + statfile + << "ref(i,l)=B(i,j)*C(i,k)*D(j,k);" << std::endl + << "B1_dimension: " << B.getDimension(0) << ", B2_dimension: " << B.getDimension(1) << ", vals: " << B.getStorage().getValues().getSize() << std::endl + << "C1_dimension: " << C.getDimension(0) << ", C2_dimension: " << C.getDimension(1) << ", vals: " << C.getStorage().getValues().getSize() << std::endl + << "D1_dimension: " << D.getDimension(0) << ", D2_dimension: " << D.getDimension(1) << ", vals: " << D.getStorage().getValues().getSize() << std::endl + // << "E1_dimension: " << F.getDimension(0) << ", E2_dimension: " << F.getDimension(1) << ", vals: " << F.getStorage().getValues().getSize() << std::endl + << std::endl; + } + + // IndexStmt stmt = A.getAssignment().concretize(); + IndexStmt stmt = makeReductionNotation(A.getAssignment()); + stmt = makeConcreteNotation(stmt); + stmt = reorderLoopsTopologically(stmt); + stmt = loopFusionOverFission(stmt, A.getAssignment(), "b", 1); + stmt = stmt.split(i, i0, i1, 16); + stmt = insertTemporaries(stmt); + stmt = parallelizeOuterLoop(stmt); + + A.compile(stmt); + A.assemble(); + + + ref(i,l)=B(i,j)*C(j,k)*D(k,l); + refn(i,l)=B(i,j)*C(j,k)*D(k,l); + // IndexStmt refStmt = ref.getAssignment().concretize(); + + // ref1Stmt = ref1Stmt.split(i, i0, i1, 16); + // .pos(j, jpos, B(i,j)); + // .split(k, k0, 
k1, 8); + // .reorder({i0, i1, jpos0, k, jpos1}); + // .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces) + // .parallelize(jpos1, ParallelUnit::CPUVector, OutputRaceStrategy::ParallelReduction); + IndexStmt refStmt = makeReductionNotation(ref.getAssignment()); + refStmt = makeConcreteNotation(refStmt); + refStmt = refStmt + .split(i, i0, i1, 16) + .split(k, k0, k1, 32) + .split(l, l0, l1, 32) + .reorder({i0, i1, j, k0, l0, k1, l1}); + refStmt = insertTemporaries(refStmt); + refStmt = parallelizeOuterLoop(refStmt); + ref.compile(refStmt); + ref.assemble(); + + IndexStmt refnStmt = makeReductionNotation(refn.getAssignment()); + refnStmt = makeConcreteNotation(refnStmt); + refnStmt = refnStmt + .split(i, i0, i1, 16); + refnStmt = insertTemporaries(refnStmt); + refnStmt = parallelizeOuterLoop(refnStmt); + refn.compile(refnStmt); + refn.assemble(); + + // SpMM , GEMM + + Tensor ref1({B.getDimension(0), kdim}, rm); + Tensor ref2({B.getDimension(0), ldim}, rm); + Tensor ref2_2({B.getDimension(0), ldim}, rm); + + ref1(i,k)=B(i,j)*C(j,k); + ref2(i,l)=ref1(i,k)*D(k,l); + ref2_2(i,l)=ref1(i,k)*D(k,l); + + IndexStmt ref1Stmt = makeReductionNotation(ref1.getAssignment()); + ref1Stmt = makeConcreteNotation(ref1Stmt); + ref1Stmt = insertTemporaries(ref1Stmt); + ref1Stmt = parallelizeOuterLoop(ref1Stmt); + ref1.compile(ref1Stmt); + ref1.assemble(); + + IndexStmt ref2Stmt = makeReductionNotation(ref2.getAssignment()); + ref2Stmt = makeConcreteNotation(ref2Stmt); + ref2Stmt = insertTemporaries(ref2Stmt); + ref2Stmt = ref2Stmt.split(i, i0, i1, 16); + ref2Stmt = parallelizeOuterLoop(ref2Stmt); + ref2.compile(ref2Stmt); + ref2.assemble(); + + IndexStmt ref2Stmt2 = makeReductionNotation(ref2_2.getAssignment()); + ref2Stmt2 = makeConcreteNotation(ref2Stmt2); + ref2Stmt2 = ref2Stmt2 + .split(i, i0, i1, 32) + .split(k,k0,k1, 32) + .split(l, l0, l1, 32) + .reorder({i0, k0, l0, i1, k1, l1}) + .parallelize(j0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces); + 
ref2Stmt2 = insertTemporaries(ref2Stmt2); + // ref2Stmt2 = parallelizeOuterLoop(ref2Stmt2); + ref2_2.compile(ref2Stmt2); + ref2_2.assemble(); + + + // -------------- GeMM and SpMM + + Tensor ref3({C.getDimension(0), ldim}, rm); + Tensor ref4({C.getDimension(0), ldim}, rm); + ref3(j,l)=C(j,k)*D(k,l); // GEMM + ref4(i,l) = B(i,j)*ref3(j,l); // SpMM + + IndexStmt ref3Stmt = ref3.getAssignment().concretize(); + ref3Stmt = ref3Stmt + .split(j, j0, j1, 32) // changed to 32 + .split(k, k0, k1, 32) + .split(l, l0, l1, 32) + .reorder({j0, k0, l0, j1, k1, l1}) + .parallelize(j0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces); + ref2Stmt2 = insertTemporaries(ref2Stmt2); + ref3.compile(ref3Stmt); + ref3.assemble(); + + IndexStmt ref4Stmt = makeReductionNotation(ref4.getAssignment()); // SpMM operation + ref4Stmt = makeConcreteNotation(ref4Stmt); + ref4Stmt = ref4Stmt.split(i, i0, i1, 16); + ref4Stmt = insertTemporaries(ref4Stmt); + ref4Stmt = parallelizeOuterLoop(ref4Stmt); + ref4.compile(ref4Stmt); + ref4.assemble(); + + + std::cout << "compute start\n"; + taco::util::TimeResults timevalue; + bool time = true; + + statfile << "\n--------- 1st pattern computation TTM, GEMM\n"; + + retval = PAPI_hl_region_begin("spmm"); + if ( retval != PAPI_OK ) handle_error(1); + TOOL_BENCHMARK_TIMER(ref1.compute(statfile), "\n\nSpMM Kernel: ", timevalue); + retval = PAPI_hl_region_end("spmm"); + if ( retval != PAPI_OK ) handle_error(1); + if (statfile.is_open()) { + statfile << "SpMM time: "; + statfile << timevalue.mean << std::endl; + } else { std::cout << " stat file is not open\n"; } + + std::string sofile_spmm_template = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/csr_dense_spmm.so"; + retval = PAPI_hl_region_begin("spmmtemplate"); + if ( retval != PAPI_OK ) handle_error(1); + TOOL_BENCHMARK_TIMER(ref1.compute(statfile, sofile_spmm_template), "\n\nSpMM template Kernel: ", timevalue); + retval = PAPI_hl_region_end("spmmtemplate"); + if ( retval != PAPI_OK 
) handle_error(1); + if (statfile.is_open()) { + statfile << "SpMM template time: "; + statfile << timevalue.mean << std::endl; + } else { std::cout << " stat file is not open\n"; } + + retval = PAPI_hl_region_begin("gemm"); + if ( retval != PAPI_OK ) handle_error(1); + TOOL_BENCHMARK_TIMER(ref2.compute(statfile), "\n\nGeMM Kernel: ", timevalue); + retval = PAPI_hl_region_end("gemm"); + if ( retval != PAPI_OK ) handle_error(1); + if (statfile.is_open()) { + statfile << "GeMM time: "; + statfile << timevalue.mean << std::endl; + } else { std::cout << " stat file is not open\n"; } + + retval = PAPI_hl_region_begin("gemmtemplate"); + if ( retval != PAPI_OK ) handle_error(1); + TOOL_BENCHMARK_TIMER(ref2_2.compute(statfile), "\n\nref GeMM template Kernel: ", timevalue); + retval = PAPI_hl_region_end("gemmtemplate"); + if ( retval != PAPI_OK ) handle_error(1); + if (statfile.is_open()) { + statfile << "ref 2 GeMM template time: "; + statfile << timevalue.mean << std::endl; + } else { std::cout << " stat file is not open\n"; } + + // std::string sofile_gemm_template = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/spmm_gemm/spmm_template.so"; + statfile << "\n--------- 2nd pattern computation GEMM, SpMM\n"; + retval = PAPI_hl_region_begin("gemmtemplate2"); + if ( retval != PAPI_OK ) handle_error(1); + TOOL_BENCHMARK_TIMER(ref3.compute(statfile), "\n\nGeMM template ref3 Kernel: ", timevalue); + retval = PAPI_hl_region_end("gemmtemplate2"); + if ( retval != PAPI_OK ) handle_error(1); + if (statfile.is_open()) { + statfile << "ref3 GeMM template time: "; + statfile << timevalue.mean << std::endl; + } else { std::cout << " stat file is not open\n"; } + + retval = PAPI_hl_region_begin("spmm2"); + if ( retval != PAPI_OK ) handle_error(1); + TOOL_BENCHMARK_TIMER(ref4.compute(statfile, sofile_spmm_template), "\n\nSpMM template Kernel ref4: ", timevalue); + retval = PAPI_hl_region_end("spmm2"); + if ( retval != PAPI_OK ) handle_error(1); + if (statfile.is_open()) { + 
statfile << "SpMM template time ref4: "; + statfile << timevalue.mean << std::endl; + } else { std::cout << " stat file is not open\n"; } + + + statfile << "\n-------- reference pattern computation\n"; + + retval = PAPI_hl_region_begin("ref"); + if ( retval != PAPI_OK ) handle_error(1); + TOOL_BENCHMARK_TIMER(ref.compute(statfile), "\n\nReference Kernel: ", timevalue); + retval = PAPI_hl_region_end("ref"); + if ( retval != PAPI_OK ) handle_error(1); + if (statfile.is_open()) { + statfile << "taco reference time: "; + statfile << timevalue << std::endl; + } else { std::cout << " stat file is not open\n"; } + + retval = PAPI_hl_region_begin("refnew"); + if ( retval != PAPI_OK ) handle_error(1); + TOOL_BENCHMARK_TIMER(refn.compute(statfile), "\n\nReference new Kernel: ", timevalue); + retval = PAPI_hl_region_end("refnew"); + if ( retval != PAPI_OK ) handle_error(1); + if (statfile.is_open()) { + statfile << "taco reference new time: "; + statfile << timevalue << std::endl; + } else { std::cout << " stat file is not open\n"; } + + + retval = PAPI_hl_region_begin("sparselnr"); + if ( retval != PAPI_OK ) handle_error(1); + TOOL_BENCHMARK_TIMER(A.compute(statfile), "\n\nFused Kernel: ", timevalue); + retval = PAPI_hl_region_end("sparselnr"); + if ( retval != PAPI_OK ) handle_error(1); + if (statfile.is_open()) { + statfile << "fused time: "; + statfile << timevalue.mean << std::endl; + } else { std::cout << " stat file is not open\n"; } + + + double* A_vals = (double*) (A.getTacoTensorT()->vals); + double* ref_vals = (double*) (ref.getTacoTensorT()->vals); + double* ref2_vals = (double*) (ref2.getTacoTensorT()->vals); + double* ref4_vals = (double*) (ref2.getTacoTensorT()->vals); + + // int* A2_pos = (double*) (ref.getTacoTensorT()->vals); + + // for (size_t q=0; q < B.getStorage().getValues().getSize(); q++) { + // if ( abs(A_vals[q] - ref_vals[q])/abs(ref_vals[q]) > ERROR_MARGIN) { + // std::cout << "error: results don't match i: " << q << ", avals: " << A_vals[q] << " 
" + // << "refvals: " << ref_vals[q] << std::endl; + // ASSERT_TRUE(false); + // } + // } + + for (int q=0; q < A.getDimension(0)* A.getDimension(1); q++) { + if ( abs(A_vals[q] - ref_vals[q])/abs(ref_vals[q]) > ERROR_MARGIN) { + std::cout << "error: results don't match i: " << q << ", avals: " << A_vals[q] << " " + << "refvals: " << ref_vals[q] << std::endl; + ASSERT_TRUE(false); + } + } + for (int q=0; q < A.getDimension(0)* A.getDimension(1); q++) { + if ( abs(A_vals[q] - ref2_vals[q])/abs(ref2_vals[q]) > ERROR_MARGIN) { + std::cout << "error: results don't match i: " << q << ", avals: " << A_vals[q] << " " + << "refvals: " << ref2_vals[q] << std::endl; + ASSERT_TRUE(false); + } + } + for (int q=0; q < A.getDimension(0)* A.getDimension(1); q++) { + if ( abs(A_vals[q] - ref4_vals[q])/abs(ref4_vals[q]) > ERROR_MARGIN) { + std::cout << "error: results don't match i: " << q << ", avals: " << A_vals[q] << " " + << "refvals: " << ref4_vals[q] << std::endl; + ASSERT_TRUE(false); + } + } + + } // end of file num for loop + + if (statfile.is_open()) { + statfile.close(); + } + + + // unsigned int native = 0x0; + + // retval = PAPI_library_init(PAPI_VER_CURRENT); + + // if (retval != PAPI_VER_CURRENT) { + // printf("PAPI library init error!\n"); + // exit(1); + // } else { + // printf("PAPI library init success\n"); + // } + + // if (PAPI_create_eventset(&EventSet) != PAPI_OK) { + // handle_error(1); + // } + + // /* Add the native event */ + // native = () + + retval = PAPI_hl_region_begin("computation1"); + if ( retval != PAPI_OK ) + handle_error(1); + + /* Do some computation */ + + retval = PAPI_hl_region_end("computation1"); + if ( retval != PAPI_OK ) + handle_error(1); + + retval = PAPI_hl_region_begin("computation2"); + if ( retval != PAPI_OK ) + handle_error(1); + + /* Do some computation */ + + retval = PAPI_hl_region_end("computation2"); + if ( retval != PAPI_OK ) + handle_error(1); +} + + + + + + +TEST(scheduling_eval, sddmmspmmFused) { + if 
(should_use_CUDA_codegen() || should_use_ISPC_codegen()) { + return; + } + + taco_set_num_threads(NUM_THREADS_TO_USE); + + ofstream statfile; + statfile.open( + "/home/min/a/kadhitha/workspace/my_taco/taco/test/stats/sddmm-spmm-gemm.txt", std::ios::app); + if (statfile.is_open()) { + statfile << "\nsddmm-spmm-gemm execution\n"; + statfile << "\n-----------------------------------------\n"; + } + + std::default_random_engine gen(0); + std::uniform_real_distribution unif(0.0, 1.0); + + Format csr({dense, sparse}); + Format rm({dense, dense}); + + int kdim = 64; + int ldim = 64; + int mdim = 64; + + vector filenums{2, 3,4,5,6,7,8,9,10,12,15}; + + for (auto filenum : filenums) { + + + std::vector matfiles = { + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/synthetic/synthetic.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rma10/rma10.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/cant/cant.mtx", // 5 + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/consph/consph.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/shipsec1/shipsec1.mtx", // 8 + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/scircuit/scircuit.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx", // 10 + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/wtk/pwtk.mtx", + "/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx", // 12 + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/wiki-Talk/wiki-Talk.mtx", // 13 + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/com-Orkut/com-Orkut.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx", // 15 + 
"/home/min/a/kadhitha/workspace/my_taco/FusedMM/dataset/harvard.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/twitter7/twitter7.mtx" + }; + std::vector matfilesrw = { + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/synthetic.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/cage3.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/bcsstk17.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/pdb1HYS.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/rma10.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/cant.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/consph.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/cop20k_A.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/shipsec1.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/scircuit.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/wtk/pwtk.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/webbase-1M.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/wiki-Talk.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/com-Orkut.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/circuit5M.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/harvard.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/twitter7.mtx" + }; + + std::string matfile = matfiles[filenum]; + std::cout << "reading B mat mtx\n"; + Tensor B = read(matfile, csr, true); + B.setName("B"); + B.pack(); + // write(matfilesrw[filenum], B); + + if (statfile.is_open()) { + statfile << matfile << std::endl; + } + + std::cout << "B dim0: " << B.getDimension(0) << ", dim1: " << B.getDimension(1) << std::endl; + std::cout << "adding c mat\n"; + Tensor C({B.getDimension(0), kdim}, rm); + for (int i = 0; i < C.getDimension(0); ++i) { + for (int 
j = 0; j < C.getDimension(1); ++j) { + C.insert({i,j}, unif(gen)); + } + } + std::cout << "packing C mat\n"; + C.pack(); + + Tensor D({B.getDimension(1), kdim}, rm); + for (int i = 0; i < D.getDimension(0); ++i) { + for (int j = 0; j < D.getDimension(1); ++j) { + D.insert({i,j}, unif(gen)); + } + } + std::cout << "packing D mat\n"; + D.pack(); + + Tensor F({B.getDimension(1), ldim}, rm); + for (int i = 0; i < F.getDimension(0); ++i) { + for (int j = 0; j < F.getDimension(1); ++j) { + F.insert({i,j}, unif(gen)); + } + } + std::cout << "packing F mat\n"; + F.pack(); + + Tensor G({ldim, mdim}, rm); + for (int i = 0; i < G.getDimension(0); ++i) { + for (int j = 0; j < G.getDimension(1); ++j) { + G.insert({i,j}, unif(gen)); + } + } + std::cout << "packing F mat\n"; + G.pack(); + + Tensor A({B.getDimension(0), mdim}, rm); + Tensor ref({B.getDimension(0), mdim}, rm); + IndexVar i, j, k, l, m; + IndexVar i0("i0"), i1("i1"), jpos("jpos"), jpos0("jpos0"), jpos1("jpos1"), k0("k0"), k1("k1"); + IndexVar l0("l0"), l1("l1"), m0("m0"), m1("m1"); + + A(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); + + if (statfile.is_open()) { + statfile + << "ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m);" << std::endl + << "B1_dimension: " << B.getDimension(0) << ", B2_dimension: " << B.getDimension(1) << ", vals: " << B.getStorage().getValues().getSize() << std::endl + << "C1_dimension: " << C.getDimension(0) << ", C2_dimension: " << C.getDimension(1) << ", vals: " << C.getStorage().getValues().getSize() << std::endl + << "D1_dimension: " << D.getDimension(0) << ", D2_dimension: " << D.getDimension(1) << ", vals: " << D.getStorage().getValues().getSize() << std::endl + << "E1_dimension: " << F.getDimension(0) << ", E2_dimension: " << F.getDimension(1) << ", vals: " << F.getStorage().getValues().getSize() << std::endl + << "G1_dimension: " << F.getDimension(0) << ", G2_dimension: " << G.getDimension(1) << ", vals: " << G.getStorage().getValues().getSize() << std::endl + << std::endl; + } + + // 
IndexStmt stmt = A.getAssignment().concretize(); + IndexStmt stmt = makeReductionNotation(A.getAssignment()); + stmt = makeConcreteNotation(stmt); + stmt = reorderLoopsTopologically(stmt); + stmt = loopFusionOverFission(stmt, A.getAssignment(), "b", 2); + stmt = stmt.split(i, i0, i1, 16); + + stmt = insertTemporaries(stmt); + stmt = parallelizeOuterLoop(stmt); + printToFile("sddmmSpMMGeMM", stmt); + + A.compile(stmt); + A.assemble(); + + + ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); + IndexStmt refStmt = makeReductionNotation(ref.getAssignment()); + refStmt = makeConcreteNotation(refStmt); + refStmt = refStmt.split(i, i0, i1, 16); + refStmt = insertTemporaries(refStmt); + refStmt = parallelizeOuterLoop(refStmt); + ref.compile(refStmt); + ref.assemble(); + + Tensor ref1({B.getDimension(0), B.getDimension(1)}, csr); + Tensor ref2({B.getDimension(0), ldim}, rm); + Tensor ref3({B.getDimension(0), mdim}, rm); + ref1(i,j)=B(i,j)*C(i,k)*D(j,k); + ref2(i,l)=ref1(i,j)*F(j,l); + ref3(i,m)=ref2(i,l)*G(l,m); + + IndexStmt ref1Stmt = ref1.getAssignment().concretize(); + + ref1Stmt = ref1Stmt.split(i, i0, i1, 16); + // // .pos(j, jpos, B(i,j)); + // // .split(k, k0, k1, 8); + // // .reorder({i0, i1, jpos0, k, jpos1}); + // // .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces) + // // .parallelize(jpos1, ParallelUnit::CPUVector, OutputRaceStrategy::ParallelReduction); + // // ref1Stmt.split(i, ); + // // stmt = scheduleSDDMMCPU_forfuse(ref1Stmt, B); + // IndexStmt ref1Stmt = makeReductionNotation(ref1.getAssignment()); + // ref1Stmt = makeConcreteNotation(ref1Stmt); + ref1Stmt = insertTemporaries(ref1Stmt); + ref1Stmt = parallelizeOuterLoop(ref1Stmt); + ref1.compile(ref1Stmt); + ref1.assemble(); + + IndexStmt ref2Stmt = makeReductionNotation(ref2.getAssignment()); + ref2Stmt = makeConcreteNotation(ref2Stmt); + ref2Stmt = insertTemporaries(ref2Stmt); + ref2Stmt = parallelizeOuterLoop(ref2Stmt); + ref2.compile(ref2Stmt); + ref2.assemble(); + + // 
ref3(i,m)=ref2(i,l)*G(l,m); + IndexStmt ref3Stmt = makeReductionNotation(ref3.getAssignment()); + ref3Stmt = makeConcreteNotation(ref3Stmt); + ref3Stmt = ref3Stmt + .split(i, i0, i1, 32) + .split(l, l0, l1, 32) + .split(m, m0, m1, 32) + .reorder({i0, l0, m0, i1, l1, m1}); + ref3Stmt = insertTemporaries(ref3Stmt); + ref3Stmt = parallelizeOuterLoop(ref3Stmt); + ref3.compile(ref3Stmt); + ref3.assemble(); + + std::cout << "compute start\n"; + taco::util::TimeResults timevalue; + bool time = true; + + // std::string sofile_fused = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/fused_kernel.so"; + TOOL_BENCHMARK_TIMER(A.compute(statfile), "\n\nFused Kernel: ", timevalue); + if (statfile.is_open()) { + statfile << "fused time: "; + statfile << timevalue.mean << std::endl; + } else { std::cout << " stat file is not open\n"; } + + // std::string sofile_sddmm = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/csr_dense_spmm.so"; + std::string sofile_sddmm = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/csr_dense_dense_sddmm.so"; + TOOL_BENCHMARK_TIMER(ref1.compute(statfile, sofile_sddmm), "\n\nSDDMM Kernel: ", timevalue); + if (statfile.is_open()) { + statfile << "sddmm time: "; + statfile << timevalue.mean << std::endl; + } else { std::cout << " stat file is not open\n"; } + + std::string sofile_sddmm_ryan = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/sddmm_ryan.so"; + TOOL_BENCHMARK_TIMER(ref1.compute(statfile, sofile_sddmm_ryan), "\n\nSDDMM ryan Kernel: ", timevalue); + if (statfile.is_open()) { + statfile << "sddmm ryan time: "; + statfile << timevalue.mean << std::endl; + } else { std::cout << " stat file is not open\n"; } + + std::string sofile_spmm = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/csr_dense_spmm.so"; + TOOL_BENCHMARK_TIMER(ref2.compute(statfile, sofile_spmm), "\n\nSpMM ryan Kernel: ", timevalue); + if (statfile.is_open()) { + statfile << 
"spmm ryan time: "; + statfile << timevalue.mean << std::endl; + } else { std::cout << " stat file is not open\n"; } + + // std::string sofile_spmm = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/csr_dense_spmm.so"; + TOOL_BENCHMARK_TIMER(ref3.compute(statfile), "\n\nGeMM Kernel: ", timevalue); + if (statfile.is_open()) { + statfile << "gemm time: "; + statfile << timevalue.mean << std::endl; + } else { std::cout << " stat file is not open\n"; } + + // std::string sofile_original = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/taco_original.so"; + TOOL_BENCHMARK_TIMER(ref.compute(statfile), "\n\nReference Kernel: ", timevalue); + if (statfile.is_open()) { + statfile << "taco reference time: "; + statfile << timevalue << std::endl; + } else { std::cout << " stat file is not open\n"; } + + double* A_vals = (double*) (A.getTacoTensorT()->vals); + double* ref_vals = (double*) (ref.getTacoTensorT()->vals); + double* ref3_vals = (double*) (ref3.getTacoTensorT()->vals); + + // int* A2_pos = (double*) (ref.getTacoTensorT()->vals); + + for (int q=0; q < A.getDimension(0)* A.getDimension(1); q++) { + if ( abs(A_vals[q] - ref_vals[q])/abs(ref_vals[q]) > ERROR_MARGIN) { + std::cout << "error: results don't match i: " << q << ", avals: " << A_vals[q] << " " + << "refvals: " << ref_vals[q] << std::endl; + ASSERT_TRUE(false); + } + } + + for (int q=0; q < A.getDimension(0)* A.getDimension(1); q++) { + if ( abs(ref3_vals[q] - ref_vals[q])/abs(ref_vals[q]) > ERROR_MARGIN) { + std::cout << "error: results don't match i: " << q << ", avals: " << ref3_vals[q] << " " + << "refvals: " << ref_vals[q] << std::endl; + ASSERT_TRUE(false); + } + } + + + + } + + // int filenum = 3; + + + // for (size_t q=0; q < A.getDimension(0)* A.getDimension(1); q++) { + // if ( abs(A_vals[q] - ref3_vals[q])/abs(ref3_vals[q]) > ERROR_MARGIN) { + // std::cout << "error: results don't match i: " << q << ", avals: " << A_vals[q] << " " + // << "refvals: " << 
ref3_vals[q] << std::endl; + // ASSERT_TRUE(false); + // } + // } + // for (int q= 0; q< A_vals + // for (int q = 0; q < A.getDimension(0); ++q) { + // for (int w = 0; w < A.getDimension(1); ++w) { + // if ( abs(A(q,w) - ref(q,w))/abs(ref(q,w)) > ERROR_MARGIN) { + // std::cout << "error: results don't match A("<< q << "," << w << "): " + // << A(q,w) << ", ref: " << ref(q,w) << std::endl; + // ASSERT_TRUE(false); + // } + // } + // } + // ASSERT_TENSOR_EQ(A, ref); + + if (statfile.is_open()) { + statfile.close(); + } + +} \ No newline at end of file diff --git a/test/tests-scheduling-ispc-eval.cpp b/test/tests-scheduling-ispc-eval.cpp new file mode 100644 index 000000000..139597f9c --- /dev/null +++ b/test/tests-scheduling-ispc-eval.cpp @@ -0,0 +1,2 @@ + + diff --git a/test/tests-transformation.cpp b/test/tests-transformation.cpp index abfec3d45..9a472906f 100644 --- a/test/tests-transformation.cpp +++ b/test/tests-transformation.cpp @@ -255,6 +255,8 @@ INSTANTIATE_TEST_CASE_P(parallelize, apply, struct reorderLoopsTopologically : public TestWithParam {}; + +// TEST_P(reorderLoopsTopologically, test) { IndexStmt actual = taco::reorderLoopsTopologically(GetParam().actual); ASSERT_NOTATION_EQ(GetParam().expected, actual); diff --git a/test/util.h b/test/util.h new file mode 100644 index 000000000..f96087ba1 --- /dev/null +++ b/test/util.h @@ -0,0 +1,113 @@ +#ifndef __SCHEDULE_UTIL_HH__ +#define __SCHEDULE_UTIL_HH__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "taco/cuda.h" +#include "test.h" +#include "test_tensors.h" +#include "taco/tensor.h" +#include "taco/index_notation/index_notation.h" +#include "taco/index_notation/transformations.h" +#include "codegen/codegen.h" +#include "taco/lower/lower.h" +#include "taco/util/timers.h" + +using namespace taco; + +#define ERROR_MARGIN (1.0e-2) + +#define TOOL_BENCHMARK_TIMER(CODE,NAME,TIMER) { \ + if (time) { \ + taco::util::Timer timer; \ + timer.start(); \ + CODE; \ + 
timer.stop(); \ + taco::util::TimeResults result = timer.getResult(); \ + cout << NAME << " " << result << " ms" << endl; \ + TIMER=result; \ + } \ + else { \ + CODE; \ + } \ +} + +#define TOOL_BENCHMARK_TIMER2(CODE,NAME,TIMER) { \ + if (time) { \ + taco::util::Timer timer; \ + timer.start(); \ + CODE; \ + timer.stop(); \ + taco::util::TimeResults result = timer.getResult(); \ + if (statfile.is_open()) { \ + statfile << NAME << " " << result << " ms" << endl; \ + } else { \ + cout << NAME << " " << result << " ms" << endl; \ + } \ + TIMER=result; \ + } \ + else { \ + CODE; \ + } \ +} + +static void printToCout(IndexStmt stmt); +static void printToFile(string filename, IndexStmt stmt); +static void printToFile(string filename, string additional_filename, IndexStmt stmt); + + +static void printToCout(IndexStmt stmt) { + std::shared_ptr codegen = ir::CodeGen::init_default(cout, ir::CodeGen::ImplementationGen); + ir::Stmt compute = lower(stmt, "compute", false, true); + codegen->compile(compute, true); +} + +void printToFile(string filename, IndexStmt stmt) { + stringstream source; + + string file_path = "eval_generated/"; + mkdir(file_path.c_str(), 0777); + + std::shared_ptr codegen = ir::CodeGen::init_default(source, ir::CodeGen::ImplementationGen); + ir::Stmt compute = lower(stmt, "compute", false, true); + codegen->compile(compute, true); + + ofstream source_file; + string file_ending = should_use_CUDA_codegen() ? 
".cu" : ".c"; + source_file.open(file_path + filename + file_ending); + source_file << source.str(); + source_file.close(); +} + +void printToFile(string filename, string additional_filename, IndexStmt stmt) { + stringstream source1; + stringstream source2; + + string file_path = "eval_generated/"; + mkdir(file_path.c_str(), 0777); + + std::shared_ptr codegen = ir::CodeGen::init_default(source1, source2, ir::CodeGen::ImplementationGen); + ir::Stmt compute = lower(stmt, "compute", false, true); + codegen->compile(compute, true); + + ofstream source_file; + string file_ending = should_use_CUDA_codegen() ? ".cu" : ".c"; + source_file.open(file_path+filename+file_ending); + source_file << source1.str(); + source_file.close(); + + ofstream additional_source_file; + string additional_file_ending = ".ispc"; + additional_source_file.open(file_path+additional_filename+additional_file_ending); + additional_source_file << source2.str(); + additional_source_file.close(); + +} + +#endif // __SCHEDULE_UTIL_HH__ \ No newline at end of file diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt index 922f7e52e..41699d3fd 100644 --- a/tools/CMakeLists.txt +++ b/tools/CMakeLists.txt @@ -4,6 +4,7 @@ foreach(TOOL_SOURCE ${TOOL_SOURCES}) get_filename_component(TOOL ${TOOL_SOURCE} NAME_WE) add_executable("${TOOL}-tool" ${TOOL_SOURCE}) target_link_libraries("${TOOL}-tool" taco) + target_link_libraries("${TOOL}-tool" papi) target_include_directories("${TOOL}-tool" PRIVATE "${CMAKE_BINARY_DIR}/include") SET_TARGET_PROPERTIES("${TOOL}-tool" PROPERTIES OUTPUT_NAME ${TOOL}) install(TARGETS "${TOOL}-tool" DESTINATION bin) diff --git a/tools/taco.cpp b/tools/taco.cpp index bf7e7c9dc..7384874ec 100644 --- a/tools/taco.cpp +++ b/tools/taco.cpp @@ -9,6 +9,7 @@ #include "taco.h" #include "taco/error.h" +#include "taco/index_notation/index_notation.h" #include "taco/parser/lexer.h" #include "taco/parser/parser.h" #include "taco/parser/schedule_parser.h" @@ -313,7 +314,9 @@ static void 
printCommandLine(ostream& os, int argc, char* argv[]) { } } -static int setSchedulingCommands(vector> scheduleCommands, parser::Parser& parser, IndexStmt& stmt) { +static int setSchedulingCommands(vector> scheduleCommands, + parser::Parser& parser, IndexStmt& stmt, Assignment assignment) { + std::cout << "setting scheduling commands\n"; auto findVar = [&stmt](string name) { ProvenanceGraph graph(stmt); @@ -364,6 +367,16 @@ static int setSchedulingCommands(vector> scheduleCommands, parser IndexVar fused(f); stmt = stmt.fuse(findVar(i), findVar(j), fused); + } else if (command == "loopfuse") { + taco_uassert(scheduleCommand.size() == 2) + << "'loopfuse' scheduling directive takes 2 parameters: fuse(b, 2)"; + std::string side = scheduleCommand[0]; + taco_uassert(side == "b" || side == "f") + << "first parameter must be either 'f' or 'b'"; + + int iters = std::stoi(scheduleCommand[1]); + + stmt = loopFusionOverFission(stmt, assignment, side, iters); } else if (command == "split") { taco_uassert(scheduleCommand.size() == 4) << "'split' scheduling directive takes 4 parameters: split(i, i1, i2, splitFactor)"; @@ -1048,6 +1061,7 @@ int main(int argc, char* argv[]) { map loadedTensors; TensorBase temp_tensor; parser::Parser temp_parser(exprStr, formats, dataTypes, tensorsDimensions, loadedTensors, 42); + std::cout << exprStr << std::endl; try { temp_parser.parse(); temp_tensor = temp_parser.getResultTensor(); @@ -1148,19 +1162,27 @@ int main(int argc, char* argv[]) { taco_set_parallel_schedule(sched, chunkSize); taco_set_num_threads(nthreads); - IndexStmt stmt = - makeConcreteNotation(makeReductionNotation(tensor.getAssignment())); + Assignment assignment = tensor.getAssignment(); + std::cout << "tensor.getAssignment(): " << assignment << std::endl; + + IndexStmt stmt2 = makeReductionNotation(tensor.getAssignment()); + std::cout << "reducedNotation: " << stmt2 << std::endl; + // IndexStmt stmt = + // makeConcreteNotation(makeReductionNotation(tensor.getAssignment())); + 
IndexStmt stmt = makeConcreteNotation(stmt2); std::cout << "concrete index statement: " << stmt << std::endl; - stmt = justTraverseThroughTheIndexStmt(stmt); stmt = reorderLoopsTopologically(stmt); + std::cout << "topologically reordered loops statement: " << stmt << std::endl; if (setSchedule) { - int val = setSchedulingCommands(scheduleCommands, parser, stmt); + int val = setSchedulingCommands(scheduleCommands, parser, stmt, tensor.getAssignment()); + // stmt = loopFusionOverFission(stmt, tensor.getAssignment()); cuda |= (val==1); ispc |= (val==2); } else { + // stmt = loopFusionOverFission(stmt, tensor.getAssignment()); stmt = insertTemporaries(stmt); stmt = parallelizeOuterLoop(stmt); } @@ -1186,12 +1208,15 @@ int main(int argc, char* argv[]) { set_ISPC_codegen_enabled(false); } - std::cout << "running scalar promote\n" << std::endl; + std::cout << "running scalar promote\n" << std::endl; // stmt = scalarPromote(stmt); + std::cout << "\nafter scalar promote: \n" << stmt << std::endl << std::endl; + if (printConcrete) { cout << stmt << endl; } + // lower index statement to ir statement Kernel kernel; if (benchmark) { if (time) cout << endl; @@ -1278,6 +1303,11 @@ int main(int argc, char* argv[]) { compute = lower(stmt, prefix+"compute", computeWithAssemble, true); assemble = lower(stmt, prefix+"assemble", true, false); evaluate = lower(stmt, prefix+"evaluate", true, true); + + std::cout << "\n\ncompute kernel\n------------\n" << compute << std::endl << std::endl; + // compute kernel is the most basic kernel after lowering phase + + std::cout << "\n\nevaluate kernel\n------------\n" << evaluate << std::endl << std::endl; } string packComment = @@ -1411,7 +1441,7 @@ int main(int argc, char* argv[]) { } IterationGraph iterationGraph; - if (printIterationGraph) { + if (printIterationGraph) { // print iteration graph iterationGraph = IterationGraph::make(tensor.getAssignment()); } From 43d1bf7f03397c0445f74b7d78643313968e3d0a Mon Sep 17 00:00:00 2001 From: Adhhitha 
Dias Date: Tue, 10 May 2022 11:14:59 -0400 Subject: [PATCH 10/10] add results --- CMakeLists.txt | 2 +- test/stats/hadamard-gemm.txt | 172 +++++++ test/stats/sddmm-spmm-gemm.txt | 318 +++++++++++++ test/stats/sddmm-spmm.txt | 821 +++++++++++++++++++++++++++++++++ test/stats/spmm-spmm.txt | 172 +++++++ test/tests-scheduling-fuse.cpp | 239 +++++----- 6 files changed, 1613 insertions(+), 111 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index aff905db5..c9012ca2d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -16,7 +16,7 @@ option(OPENMP "Build with OpenMP execution support" ON) option(COVERAGE "Build with code coverage analysis" OFF) set(TACO_FEATURE_CUDA 0) set(TACO_FEATURE_ISPC 0) -set(TACO_FEATURE_OPENMP 0) +set(TACO_FEATURE_OPENMP 1) set(TACO_FEATURE_PYTHON 0) if(CUDA) message("-- Searching for CUDA Installation") diff --git a/test/stats/hadamard-gemm.txt b/test/stats/hadamard-gemm.txt index 7de96d3c5..6e730cf50 100644 --- a/test/stats/hadamard-gemm.txt +++ b/test/stats/hadamard-gemm.txt @@ -747,3 +747,175 @@ gemm time: 61505.4 kernel execution time: 245613 ms taco reference time: 245614 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 2708, B2_dimension: 2708, vals: 5429 +C1_dimension: 2708, C2_dimension: 128, vals: 346624 +D1_dimension: 2708, D2_dimension: 128, vals: 346624 +E1_dimension: 128, E2_dimension: 128, vals: 16384 + + +kernel execution time: 18.3809 ms +fused time: 19.1229 + +kernel execution time: 0.635828 ms +hadamard time: 0.983143 + +kernel execution time: 30.5122 ms +gemm time: 30.7819 + +kernel execution time: 23.6746 ms +taco reference time: 24.0784 +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/amazon.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 548551, B2_dimension: 548551, vals: 1851744 +C1_dimension: 548551, C2_dimension: 128, vals: 70214528 +D1_dimension: 
548551, D2_dimension: 128, vals: 70214528 +E1_dimension: 128, E2_dimension: 128, vals: 16384 + + +kernel execution time: 3580.2 ms +fused time: 3581 + +kernel execution time: 567.762 ms +hadamard time: 568.301 + +kernel execution time: 6079.96 ms +gemm time: 6080.46 + +kernel execution time: 8129.78 ms +taco reference time: 8130.38 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 2708, B2_dimension: 2708, vals: 5429 +C1_dimension: 2708, C2_dimension: 128, vals: 346624 +D1_dimension: 2708, D2_dimension: 128, vals: 346624 +E1_dimension: 128, E2_dimension: 128, vals: 16384 + + +kernel execution time: 18.4625 ms +fused time: 19.1824 + +kernel execution time: 0.520446 ms +hadamard time: 0.824011 + +kernel execution time: 30.2097 ms +gemm time: 30.46 + +kernel execution time: 23.4681 ms +taco reference time: 23.826 +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/amazon.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 548551, B2_dimension: 548551, vals: 1851744 +C1_dimension: 548551, C2_dimension: 128, vals: 70214528 +D1_dimension: 548551, D2_dimension: 128, vals: 70214528 +E1_dimension: 128, E2_dimension: 128, vals: 16384 + + +kernel execution time: 3528.39 ms +fused time: 3529.23 + +kernel execution time: 558.625 ms +hadamard time: 559.16 + +kernel execution time: 6157.3 ms +gemm time: 6158.14 + +kernel execution time: 8131.73 ms +taco reference time: 8132.69 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 2708, B2_dimension: 2708, vals: 5429 +C1_dimension: 2708, C2_dimension: 128, vals: 346624 +D1_dimension: 2708, D2_dimension: 128, vals: 346624 +E1_dimension: 128, E2_dimension: 128, vals: 16384 + + +kernel execution time: 2.27347 ms +fused time: 2.7115 + +kernel 
execution time: 0.180952 ms +hadamard time: 0.76318 + +kernel execution time: 2.72672 ms +gemm time: 3.22211 + +kernel execution time: 5.227 ms +taco reference time: 5.75632 +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/amazon.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 548551, B2_dimension: 548551, vals: 1851744 +C1_dimension: 548551, C2_dimension: 128, vals: 70214528 +D1_dimension: 548551, D2_dimension: 128, vals: 70214528 +E1_dimension: 128, E2_dimension: 128, vals: 16384 + + +kernel execution time: 164.815 ms +fused time: 165.539 + +kernel execution time: 96.629 ms +hadamard time: 97.303 + +kernel execution time: 202.068 ms +gemm time: 202.628 + +kernel execution time: 273.96 ms +taco reference time: 274.643 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 2708, B2_dimension: 2708, vals: 5429 +C1_dimension: 2708, C2_dimension: 128, vals: 346624 +D1_dimension: 2708, D2_dimension: 128, vals: 346624 +E1_dimension: 128, E2_dimension: 128, vals: 16384 + + +kernel execution time: 2.37004 ms +fused time: 3.11591 + +kernel execution time: 0.176612 ms +hadamard time: 0.833621 + +kernel execution time: 2.08823 ms +gemm time: 2.59022 + +kernel execution time: 3.36531 ms +taco reference time: 4.11087 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 2708, B2_dimension: 2708, vals: 5429 +C1_dimension: 2708, C2_dimension: 128, vals: 346624 +D1_dimension: 2708, D2_dimension: 128, vals: 346624 +E1_dimension: 128, E2_dimension: 128, vals: 16384 + + +kernel execution time: 19.3307 ms +fused time: 20.0662 + +kernel execution time: 0.496176 ms +hadamard time: 0.931803 + +kernel execution time: 30.1194 ms +gemm time: 30.3654 + +kernel execution time: 23.3946 ms +taco 
reference time: 23.7411 diff --git a/test/stats/sddmm-spmm-gemm.txt b/test/stats/sddmm-spmm-gemm.txt index 7bd2084ed..02665478f 100644 --- a/test/stats/sddmm-spmm-gemm.txt +++ b/test/stats/sddmm-spmm-gemm.txt @@ -1151,3 +1151,321 @@ gemm time: 421.26 kernel execution time: 326305 ms taco reference time: 326311 + +sddmm-spmm-gemm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx +ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); +B1_dimension: 2708, B2_dimension: 2708, vals: 5429 +C1_dimension: 2708, C2_dimension: 64, vals: 173312 +D1_dimension: 2708, D2_dimension: 64, vals: 173312 +E1_dimension: 2708, E2_dimension: 64, vals: 173312 +G1_dimension: 2708, G2_dimension: 64, vals: 4096 + + +kernel execution time: 5.08607 ms +fused time: 5.61989 + +kernel execution time: 0.557608 ms +sddmm time: 0.871642 + +kernel execution time: 0.465526 ms +sddmm ryan time: 0.7713 + +kernel execution time: 0.498686 ms +spmm ryan time: 0.739309 + +kernel execution time: 0.7957 ms +gemm time: 1.05919 + +kernel execution time: 42.447 ms +taco reference time: 42.885 +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/amazon.mtx +ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); +B1_dimension: 548551, B2_dimension: 548551, vals: 1851744 +C1_dimension: 548551, C2_dimension: 64, vals: 35107264 +D1_dimension: 548551, D2_dimension: 64, vals: 35107264 +E1_dimension: 548551, E2_dimension: 64, vals: 35107264 +G1_dimension: 548551, G2_dimension: 64, vals: 4096 + + +kernel execution time: 89.9099 ms +fused time: 90.5117 + +kernel execution time: 29.9086 ms +sddmm time: 30.4936 + +kernel execution time: 29.1529 ms +sddmm ryan time: 29.7063 + +kernel execution time: 34.6318 ms +spmm ryan time: 35.1535 + +kernel execution time: 66.4663 ms +gemm time: 67.0316 + +kernel execution time: 6272.25 ms +taco reference time: 6273.42 + +sddmm-spmm-gemm execution + +----------------------------------------- 
+/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx +ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); +B1_dimension: 2708, B2_dimension: 2708, vals: 5429 +C1_dimension: 2708, C2_dimension: 64, vals: 173312 +D1_dimension: 2708, D2_dimension: 64, vals: 173312 +E1_dimension: 2708, E2_dimension: 64, vals: 173312 +G1_dimension: 2708, G2_dimension: 64, vals: 4096 + + +kernel execution time: 3.72391 ms +fused time: 4.19698 + +kernel execution time: 0.585647 ms +sddmm time: 0.893112 + +kernel execution time: 0.483056 ms +sddmm ryan time: 0.79108 + +kernel execution time: 0.567518 ms +spmm ryan time: 0.808711 + +kernel execution time: 0.929183 ms +gemm time: 1.32543 + +kernel execution time: 35.7066 ms +taco reference time: 36.3331 +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/amazon.mtx +ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); +B1_dimension: 548551, B2_dimension: 548551, vals: 1851744 +C1_dimension: 548551, C2_dimension: 64, vals: 35107264 +D1_dimension: 548551, D2_dimension: 64, vals: 35107264 +E1_dimension: 548551, E2_dimension: 64, vals: 35107264 +G1_dimension: 548551, G2_dimension: 64, vals: 4096 + + +kernel execution time: 94.9377 ms +fused time: 95.7687 + +kernel execution time: 32.2051 ms +sddmm time: 32.7881 + +kernel execution time: 30.3982 ms +sddmm ryan time: 30.95 + +kernel execution time: 34.4172 ms +spmm ryan time: 34.9049 + +kernel execution time: 67.2709 ms +gemm time: 67.8035 + +kernel execution time: 6215.08 ms +taco reference time: 6216.26 + +sddmm-spmm-gemm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx +ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); +B1_dimension: 2708, B2_dimension: 2708, vals: 5429 +C1_dimension: 2708, C2_dimension: 64, vals: 173312 +D1_dimension: 2708, D2_dimension: 64, vals: 173312 +E1_dimension: 2708, E2_dimension: 64, vals: 173312 +G1_dimension: 2708, G2_dimension: 64, vals: 4096 + + +kernel execution time: 6.99173 ms +fused 
time: 7.86448 + +kernel execution time: 0.78061 ms +sddmm time: 1.28867 + +kernel execution time: 0.554227 ms +sddmm ryan time: 0.837111 + +kernel execution time: 0.909912 ms +spmm ryan time: 1.12908 + +kernel execution time: 7.60724 ms +gemm time: 7.85047 + +kernel execution time: 652.888 ms +taco reference time: 653.271 +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/amazon.mtx +ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); +B1_dimension: 548551, B2_dimension: 548551, vals: 1851744 +C1_dimension: 548551, C2_dimension: 64, vals: 35107264 +D1_dimension: 548551, D2_dimension: 64, vals: 35107264 +E1_dimension: 548551, E2_dimension: 64, vals: 35107264 +G1_dimension: 548551, G2_dimension: 64, vals: 4096 + + +kernel execution time: 1236.33 ms +fused time: 1236.87 + +kernel execution time: 249.805 ms +sddmm time: 250.356 + +kernel execution time: 247.195 ms +sddmm ryan time: 247.729 + +kernel execution time: 285.764 ms +spmm ryan time: 286.235 + +kernel execution time: 1529.34 ms +gemm time: 1529.83 + +kernel execution time: 190620 ms +taco reference time: 190621 + +sddmm-spmm-gemm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx +ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); +B1_dimension: 2708, B2_dimension: 2708, vals: 5429 +C1_dimension: 2708, C2_dimension: 64, vals: 173312 +D1_dimension: 2708, D2_dimension: 64, vals: 173312 +E1_dimension: 2708, E2_dimension: 64, vals: 173312 +G1_dimension: 2708, G2_dimension: 64, vals: 4096 + + +kernel execution time: 1.86163 ms +fused time: 2.34746 + +kernel execution time: 0.542927 ms +sddmm time: 1.05528 + +kernel execution time: 0.541998 ms +sddmm ryan time: 1.07672 + +kernel execution time: 0.524767 ms +spmm ryan time: 0.944293 + +kernel execution time: 0.75947 ms +gemm time: 1.2162 + +kernel execution time: 36.3755 ms +taco reference time: 37.0989 + +sddmm-spmm-gemm execution + +----------------------------------------- 
+/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx +ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); +B1_dimension: 2708, B2_dimension: 2708, vals: 5429 +C1_dimension: 2708, C2_dimension: 64, vals: 173312 +D1_dimension: 2708, D2_dimension: 64, vals: 173312 +E1_dimension: 2708, E2_dimension: 64, vals: 173312 +G1_dimension: 2708, G2_dimension: 64, vals: 4096 + + +kernel execution time: 1.97375 ms +fused time: 2.84436 + +kernel execution time: 0.881212 ms +sddmm time: 1.38907 + +kernel execution time: 0.545557 ms +sddmm ryan time: 1.0807 + +kernel execution time: 0.548488 ms +spmm ryan time: 0.978813 + +kernel execution time: 0.72955 ms +gemm time: 1.2023 + +kernel execution time: 34.867 ms +taco reference time: 35.5819 + +sddmm-spmm-gemm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx +ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); +B1_dimension: 2708, B2_dimension: 2708, vals: 5429 +C1_dimension: 2708, C2_dimension: 64, vals: 173312 +D1_dimension: 2708, D2_dimension: 64, vals: 173312 +E1_dimension: 2708, E2_dimension: 64, vals: 173312 +G1_dimension: 2708, G2_dimension: 64, vals: 4096 + + +kernel execution time: 1.69165 ms +fused time: 2.2114 + +kernel execution time: 0.908102 ms +sddmm time: 1.19792 + +kernel execution time: 0.513137 ms +sddmm ryan time: 0.807571 + +kernel execution time: 0.510327 ms +spmm ryan time: 0.76134 + +kernel execution time: 0.803101 ms +gemm time: 1.0684 + +kernel execution time: 45.9784 ms +taco reference time: 46.3901 + +sddmm-spmm-gemm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx +ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); +B1_dimension: 2708, B2_dimension: 2708, vals: 5429 +C1_dimension: 2708, C2_dimension: 64, vals: 173312 +D1_dimension: 2708, D2_dimension: 64, vals: 173312 +E1_dimension: 2708, E2_dimension: 64, vals: 173312 +G1_dimension: 2708, G2_dimension: 64, vals: 
4096 + + +kernel execution time: 1.82354 ms +fused time: 2.81223 + +kernel execution time: 0.926052 ms +sddmm time: 1.48292 + +kernel execution time: 0.564157 ms +sddmm ryan time: 1.14611 + +kernel execution time: 0.512447 ms +spmm ryan time: 0.925102 + +kernel execution time: 0.689109 ms +gemm time: 1.08196 + +kernel execution time: 34.7847 ms +taco reference time: 35.4182 + +sddmm-spmm-gemm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx +ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); +B1_dimension: 2708, B2_dimension: 2708, vals: 5429 +C1_dimension: 2708, C2_dimension: 64, vals: 173312 +D1_dimension: 2708, D2_dimension: 64, vals: 173312 +E1_dimension: 2708, E2_dimension: 64, vals: 173312 +G1_dimension: 2708, G2_dimension: 64, vals: 4096 + + +kernel execution time: 6.8174 ms +fused time: 7.69061 + +kernel execution time: 0.935843 ms +sddmm time: 1.46847 + +kernel execution time: 0.612468 ms +sddmm ryan time: 0.880662 + +kernel execution time: 0.831351 ms +spmm ryan time: 1.05745 + +kernel execution time: 7.58342 ms +gemm time: 7.82297 + +kernel execution time: 566.881 ms +taco reference time: 567.264 diff --git a/test/stats/sddmm-spmm.txt b/test/stats/sddmm-spmm.txt index cc1713e9f..df8d924b8 100644 --- a/test/stats/sddmm-spmm.txt +++ b/test/stats/sddmm-spmm.txt @@ -5172,3 +5172,824 @@ separate execution kernel execution time: 4107.02 ms sddmm time: 4122.77 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 5, B2_dimension: 5, vals: 19 +C1_dimension: 5, C2_dimension: 128, vals: 640 +D1_dimension: 5, D2_dimension: 128, vals: 640 +E1_dimension: 5, E2_dimension: 128, vals: 640 + + +kernel execution time: 0.115981 ms +fused time: 0.499507 + +separate execution + +sddmm-spmm execution + +----------------------------------------- 
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 5, B2_dimension: 5, vals: 19 +C1_dimension: 5, C2_dimension: 128, vals: 640 +D1_dimension: 5, D2_dimension: 128, vals: 640 +E1_dimension: 5, E2_dimension: 128, vals: 640 + + +kernel execution time: 0.133052 ms +fused time: 3.69599 + +separate execution + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 2700, B2_dimension: 2700, vals: 5400 +C1_dimension: 2700, C2_dimension: 128, vals: 345600 +D1_dimension: 2700, D2_dimension: 128, vals: 345600 +E1_dimension: 2700, E2_dimension: 128, vals: 345600 + + +kernel execution time: 0.606469 ms +fused time: 4.32552 + +separate execution + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 2700, B2_dimension: 2700, vals: 5400 +C1_dimension: 2700, C2_dimension: 128, vals: 345600 +D1_dimension: 2700, D2_dimension: 128, vals: 345600 +E1_dimension: 2700, E2_dimension: 128, vals: 345600 + + +kernel execution time: 0.650529 ms +fused time: 1.40893 + +separate execution + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 2708, B2_dimension: 2708, vals: 5400 +C1_dimension: 2708, C2_dimension: 128, vals: 346624 +D1_dimension: 2708, D2_dimension: 128, vals: 346624 +E1_dimension: 2708, E2_dimension: 128, vals: 346624 + + +kernel execution time: 0.620999 ms +fused time: 1.38301 + +separate execution + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 2708, 
B2_dimension: 2708, vals: 5400 +C1_dimension: 2708, C2_dimension: 128, vals: 346624 +D1_dimension: 2708, D2_dimension: 128, vals: 346624 +E1_dimension: 2708, E2_dimension: 128, vals: 346624 + + +kernel execution time: 0.652959 ms +fused time: 3.94184 + +separate execution + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 2708, B2_dimension: 2708, vals: 5429 +C1_dimension: 2708, C2_dimension: 128, vals: 346624 +D1_dimension: 2708, D2_dimension: 128, vals: 346624 +E1_dimension: 2708, E2_dimension: 128, vals: 346624 + + +kernel execution time: 0.597158 ms +fused time: 4.27836 + +separate execution + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 2708, B2_dimension: 2708, vals: 5429 +C1_dimension: 2708, C2_dimension: 128, vals: 346624 +D1_dimension: 2708, D2_dimension: 128, vals: 346624 +E1_dimension: 2708, E2_dimension: 128, vals: 346624 + + +kernel execution time: 0.659809 ms +fused time: 4.6484 + +separate execution + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 2708, B2_dimension: 2708, vals: 5429 +C1_dimension: 2708, C2_dimension: 128, vals: 346624 +D1_dimension: 2708, D2_dimension: 128, vals: 346624 +E1_dimension: 2708, E2_dimension: 128, vals: 346624 + + +kernel execution time: 0.591018 ms +fused time: 2.44084 + +separate execution + +kernel execution time: 0.607388 ms +sddmm time: 0.891202 + +kernel execution time: 0.857981 ms +sddmm time: 1.16087 + +kernel execution time: 0.922992 ms +spmm time: 1.60378 + +reference execution + +kernel execution time: 4.47191 ms +taco reference time: 5.26226 + +sddmm-spmm execution + 
+----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 2708, B2_dimension: 2708, vals: 5429 +C1_dimension: 2708, C2_dimension: 128, vals: 346624 +D1_dimension: 2708, D2_dimension: 128, vals: 346624 +E1_dimension: 2708, E2_dimension: 128, vals: 346624 + + +kernel execution time: 0.658879 ms +fused time: 4.15402 + +separate execution + +kernel execution time: 0.70888 ms +sddmm time: 1.21343 + +kernel execution time: 0.531398 ms +sddmm time: 1.30729 + +kernel execution time: 0.965464 ms +spmm time: 2.35378 + +reference execution + +kernel execution time: 3.48771 ms +taco reference time: 7.55141 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 2708, B2_dimension: 2708, vals: 5429 +C1_dimension: 2708, C2_dimension: 128, vals: 346624 +D1_dimension: 2708, D2_dimension: 128, vals: 346624 +E1_dimension: 2708, E2_dimension: 128, vals: 346624 + + +kernel execution time: 0.616739 ms +fused time: 4.4146 + +separate execution + +kernel execution time: 0.556318 ms +sddmm time: 3.03196 + +kernel execution time: 0.945623 ms +sddmm time: 1.89019 + +kernel execution time: 0.777471 ms +spmm time: 3.57728 + +reference execution + +kernel execution time: 3.22827 ms +taco reference time: 7.39799 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 2708, B2_dimension: 2708, vals: 5429 +C1_dimension: 2708, C2_dimension: 128, vals: 346624 +D1_dimension: 2708, D2_dimension: 128, vals: 346624 +E1_dimension: 2708, E2_dimension: 128, vals: 346624 + + +kernel execution time: 0.65531 ms +fused time: 4.08374 + +separate execution + +kernel execution time: 0.666219 ms +sddmm time: 1.20641 + +kernel execution 
time: 0.941573 ms +sddmm time: 1.73185 + +kernel execution time: 1.01493 ms +spmm time: 1.75608 + +reference execution + +kernel execution time: 5.25507 ms +taco reference time: 6.04624 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 2708, B2_dimension: 2708, vals: 5429 +C1_dimension: 2708, C2_dimension: 128, vals: 346624 +D1_dimension: 2708, D2_dimension: 128, vals: 346624 +E1_dimension: 2708, E2_dimension: 128, vals: 346624 + + +kernel execution time: 0.670959 ms +fused time: 1.50328 + +separate execution + +kernel execution time: 0.600268 ms +sddmm time: 1.32833 + +kernel execution time: 0.476237 ms +sddmm time: 0.792151 + +kernel execution time: 0.781091 ms +spmm time: 1.10271 + +reference execution + +kernel execution time: 3.07623 ms +taco reference time: 3.53829 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 2708, B2_dimension: 2708, vals: 5429 +C1_dimension: 2708, C2_dimension: 128, vals: 346624 +D1_dimension: 2708, D2_dimension: 128, vals: 346624 +E1_dimension: 2708, E2_dimension: 128, vals: 346624 + + +kernel execution time: 0.760541 ms +fused time: 1.49073 + +separate execution + +kernel execution time: 0.639829 ms +sddmm time: 1.21327 + +kernel execution time: 0.576218 ms +sddmm time: 1.14083 + +kernel execution time: 0.829512 ms +spmm time: 1.33624 + +reference execution + +kernel execution time: 4.14591 ms +taco reference time: 4.82508 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 2708, B2_dimension: 2708, vals: 5429 +C1_dimension: 2708, C2_dimension: 128, vals: 346624 +D1_dimension: 2708, D2_dimension: 128, vals: 346624 
+E1_dimension: 2708, E2_dimension: 128, vals: 346624 + + +kernel execution time: 0.638949 ms +fused time: 1.02277 + +separate execution + +kernel execution time: 0.945034 ms +sddmm time: 1.20456 + +kernel execution time: 0.6772 ms +sddmm time: 0.943263 + +kernel execution time: 0.888033 ms +spmm time: 1.133 + +reference execution + +kernel execution time: 3.82989 ms +taco reference time: 4.18452 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 2708, B2_dimension: 2708, vals: 5429 +C1_dimension: 2708, C2_dimension: 128, vals: 346624 +D1_dimension: 2708, D2_dimension: 128, vals: 346624 +E1_dimension: 2708, E2_dimension: 128, vals: 346624 + + +kernel execution time: 0.7361 ms +fused time: 1.45315 + +separate execution + +kernel execution time: 0.7335 ms +sddmm time: 1.25184 + +kernel execution time: 0.642509 ms +sddmm time: 1.16064 + +kernel execution time: 1.02361 ms +spmm time: 1.48614 + +reference execution + +kernel execution time: 4.12035 ms +taco reference time: 4.75857 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/amazon.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 334863, B2_dimension: 334863, vals: 777323 +C1_dimension: 334863, C2_dimension: 128, vals: 42862464 +D1_dimension: 334863, D2_dimension: 128, vals: 42862464 +E1_dimension: 334863, E2_dimension: 128, vals: 42862464 + + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/amazon.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 548551, B2_dimension: 548551, vals: 925872 +C1_dimension: 548551, C2_dimension: 128, vals: 70214528 +D1_dimension: 548551, D2_dimension: 128, vals: 70214528 +E1_dimension: 548551, E2_dimension: 128, vals: 70214528 + + +kernel execution time: 66.4595 ms +fused 
time: 66.9196 + +separate execution + +kernel execution time: 22.9317 ms +sddmm time: 23.4738 + +kernel execution time: 22.4453 ms +sddmm time: 23.0045 + +kernel execution time: 44.2796 ms +spmm time: 44.8052 + +reference execution + +kernel execution time: 187.6 ms +taco reference time: 188.247 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/amazon.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 548551, B2_dimension: 548551, vals: 1851744 +C1_dimension: 548551, C2_dimension: 128, vals: 70214528 +D1_dimension: 548551, D2_dimension: 128, vals: 70214528 +E1_dimension: 548551, E2_dimension: 128, vals: 70214528 + + +kernel execution time: 103.551 ms +fused time: 104.018 + +separate execution + +kernel execution time: 39.9535 ms +sddmm time: 40.5639 + +kernel execution time: 39.2683 ms +sddmm time: 39.8581 + +kernel execution time: 65.8336 ms +spmm time: 66.417 + +reference execution + +kernel execution time: 306.901 ms +taco reference time: 307.61 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/amazon.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 548551, B2_dimension: 548551, vals: 1851744 +C1_dimension: 548551, C2_dimension: 128, vals: 70214528 +D1_dimension: 548551, D2_dimension: 128, vals: 70214528 +E1_dimension: 548551, E2_dimension: 128, vals: 70214528 + + +kernel execution time: 106.782 ms +fused time: 107.261 + +separate execution + +kernel execution time: 40.7961 ms +sddmm time: 41.3604 + +kernel execution time: 39.8676 ms +sddmm time: 40.4959 + +kernel execution time: 66.2656 ms +spmm time: 66.8105 + +reference execution + +kernel execution time: 367.416 ms +taco reference time: 368.086 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/amazon.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 
548551, B2_dimension: 548551, vals: 1851744 +C1_dimension: 548551, C2_dimension: 128, vals: 70214528 +D1_dimension: 548551, D2_dimension: 128, vals: 70214528 +E1_dimension: 548551, E2_dimension: 128, vals: 70214528 + + +kernel execution time: 108.809 ms +fused time: 109.274 + +separate execution + +kernel execution time: 42.2311 ms +sddmm time: 42.826 + +kernel execution time: 41.711 ms +sddmm time: 42.3721 + +kernel execution time: 65.9512 ms +spmm time: 66.5647 + +reference execution + +kernel execution time: 360.581 ms +taco reference time: 361.225 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/amazon.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 548551, B2_dimension: 548551, vals: 1851744 +C1_dimension: 548551, C2_dimension: 128, vals: 70214528 +D1_dimension: 548551, D2_dimension: 128, vals: 70214528 +E1_dimension: 548551, E2_dimension: 128, vals: 70214528 + + +kernel execution time: 922.149 ms +fused time: 922.605 + +separate execution + +kernel execution time: 392.18 ms +sddmm time: 392.716 + +kernel execution time: 393.251 ms +sddmm time: 393.777 + +kernel execution time: 520.496 ms +spmm time: 521.007 + +reference execution + +kernel execution time: 9912.29 ms +taco reference time: 9913.37 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 2708, B2_dimension: 2708, vals: 5429 +C1_dimension: 2708, C2_dimension: 128, vals: 346624 +D1_dimension: 2708, D2_dimension: 128, vals: 346624 +E1_dimension: 2708, E2_dimension: 128, vals: 346624 + + +kernel execution time: 2.15935 ms +fused time: 2.88765 + +separate execution + +kernel execution time: 1.09729 ms +sddmm time: 1.64867 + +kernel execution time: 0.987463 ms +sddmm time: 1.50853 + +kernel execution time: 2.22996 ms +spmm time: 2.71273 + +reference execution + +kernel execution 
time: 29.4617 ms +taco reference time: 29.8511 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 2708, B2_dimension: 2708, vals: 5429 +C1_dimension: 2708, C2_dimension: 128, vals: 346624 +D1_dimension: 2708, D2_dimension: 128, vals: 346624 +E1_dimension: 2708, E2_dimension: 128, vals: 346624 + + +kernel execution time: 0.667108 ms +fused time: 1.05163 + +separate execution + +kernel execution time: 0.680159 ms +sddmm time: 0.994963 + +kernel execution time: 0.611478 ms +sddmm time: 1.1057 + +kernel execution time: 0.988313 ms +spmm time: 1.4939 + +reference execution + +kernel execution time: 3.64386 ms +taco reference time: 4.33446 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 2708, B2_dimension: 2708, vals: 5429 +C1_dimension: 2708, C2_dimension: 128, vals: 346624 +D1_dimension: 2708, D2_dimension: 128, vals: 346624 +E1_dimension: 2708, E2_dimension: 128, vals: 346624 + + +kernel execution time: 0.691709 ms +fused time: 1.07767 + +separate execution + +kernel execution time: 0.516997 ms +sddmm time: 0.77957 + +kernel execution time: 0.458366 ms +sddmm time: 0.73026 + +kernel execution time: 0.777811 ms +spmm time: 1.01678 + +reference execution + +kernel execution time: 3.47463 ms +taco reference time: 3.82426 +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/amazon.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 548551, B2_dimension: 548551, vals: 1851744 +C1_dimension: 548551, C2_dimension: 128, vals: 70214528 +D1_dimension: 548551, D2_dimension: 128, vals: 70214528 +E1_dimension: 548551, E2_dimension: 128, vals: 70214528 + + +kernel execution time: 104.681 ms +fused time: 105.128 + +separate execution + +kernel execution time: 39.5478 ms +sddmm time: 40.1164 
+ +kernel execution time: 40.2068 ms +sddmm time: 40.7802 + +kernel execution time: 67.2769 ms +spmm time: 67.8666 + +reference execution + +kernel execution time: 378.806 ms +taco reference time: 379.526 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 2708, B2_dimension: 2708, vals: 5429 +C1_dimension: 2708, C2_dimension: 128, vals: 346624 +D1_dimension: 2708, D2_dimension: 128, vals: 346624 +E1_dimension: 2708, E2_dimension: 128, vals: 346624 + + +kernel execution time: 2.0421 ms +fused time: 2.77318 + +separate execution + +kernel execution time: 0.890922 ms +sddmm time: 1.4406 + +kernel execution time: 0.673509 ms +sddmm time: 0.955103 + +kernel execution time: 1.93153 ms +spmm time: 2.18341 + +reference execution + +kernel execution time: 33.2851 ms +taco reference time: 33.6343 +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/amazon.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 548551, B2_dimension: 548551, vals: 1851744 +C1_dimension: 548551, C2_dimension: 128, vals: 70214528 +D1_dimension: 548551, D2_dimension: 128, vals: 70214528 +E1_dimension: 548551, E2_dimension: 128, vals: 70214528 + + +kernel execution time: 913.728 ms +fused time: 914.178 + +separate execution + +kernel execution time: 389.744 ms +sddmm time: 390.317 + +kernel execution time: 389.105 ms +sddmm time: 389.68 + +kernel execution time: 520.43 ms +spmm time: 520.979 + +reference execution + +kernel execution time: 9970.19 ms +taco reference time: 9971.18 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 2708, B2_dimension: 2708, vals: 5429 +C1_dimension: 2708, C2_dimension: 128, vals: 346624 +D1_dimension: 2708, D2_dimension: 128, vals: 346624 +E1_dimension: 2708, E2_dimension: 128, 
vals: 346624 + + +kernel execution time: 1.81249 ms +fused time: 2.53831 + +separate execution + +kernel execution time: 1.41327 ms +sddmm time: 1.9866 + +kernel execution time: 0.687839 ms +sddmm time: 0.957583 + +kernel execution time: 1.99132 ms +spmm time: 2.2301 + +reference execution + +kernel execution time: 33.8389 ms +taco reference time: 34.1855 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 2708, B2_dimension: 2708, vals: 5429 +C1_dimension: 2708, C2_dimension: 128, vals: 346624 +D1_dimension: 2708, D2_dimension: 128, vals: 346624 +E1_dimension: 2708, E2_dimension: 128, vals: 346624 + + +kernel execution time: 2.08639 ms +fused time: 2.81403 + +separate execution + +kernel execution time: 0.75901 ms +sddmm time: 1.27309 + +kernel execution time: 0.72208 ms +sddmm time: 1.00494 + +kernel execution time: 1.95748 ms +spmm time: 2.20503 + +reference execution + +kernel execution time: 33.4827 ms +taco reference time: 33.8347 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 2708, B2_dimension: 2708, vals: 5429 +C1_dimension: 2708, C2_dimension: 128, vals: 346624 +D1_dimension: 2708, D2_dimension: 128, vals: 346624 +E1_dimension: 2708, E2_dimension: 128, vals: 346624 + + +kernel execution time: 2.09414 ms +fused time: 2.82691 + +separate execution + +kernel execution time: 1.03623 ms +sddmm time: 1.58316 + +kernel execution time: 0.653819 ms +sddmm time: 0.926463 + +kernel execution time: 1.88145 ms +spmm time: 2.12517 + +reference execution + +kernel execution time: 33.3395 ms +taco reference time: 33.6915 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx 
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 2708, B2_dimension: 2708, vals: 5429 +C1_dimension: 2708, C2_dimension: 128, vals: 346624 +D1_dimension: 2708, D2_dimension: 128, vals: 346624 +E1_dimension: 2708, E2_dimension: 128, vals: 346624 + + +kernel execution time: 1.70968 ms +fused time: 2.43176 + +separate execution + +kernel execution time: 0.76455 ms +sddmm time: 1.31209 + +kernel execution time: 0.664099 ms +sddmm time: 0.932353 + +kernel execution time: 1.92536 ms +spmm time: 2.17072 + +reference execution + +kernel execution time: 32.5601 ms +taco reference time: 32.9017 diff --git a/test/stats/spmm-spmm.txt b/test/stats/spmm-spmm.txt index 2cc71e519..329aacd65 100644 --- a/test/stats/spmm-spmm.txt +++ b/test/stats/spmm-spmm.txt @@ -3430,3 +3430,175 @@ spmm-spmm execution ----------------------------------------- filenum: 3 --------------------------------- + +spmm-spmm execution + +----------------------------------------- +filenum: 0 +--------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 2708, B2_dimension: 2708, vals: 5429 +C1_dimension: 2708, C2_dimension: 128, vals: 346624 +D1_dimension: 128, D2_dimension: 64, vals: 8192 + + +--------- 1st pattern computation TTM, GEMM + +kernel execution time: 0.924512 ms +SpMM time: 1.22967 + +kernel execution time: 1.23287 ms +SpMM template time: 1.51353 + +kernel execution time: 20.7805 ms +GeMM time: 21.0769 + +kernel execution time: 19.6116 ms +ref 2 GeMM template time: 19.8379 + +--------- 2nd pattern computation GEMM, SpMM + +kernel execution time: 14.7563 ms +ref3 GeMM template time: 15.0245 + +kernel execution time: 0.823641 ms +SpMM template time ref4: 1.05233 + +-------- reference pattern computation + +kernel execution time: 34.1041 ms +taco reference time: 34.4607 + +kernel execution time: 41.9195 ms +taco reference new time: 42.2061 + +kernel execution time: 4.76242 ms +fused time: 5.04101 +filenum: 
1 +--------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/amazon.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 548551, B2_dimension: 548551, vals: 1851744 +C1_dimension: 548551, C2_dimension: 128, vals: 70214528 +D1_dimension: 128, D2_dimension: 64, vals: 8192 + + +--------- 1st pattern computation TTM, GEMM + +kernel execution time: 394.8 ms +SpMM time: 395.503 + +kernel execution time: 473.148 ms +SpMM template time: 473.684 + +kernel execution time: 4117.68 ms +GeMM time: 4118.6 + +kernel execution time: 3957.31 ms +ref 2 GeMM template time: 3958.16 + +--------- 2nd pattern computation GEMM, SpMM + +kernel execution time: 3017.13 ms +ref3 GeMM template time: 3017.67 + +kernel execution time: 314.652 ms +SpMM template time ref4: 315.164 + +-------- reference pattern computation + +kernel execution time: 11644.6 ms +taco reference time: 11645.6 + +kernel execution time: 14402.6 ms +taco reference new time: 14403.6 + +kernel execution time: 1261.33 ms +fused time: 1261.88 + +spmm-spmm execution + +----------------------------------------- +filenum: 0 +--------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 2708, B2_dimension: 2708, vals: 5429 +C1_dimension: 2708, C2_dimension: 128, vals: 346624 +D1_dimension: 128, D2_dimension: 64, vals: 8192 + + +--------- 1st pattern computation TTM, GEMM + +kernel execution time: 0.209133 ms +SpMM time: 0.517016 + +kernel execution time: 0.579748 ms +SpMM template time: 0.864251 + +kernel execution time: 1.0574 ms +GeMM time: 1.37727 + +kernel execution time: 19.621 ms +ref 2 GeMM template time: 19.8504 + +--------- 2nd pattern computation GEMM, SpMM + +kernel execution time: 1.44618 ms +ref3 GeMM template time: 1.72243 + +kernel execution time: 0.384425 ms +SpMM template time ref4: 0.610708 + +-------- reference pattern computation + +kernel execution time: 3.59893 ms +taco reference time: 
3.95508 + +kernel execution time: 4.81855 ms +taco reference new time: 5.10349 + +kernel execution time: 1.47107 ms +fused time: 1.90463 +filenum: 1 +--------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/amazon.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 548551, B2_dimension: 548551, vals: 1851744 +C1_dimension: 548551, C2_dimension: 128, vals: 70214528 +D1_dimension: 128, D2_dimension: 64, vals: 8192 + + +--------- 1st pattern computation TTM, GEMM + +kernel execution time: 50.1795 ms +SpMM time: 50.5567 + +kernel execution time: 64.2504 ms +SpMM template time: 64.8179 + +kernel execution time: 96.8464 ms +GeMM time: 97.4123 + +kernel execution time: 3949.87 ms +ref 2 GeMM template time: 3950.93 + +--------- 2nd pattern computation GEMM, SpMM + +kernel execution time: 123.802 ms +ref3 GeMM template time: 124.342 + +kernel execution time: 39.2723 ms +SpMM template time ref4: 39.8322 + +-------- reference pattern computation + +kernel execution time: 457.271 ms +taco reference time: 457.979 + +kernel execution time: 427.194 ms +taco reference new time: 427.789 + +kernel execution time: 93.1417 ms +fused time: 93.7188 diff --git a/test/tests-scheduling-fuse.cpp b/test/tests-scheduling-fuse.cpp index bd77f1d64..41fb86f6f 100644 --- a/test/tests-scheduling-fuse.cpp +++ b/test/tests-scheduling-fuse.cpp @@ -7,8 +7,8 @@ #include #include -// #define NUM_THREADS_TO_USE 64 -#define NUM_THREADS_TO_USE 32 +#define NUM_THREADS_TO_USE 1 +// #define NUM_THREADS_TO_USE 32 void handle_error (int retval) { @@ -518,13 +518,15 @@ TEST(scheduling_eval, sddmmFused) { // vector filenums = {2,3,4,5,6,7,8,9,10,12,15}; - vector filenums = {1}; + vector filenums = {0}; for (auto filenum : filenums) { // int filenum = 5; std::vector matfiles = { + "/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx", + "/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/amazon.mtx", 
"/home/min/a/kadhitha/ispc-examples/data/suitesparse/synthetic/synthetic.mtx", "/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx", "/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx", @@ -545,6 +547,8 @@ TEST(scheduling_eval, sddmmFused) { "/home/min/a/kadhitha/ispc-examples/data/suitesparse/twitter7/twitter7.mtx" }; std::vector matfilesrw = { + "/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/rw/cora.mtx", + "/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/rw/amazon.mtx", "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/synthetic.mtx", "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/cage3.mtx", "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/bcsstk17.mtx", @@ -688,66 +692,66 @@ TEST(scheduling_eval, sddmmFused) { statfile << "\nseparate execution\n"; - // // std::string sofile_sddmm = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/csr_dense_spmm.so"; - // std::string sofile_sddmm = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/csr_dense_dense_sddmm.so"; - // TOOL_BENCHMARK_TIMER(ref1.compute(statfile, sofile_sddmm), "\n\nSDDMM Kernel: ", timevalue); - // if (statfile.is_open()) { - // statfile << "sddmm time: "; - // statfile << timevalue.mean << std::endl; - // } else { std::cout << " stat file is not open\n"; } + // std::string sofile_sddmm = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/csr_dense_spmm.so"; + std::string sofile_sddmm = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/csr_dense_dense_sddmm.so"; + TOOL_BENCHMARK_TIMER(ref1.compute(statfile, sofile_sddmm), "\n\nSDDMM Kernel: ", timevalue); + if (statfile.is_open()) { + statfile << "sddmm time: "; + statfile << timevalue.mean << std::endl; + } else { std::cout << " stat file is not open\n"; } - // std::string sofile_sddmm_ryan = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/sddmm_ryan.so"; - // 
TOOL_BENCHMARK_TIMER(ref1.compute(statfile, sofile_sddmm_ryan), "\n\nSDDMM Kernel: ", timevalue); - // if (statfile.is_open()) { - // statfile << "sddmm time: "; - // statfile << timevalue.mean << std::endl; - // } else { std::cout << " stat file is not open\n"; } + std::string sofile_sddmm_ryan = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/sddmm_ryan.so"; + TOOL_BENCHMARK_TIMER(ref1.compute(statfile, sofile_sddmm_ryan), "\n\nSDDMM Kernel: ", timevalue); + if (statfile.is_open()) { + statfile << "sddmm time: "; + statfile << timevalue.mean << std::endl; + } else { std::cout << " stat file is not open\n"; } - // std::string sofile_spmm = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/csr_dense_spmm.so"; - // TOOL_BENCHMARK_TIMER(ref2.compute(statfile, sofile_spmm), "\n\nSpMM Kernel: ", timevalue); - // if (statfile.is_open()) { - // statfile << "spmm time: "; - // statfile << timevalue.mean << std::endl; - // } else { std::cout << " stat file is not open\n"; } + std::string sofile_spmm = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/csr_dense_spmm.so"; + TOOL_BENCHMARK_TIMER(ref2.compute(statfile, sofile_spmm), "\n\nSpMM Kernel: ", timevalue); + if (statfile.is_open()) { + statfile << "spmm time: "; + statfile << timevalue.mean << std::endl; + } else { std::cout << " stat file is not open\n"; } - // statfile << "\nreference execution \n"; + statfile << "\nreference execution \n"; - // std::string sofile_original = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/taco_original.so"; - // TOOL_BENCHMARK_TIMER(ref.compute(statfile, sofile_original), "\n\nReference Kernel: ", timevalue); - // if (statfile.is_open()) { - // statfile << "taco reference time: "; - // statfile << timevalue << std::endl; - // } else { std::cout << " stat file is not open\n"; } - - // double* A_vals = (double*) (A.getTacoTensorT()->vals); - // double* ref_vals = (double*) (ref.getTacoTensorT()->vals); - 
// double* ref2_vals = (double*) (ref2.getTacoTensorT()->vals); + std::string sofile_original = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/taco_original.so"; + TOOL_BENCHMARK_TIMER(ref.compute(statfile, sofile_original), "\n\nReference Kernel: ", timevalue); + if (statfile.is_open()) { + statfile << "taco reference time: "; + statfile << timevalue << std::endl; + } else { std::cout << " stat file is not open\n"; } - // // int* A2_pos = (double*) (ref.getTacoTensorT()->vals); + double* A_vals = (double*) (A.getTacoTensorT()->vals); + double* ref_vals = (double*) (ref.getTacoTensorT()->vals); + double* ref2_vals = (double*) (ref2.getTacoTensorT()->vals); - // // for (size_t q=0; q < B.getStorage().getValues().getSize(); q++) { - // // if ( abs(A_vals[q] - ref_vals[q])/abs(ref_vals[q]) > ERROR_MARGIN) { - // // std::cout << "error: results don't match i: " << q << ", avals: " << A_vals[q] << " " - // // << "refvals: " << ref_vals[q] << std::endl; - // // ASSERT_TRUE(false); - // // } - // // } + // int* A2_pos = (double*) (ref.getTacoTensorT()->vals); - // for (size_t q=0; q < A.getDimension(0)* A.getDimension(1); q++) { + // for (size_t q=0; q < B.getStorage().getValues().getSize(); q++) { // if ( abs(A_vals[q] - ref_vals[q])/abs(ref_vals[q]) > ERROR_MARGIN) { // std::cout << "error: results don't match i: " << q << ", avals: " << A_vals[q] << " " // << "refvals: " << ref_vals[q] << std::endl; // ASSERT_TRUE(false); // } // } - // for (size_t q=0; q < A.getDimension(0)* A.getDimension(1); q++) { - // if ( abs(A_vals[q] - ref2_vals[q])/abs(ref2_vals[q]) > ERROR_MARGIN) { - // std::cout << "error: results don't match i: " << q << ", avals: " << A_vals[q] << " " - // << "refvals: " << ref2_vals[q] << std::endl; - // ASSERT_TRUE(false); - // } - // } - // for (int q= 0; q< A_vals + + for (size_t q=0; q < A.getDimension(0)* A.getDimension(1); q++) { + if ( abs(A_vals[q] - ref_vals[q])/abs(ref_vals[q]) > ERROR_MARGIN) { + std::cout << "error: 
results don't match i: " << q << ", avals: " << A_vals[q] << " " + << "refvals: " << ref_vals[q] << std::endl; + ASSERT_TRUE(false); + } + } + for (size_t q=0; q < A.getDimension(0)* A.getDimension(1); q++) { + if ( abs(A_vals[q] - ref2_vals[q])/abs(ref2_vals[q]) > ERROR_MARGIN) { + std::cout << "error: results don't match i: " << q << ", avals: " << A_vals[q] << " " + << "refvals: " << ref2_vals[q] << std::endl; + ASSERT_TRUE(false); + } + } + // // for (int q= 0; q< A_vals // for (int q = 0; q < A.getDimension(0); ++q) { // for (int w = 0; w < A.getDimension(1); ++w) { // if ( abs(A(q,w) - ref(q,w))/abs(ref(q,w)) > ERROR_MARGIN) { @@ -775,6 +779,8 @@ TEST(scheduling_eval, hadamardFused) { return; } + taco_set_num_threads(NUM_THREADS_TO_USE); + ofstream statfile; statfile.open( "/home/min/a/kadhitha/workspace/my_taco/taco/test/stats/hadamard-gemm.txt", std::ios::app); @@ -791,14 +797,16 @@ TEST(scheduling_eval, hadamardFused) { int kdim = 128; int ldim = 128; - vector filenums = {2,3,4,5,6,7,8,9,10,12,15}; - // vector filenums = {8,9,10,12}; + // vector filenums = {2,3,4,5,6,7,8,9,10,12,15}; + vector filenums = {0}; for (auto filenum : filenums) { // int filenum = 15; std::vector matfiles = { + "/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx", + "/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/amazon.mtx", "/home/min/a/kadhitha/ispc-examples/data/suitesparse/synthetic/synthetic.mtx", "/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx", "/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx", // 2 @@ -819,6 +827,8 @@ TEST(scheduling_eval, hadamardFused) { "/home/min/a/kadhitha/ispc-examples/data/suitesparse/twitter7/twitter7.mtx" }; std::vector matfilesrw = { + "/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/rw/cora.mtx", + "/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/rw/amazon.mtx", "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/synthetic.mtx", 
"/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/cage3.mtx", "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/bcsstk17.mtx", @@ -2121,20 +2131,20 @@ TEST(scheduling_eval, spmmFused) { return; } - int retval, EventSet = PAPI_NULL; - retval = PAPI_hl_region_begin("dummy"); - if ( retval != PAPI_OK ) handle_error(1); + // int retval, EventSet = PAPI_NULL; + // retval = PAPI_hl_region_begin("dummy"); + // if ( retval != PAPI_OK ) handle_error(1); /* Do some computation */ - retval = PAPI_hl_region_end("dummy"); - if ( retval != PAPI_OK ) handle_error(1); + // retval = PAPI_hl_region_end("dummy"); + // if ( retval != PAPI_OK ) handle_error(1); taco_set_num_threads(NUM_THREADS_TO_USE); ofstream statfile; statfile.open( - "/home/min/a/kadhitha/workspace/my_taco/taco/test/stats/spmm-spmm.txt", std::ios::app); + "/home/min/a/kadhitha/workspace/my_taco/taco/test/stats/spmm-gemm.txt", std::ios::app); if (statfile.is_open()) { statfile << "\nspmm-spmm execution\n"; statfile << "\n-----------------------------------------\n"; @@ -2149,7 +2159,7 @@ TEST(scheduling_eval, spmmFused) { int ldim = 64; // vector filenums = {2,3,4,5,6,7,8,9,10,12,15}; - vector filenums = {3}; + vector filenums = {0}; for (auto filenum : filenums) { @@ -2159,6 +2169,8 @@ TEST(scheduling_eval, spmmFused) { // int filenum = 7; std::vector matfiles = { + "/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx", + "/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/amazon.mtx", "/home/min/a/kadhitha/ispc-examples/data/suitesparse/synthetic/synthetic.mtx", "/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx", "/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx", // 2 @@ -2180,6 +2192,8 @@ TEST(scheduling_eval, spmmFused) { "/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k.mtx", }; std::vector matfilesrw = { + "/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/rw/cora.mtx", + 
"/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/rw/amazon.mtx", "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/synthetic.mtx", "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/cage3.mtx", "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/bcsstk17.mtx", @@ -2377,42 +2391,42 @@ TEST(scheduling_eval, spmmFused) { statfile << "\n--------- 1st pattern computation TTM, GEMM\n"; - retval = PAPI_hl_region_begin("spmm"); - if ( retval != PAPI_OK ) handle_error(1); + // retval = PAPI_hl_region_begin("spmm"); + // if ( retval != PAPI_OK ) handle_error(1); TOOL_BENCHMARK_TIMER(ref1.compute(statfile), "\n\nSpMM Kernel: ", timevalue); - retval = PAPI_hl_region_end("spmm"); - if ( retval != PAPI_OK ) handle_error(1); + // retval = PAPI_hl_region_end("spmm"); + // if ( retval != PAPI_OK ) handle_error(1); if (statfile.is_open()) { statfile << "SpMM time: "; statfile << timevalue.mean << std::endl; } else { std::cout << " stat file is not open\n"; } std::string sofile_spmm_template = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/csr_dense_spmm.so"; - retval = PAPI_hl_region_begin("spmmtemplate"); - if ( retval != PAPI_OK ) handle_error(1); + // retval = PAPI_hl_region_begin("spmmtemplate"); + // if ( retval != PAPI_OK ) handle_error(1); TOOL_BENCHMARK_TIMER(ref1.compute(statfile, sofile_spmm_template), "\n\nSpMM template Kernel: ", timevalue); - retval = PAPI_hl_region_end("spmmtemplate"); - if ( retval != PAPI_OK ) handle_error(1); + // retval = PAPI_hl_region_end("spmmtemplate"); + // if ( retval != PAPI_OK ) handle_error(1); if (statfile.is_open()) { statfile << "SpMM template time: "; statfile << timevalue.mean << std::endl; } else { std::cout << " stat file is not open\n"; } - retval = PAPI_hl_region_begin("gemm"); - if ( retval != PAPI_OK ) handle_error(1); + // retval = PAPI_hl_region_begin("gemm"); + // if ( retval != PAPI_OK ) handle_error(1); TOOL_BENCHMARK_TIMER(ref2.compute(statfile), "\n\nGeMM Kernel: ", 
timevalue); - retval = PAPI_hl_region_end("gemm"); - if ( retval != PAPI_OK ) handle_error(1); + // retval = PAPI_hl_region_end("gemm"); + // if ( retval != PAPI_OK ) handle_error(1); if (statfile.is_open()) { statfile << "GeMM time: "; statfile << timevalue.mean << std::endl; } else { std::cout << " stat file is not open\n"; } - retval = PAPI_hl_region_begin("gemmtemplate"); - if ( retval != PAPI_OK ) handle_error(1); + // retval = PAPI_hl_region_begin("gemmtemplate"); + // if ( retval != PAPI_OK ) handle_error(1); TOOL_BENCHMARK_TIMER(ref2_2.compute(statfile), "\n\nref GeMM template Kernel: ", timevalue); - retval = PAPI_hl_region_end("gemmtemplate"); - if ( retval != PAPI_OK ) handle_error(1); + // retval = PAPI_hl_region_end("gemmtemplate"); + // if ( retval != PAPI_OK ) handle_error(1); if (statfile.is_open()) { statfile << "ref 2 GeMM template time: "; statfile << timevalue.mean << std::endl; @@ -2420,21 +2434,21 @@ TEST(scheduling_eval, spmmFused) { // std::string sofile_gemm_template = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/spmm_gemm/spmm_template.so"; statfile << "\n--------- 2nd pattern computation GEMM, SpMM\n"; - retval = PAPI_hl_region_begin("gemmtemplate2"); - if ( retval != PAPI_OK ) handle_error(1); + // retval = PAPI_hl_region_begin("gemmtemplate2"); + // if ( retval != PAPI_OK ) handle_error(1); TOOL_BENCHMARK_TIMER(ref3.compute(statfile), "\n\nGeMM template ref3 Kernel: ", timevalue); - retval = PAPI_hl_region_end("gemmtemplate2"); - if ( retval != PAPI_OK ) handle_error(1); + // retval = PAPI_hl_region_end("gemmtemplate2"); + // if ( retval != PAPI_OK ) handle_error(1); if (statfile.is_open()) { statfile << "ref3 GeMM template time: "; statfile << timevalue.mean << std::endl; } else { std::cout << " stat file is not open\n"; } - retval = PAPI_hl_region_begin("spmm2"); - if ( retval != PAPI_OK ) handle_error(1); + // retval = PAPI_hl_region_begin("spmm2"); + // if ( retval != PAPI_OK ) handle_error(1); 
TOOL_BENCHMARK_TIMER(ref4.compute(statfile, sofile_spmm_template), "\n\nSpMM template Kernel ref4: ", timevalue); - retval = PAPI_hl_region_end("spmm2"); - if ( retval != PAPI_OK ) handle_error(1); + // retval = PAPI_hl_region_end("spmm2"); + // if ( retval != PAPI_OK ) handle_error(1); if (statfile.is_open()) { statfile << "SpMM template time ref4: "; statfile << timevalue.mean << std::endl; @@ -2443,32 +2457,32 @@ TEST(scheduling_eval, spmmFused) { statfile << "\n-------- reference pattern computation\n"; - retval = PAPI_hl_region_begin("ref"); - if ( retval != PAPI_OK ) handle_error(1); + // retval = PAPI_hl_region_begin("ref"); + // if ( retval != PAPI_OK ) handle_error(1); TOOL_BENCHMARK_TIMER(ref.compute(statfile), "\n\nReference Kernel: ", timevalue); - retval = PAPI_hl_region_end("ref"); - if ( retval != PAPI_OK ) handle_error(1); + // retval = PAPI_hl_region_end("ref"); + // if ( retval != PAPI_OK ) handle_error(1); if (statfile.is_open()) { statfile << "taco reference time: "; statfile << timevalue << std::endl; } else { std::cout << " stat file is not open\n"; } - retval = PAPI_hl_region_begin("refnew"); - if ( retval != PAPI_OK ) handle_error(1); + // retval = PAPI_hl_region_begin("refnew"); + // if ( retval != PAPI_OK ) handle_error(1); TOOL_BENCHMARK_TIMER(refn.compute(statfile), "\n\nReference new Kernel: ", timevalue); - retval = PAPI_hl_region_end("refnew"); - if ( retval != PAPI_OK ) handle_error(1); + // retval = PAPI_hl_region_end("refnew"); + // if ( retval != PAPI_OK ) handle_error(1); if (statfile.is_open()) { statfile << "taco reference new time: "; statfile << timevalue << std::endl; } else { std::cout << " stat file is not open\n"; } - retval = PAPI_hl_region_begin("sparselnr"); - if ( retval != PAPI_OK ) handle_error(1); + // retval = PAPI_hl_region_begin("sparselnr"); + // if ( retval != PAPI_OK ) handle_error(1); TOOL_BENCHMARK_TIMER(A.compute(statfile), "\n\nFused Kernel: ", timevalue); - retval = PAPI_hl_region_end("sparselnr"); - if 
( retval != PAPI_OK ) handle_error(1); + // retval = PAPI_hl_region_end("sparselnr"); + // if ( retval != PAPI_OK ) handle_error(1); if (statfile.is_open()) { statfile << "fused time: "; statfile << timevalue.mean << std::endl; @@ -2537,25 +2551,25 @@ TEST(scheduling_eval, spmmFused) { // /* Add the native event */ // native = () - retval = PAPI_hl_region_begin("computation1"); - if ( retval != PAPI_OK ) - handle_error(1); + // retval = PAPI_hl_region_begin("computation1"); + // if ( retval != PAPI_OK ) + // handle_error(1); - /* Do some computation */ + // /* Do some computation */ - retval = PAPI_hl_region_end("computation1"); - if ( retval != PAPI_OK ) - handle_error(1); + // retval = PAPI_hl_region_end("computation1"); + // if ( retval != PAPI_OK ) + // handle_error(1); - retval = PAPI_hl_region_begin("computation2"); - if ( retval != PAPI_OK ) - handle_error(1); + // retval = PAPI_hl_region_begin("computation2"); + // if ( retval != PAPI_OK ) + // handle_error(1); - /* Do some computation */ + // /* Do some computation */ - retval = PAPI_hl_region_end("computation2"); - if ( retval != PAPI_OK ) - handle_error(1); + // retval = PAPI_hl_region_end("computation2"); + // if ( retval != PAPI_OK ) + // handle_error(1); } @@ -2588,12 +2602,15 @@ TEST(scheduling_eval, sddmmspmmFused) { int ldim = 64; int mdim = 64; - vector filenums{2, 3,4,5,6,7,8,9,10,12,15}; + // vector filenums{2, 3,4,5,6,7,8,9,10,12,15}; + vector filenums{0}; for (auto filenum : filenums) { std::vector matfiles = { + "/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx", + "/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/amazon.mtx", "/home/min/a/kadhitha/ispc-examples/data/suitesparse/synthetic/synthetic.mtx", "/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx", "/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx", @@ -2614,6 +2631,8 @@ TEST(scheduling_eval, sddmmspmmFused) { 
"/home/min/a/kadhitha/ispc-examples/data/suitesparse/twitter7/twitter7.mtx" }; std::vector matfilesrw = { + "/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/rw/cora.mtx", + "/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/rw/amazon.mtx", "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/synthetic.mtx", "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/cage3.mtx", "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/bcsstk17.mtx",