From 2edd480ba9eed293e0d9c9eab8ee1bb3792286ea Mon Sep 17 00:00:00 2001 From: Adhhitha Dias Date: Mon, 28 Jun 2021 16:30:46 -0400 Subject: [PATCH 01/16] initial commit of ispc codegen files --- src/codegen/codegen_ispc.cpp | 606 +++++++++++++++++++++++++++++++++++ src/codegen/codegen_ispc.h | 63 ++++ 2 files changed, 669 insertions(+) create mode 100644 src/codegen/codegen_ispc.cpp create mode 100644 src/codegen/codegen_ispc.h diff --git a/src/codegen/codegen_ispc.cpp b/src/codegen/codegen_ispc.cpp new file mode 100644 index 000000000..4b0e82903 --- /dev/null +++ b/src/codegen/codegen_ispc.cpp @@ -0,0 +1,606 @@ +#include +#include +#include +#include +#include +#include + +#include "taco/ir/ir_visitor.h" +#include "codegen_ispc.h" +#include "taco/error.h" +#include "taco/util/strings.h" +#include "taco/util/collections.h" + +using namespace std; + +namespace taco { +namespace ir { + +// Some helper functions +namespace { + +// Include stdio.h for printf +// stdlib.h for malloc/realloc +// math.h for sqrt +// MIN preprocessor macro +// This *must* be kept in sync with taco_tensor_t.h +const string cHeaders = + "#ifndef TACO_C_HEADERS\n" + "#define TACO_C_HEADERS\n" + "#include \n" + "#include \n" + "#include \n" + "#include \n" + "#include \n" + "#include \n" + "#include \n" + "#if _OPENMP\n" + "#include \n" + "#endif\n" + "#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b))\n" + "#define TACO_MAX(_a,_b) ((_a) > (_b) ? 
(_a) : (_b))\n" + "#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a)\n" + "#ifndef TACO_TENSOR_T_DEFINED\n" + "#define TACO_TENSOR_T_DEFINED\n" + "typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t;\n" + "typedef struct {\n" + " int32_t order; // tensor order (number of modes)\n" + " int32_t* dimensions; // tensor dimensions\n" + " int32_t csize; // component size\n" + " int32_t* mode_ordering; // mode storage ordering\n" + " taco_mode_t* mode_types; // mode storage types\n" + " uint8_t*** indices; // tensor index data (per mode)\n" + " uint8_t* vals; // tensor values\n" + " int32_t vals_size; // values array size\n" + "} taco_tensor_t;\n" + "#endif\n" + "#if !_OPENMP\n" + "int omp_get_thread_num() { return 0; }\n" + "int omp_get_max_threads() { return 1; }\n" + "#endif\n" + "int cmp(const void *a, const void *b) {\n" + " return *((const int*)a) - *((const int*)b);\n" + "}\n" + "int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) {\n" + " if (array[arrayStart] >= target) {\n" + " return arrayStart;\n" + " }\n" + " int lowerBound = arrayStart; // always < target\n" + " int upperBound = arrayEnd; // always >= target\n" + " while (upperBound - lowerBound > 1) {\n" + " int mid = (upperBound + lowerBound) / 2;\n" + " int midValue = array[mid];\n" + " if (midValue < target) {\n" + " lowerBound = mid;\n" + " }\n" + " else if (midValue > target) {\n" + " upperBound = mid;\n" + " }\n" + " else {\n" + " return mid;\n" + " }\n" + " }\n" + " return upperBound;\n" + "}\n" + "int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) {\n" + " if (array[arrayEnd] <= target) {\n" + " return arrayEnd;\n" + " }\n" + " int lowerBound = arrayStart; // always <= target\n" + " int upperBound = arrayEnd; // always > target\n" + " while (upperBound - lowerBound > 1) {\n" + " int mid = (upperBound + lowerBound) / 2;\n" + " int midValue = array[mid];\n" + " if (midValue < target) {\n" + " lowerBound = mid;\n" + " }\n" + 
" else if (midValue > target) {\n" + " upperBound = mid;\n" + " }\n" + " else {\n" + " return mid;\n" + " }\n" + " }\n" + " return lowerBound;\n" + "}\n" + "taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize,\n" + " int32_t* dimensions, int32_t* mode_ordering,\n" + " taco_mode_t* mode_types) {\n" + " taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t));\n" + " t->order = order;\n" + " t->dimensions = (int32_t *) malloc(order * sizeof(int32_t));\n" + " t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t));\n" + " t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t));\n" + " t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***));\n" + " t->csize = csize;\n" + " for (int32_t i = 0; i < order; i++) {\n" + " t->dimensions[i] = dimensions[i];\n" + " t->mode_ordering[i] = mode_ordering[i];\n" + " t->mode_types[i] = mode_types[i];\n" + " switch (t->mode_types[i]) {\n" + " case taco_mode_dense:\n" + " t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **));\n" + " break;\n" + " case taco_mode_sparse:\n" + " t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **));\n" + " break;\n" + " }\n" + " }\n" + " return t;\n" + "}\n" + "void deinit_taco_tensor_t(taco_tensor_t* t) {\n" + " for (int i = 0; i < t->order; i++) {\n" + " free(t->indices[i]);\n" + " }\n" + " free(t->indices);\n" + " free(t->dimensions);\n" + " free(t->mode_ordering);\n" + " free(t->mode_types);\n" + " free(t);\n" + "}\n" + "#endif\n"; +} // anonymous namespace + +// find variables for generating declarations +// generates a single var for each GetProperty +class CodeGen_ISPC::FindVars : public IRVisitor { +public: + map varMap; + + // the variables for which we need to add declarations + map varDecls; + + vector localVars; + + // this maps from tensor, property, mode, index to the unique var + map, string> canonicalPropertyVar; + + // this is for convenience, recording just the properties unpacked + // from the output tensor so we can re-save 
them at the end + map, string> outputProperties; + + // TODO: should replace this with an unordered set + vector outputTensors; + vector inputTensors; + + CodeGen_ISPC *codeGen; + + // copy inputs and outputs into the map + FindVars(vector inputs, vector outputs, CodeGen_ISPC *codeGen) + : codeGen(codeGen) { + for (auto v: inputs) { + auto var = v.as(); + taco_iassert(var) << "Inputs must be vars in codegen"; + taco_iassert(varMap.count(var)==0) << "Duplicate input found in codegen"; + inputTensors.push_back(v); + varMap[var] = var->name; + } + for (auto v: outputs) { + auto var = v.as(); + taco_iassert(var) << "Outputs must be vars in codegen"; + taco_iassert(varMap.count(var)==0) << "Duplicate output found in codegen"; + outputTensors.push_back(v); + varMap[var] = var->name; + } + } + +protected: + using IRVisitor::visit; + + virtual void visit(const Var *op) { + if (varMap.count(op) == 0) { + varMap[op] = op->is_ptr? op->name : codeGen->genUniqueName(op->name); + } + } + + virtual void visit(const VarDecl *op) { + if (!util::contains(localVars, op->var)) { + localVars.push_back(op->var); + } + op->var.accept(this); + op->rhs.accept(this); + } + + virtual void visit(const For *op) { + if (!util::contains(localVars, op->var)) { + localVars.push_back(op->var); + } + op->var.accept(this); + op->start.accept(this); + op->end.accept(this); + op->increment.accept(this); + op->contents.accept(this); + } + + virtual void visit(const GetProperty *op) { + if (!util::contains(inputTensors, op->tensor) && + !util::contains(outputTensors, op->tensor)) { + // Don't create header unpacking code for temporaries + return; + } + + if (varMap.count(op) == 0) { + auto key = + tuple(op->tensor,op->property, + (size_t)op->mode, + (size_t)op->index); + if (canonicalPropertyVar.count(key) > 0) { + varMap[op] = canonicalPropertyVar[key]; + } else { + auto unique_name = codeGen->genUniqueName(op->name); + canonicalPropertyVar[key] = unique_name; + varMap[op] = unique_name; + varDecls[op] 
= unique_name; + if (util::contains(outputTensors, op->tensor)) { + outputProperties[key] = unique_name; + } + } + } + } +}; + +CodeGen_ISPC::CodeGen_ISPC(std::ostream &dest, OutputKind outputKind, bool simplify) + : CodeGen(dest, false, simplify, C), out(dest), outputKind(outputKind) {} + +CodeGen_ISPC::~CodeGen_ISPC() {} + +void CodeGen_ISPC::compile(Stmt stmt, bool isFirst) { + varMap = {}; + localVars = {}; + + if (isFirst) { + // output the headers + out << cHeaders; + } + out << endl; + // generate code for the Stmt + stmt.accept(this); +} + +void CodeGen_ISPC::visit(const Function* func) { + // if generating a header, protect the function declaration with a guard + if (outputKind == HeaderGen) { + out << "#ifndef TACO_GENERATED_" << func->name << "\n"; + out << "#define TACO_GENERATED_" << func->name << "\n"; + } + + int numYields = countYields(func); + emittingCoroutine = (numYields > 0); + funcName = func->name; + labelCount = 0; + + resetUniqueNameCounters(); + FindVars inputVarFinder(func->inputs, {}, this); + func->body.accept(&inputVarFinder); + FindVars outputVarFinder({}, func->outputs, this); + func->body.accept(&outputVarFinder); + + // output function declaration + doIndent(); + out << printFuncName(func, inputVarFinder.varDecls, outputVarFinder.varDecls); + + // if we're just generating a header, this is all we need to do + if (outputKind == HeaderGen) { + out << ";\n"; + out << "#endif\n"; + return; + } + + out << " {\n"; + + indent++; + + // find all the vars that are not inputs or outputs and declare them + resetUniqueNameCounters(); + FindVars varFinder(func->inputs, func->outputs, this); + func->body.accept(&varFinder); + varMap = varFinder.varMap; + localVars = varFinder.localVars; + + // Print variable declarations + out << printDecls(varFinder.varDecls, func->inputs, func->outputs) << endl; + + if (emittingCoroutine) { + out << printContextDeclAndInit(varMap, localVars, numYields, func->name) + << endl; + } + + // output body + 
print(func->body); + + // output repack only if we allocated memory + if (checkForAlloc(func)) + out << endl << printPack(varFinder.outputProperties, func->outputs); + + if (emittingCoroutine) { + out << printCoroutineFinish(numYields, funcName); + } + + doIndent(); + out << "return 0;\n"; + indent--; + + doIndent(); + out << "}\n"; +} + +void CodeGen_ISPC::visit(const VarDecl* op) { + if (emittingCoroutine) { + doIndent(); + op->var.accept(this); + parentPrecedence = Precedence::TOP; + stream << " = "; + op->rhs.accept(this); + stream << ";"; + stream << endl; + } else { + IRPrinter::visit(op); + } +} + +void CodeGen_ISPC::visit(const Yield* op) { + printYield(op, localVars, varMap, labelCount, funcName); +} + +// For Vars, we replace their names with the generated name, +// since we match by reference (not name) +void CodeGen_ISPC::visit(const Var* op) { + taco_iassert(varMap.count(op) > 0) << + "Var " << op->name << " not found in varMap"; + if (emittingCoroutine) { +// out << "TACO_DEREF("; + } + out << varMap[op]; + if (emittingCoroutine) { +// out << ")"; + } +} + +static string genVectorizePragma(int width) { + stringstream ret; + ret << "#pragma clang loop interleave(enable) "; + if (!width) + ret << "vectorize(enable)"; + else + ret << "vectorize_width(" << width << ")"; + + return ret.str(); +} + +static string getParallelizePragma(LoopKind kind) { + stringstream ret; + ret << "#pragma omp parallel for schedule"; + switch (kind) { + case LoopKind::Static: + ret << "(static, 1)"; + break; + case LoopKind::Dynamic: + ret << "(dynamic, 1)"; + break; + case LoopKind::Runtime: + ret << "(runtime)"; + break; + case LoopKind::Static_Chunked: + ret << "(static)"; + break; + default: + break; + } + return ret.str(); +} + +static string getUnrollPragma(size_t unrollFactor) { + return "#pragma unroll " + std::to_string(unrollFactor); +} + +static string getAtomicPragma() { + return "#pragma omp atomic"; +} + +// The next two need to output the correct pragmas 
depending +// on the loop kind (Serial, Static, Dynamic, Vectorized) +// +// Docs for vectorization pragmas: +// http://clang.llvm.org/docs/LanguageExtensions.html#extensions-for-loop-hint-optimizations +void CodeGen_ISPC::visit(const For* op) { + switch (op->kind) { + case LoopKind::Vectorized: + doIndent(); + out << genVectorizePragma(op->vec_width); + out << "\n"; + break; + case LoopKind::Static: + case LoopKind::Dynamic: + case LoopKind::Runtime: + case LoopKind::Static_Chunked: + doIndent(); + out << getParallelizePragma(op->kind); + out << "\n"; + break; + default: + if (op->unrollFactor > 0) { + doIndent(); + out << getUnrollPragma(op->unrollFactor) << endl; + } + break; + } + + doIndent(); + stream << keywordString("for") << " ("; + if (!emittingCoroutine) { + stream << keywordString(util::toString(op->var.type())) << " "; + } + op->var.accept(this); + stream << " = "; + op->start.accept(this); + stream << keywordString("; "); + op->var.accept(this); + stream << " < "; + parentPrecedence = BOTTOM; + op->end.accept(this); + stream << keywordString("; "); + op->var.accept(this); + + auto lit = op->increment.as(); + if (lit != nullptr && ((lit->type.isInt() && lit->equalsScalar(1)) || + (lit->type.isUInt() && lit->equalsScalar(1)))) { + stream << "++"; + } + else { + stream << " += "; + op->increment.accept(this); + } + stream << ") {\n"; + + op->contents.accept(this); + doIndent(); + stream << "}"; + stream << endl; +} + +void CodeGen_ISPC::visit(const While* op) { + // it's not clear from documentation that clang will vectorize + // while loops + // however, we'll output the pragmas anyway + if (op->kind == LoopKind::Vectorized) { + doIndent(); + out << genVectorizePragma(op->vec_width); + out << "\n"; + } + + IRPrinter::visit(op); +} + +void CodeGen_ISPC::visit(const GetProperty* op) { + taco_iassert(varMap.count(op) > 0) << + "Property " << Expr(op) << " of " << op->tensor << " not found in varMap"; + out << varMap[op]; +} + +void 
CodeGen_ISPC::visit(const Min* op) { + if (op->operands.size() == 1) { + op->operands[0].accept(this); + return; + } + for (size_t i=0; ioperands.size()-1; i++) { + stream << "TACO_MIN("; + op->operands[i].accept(this); + stream << ","; + } + op->operands.back().accept(this); + for (size_t i=0; ioperands.size()-1; i++) { + stream << ")"; + } +} + +void CodeGen_ISPC::visit(const Max* op) { + if (op->operands.size() == 1) { + op->operands[0].accept(this); + return; + } + for (size_t i=0; ioperands.size()-1; i++) { + stream << "TACO_MAX("; + op->operands[i].accept(this); + stream << ","; + } + op->operands.back().accept(this); + for (size_t i=0; ioperands.size()-1; i++) { + stream << ")"; + } +} + +void CodeGen_ISPC::visit(const Allocate* op) { + string elementType = printCType(op->var.type(), false); + + doIndent(); + op->var.accept(this); + stream << " = ("; + stream << elementType << "*"; + stream << ")"; + if (op->is_realloc) { + stream << "realloc("; + op->var.accept(this); + stream << ", "; + } + else { + // If the allocation was requested to clear the allocated memory, + // use calloc instead of malloc. 
+ if (op->clear) { + stream << "calloc(1, "; + } else { + stream << "malloc("; + } + } + stream << "sizeof(" << elementType << ")"; + stream << " * "; + parentPrecedence = MUL; + op->num_elements.accept(this); + parentPrecedence = TOP; + stream << ");"; + stream << endl; +} + +void CodeGen_ISPC::visit(const Sqrt* op) { + taco_tassert(op->type.isFloat() && op->type.getNumBits() == 64) << + "Codegen doesn't currently support non-double sqrt"; + stream << "sqrt("; + op->a.accept(this); + stream << ")"; +} + +void CodeGen_ISPC::visit(const Assign* op) { + if (op->use_atomics) { + doIndent(); + stream << getAtomicPragma() << endl; + } + IRPrinter::visit(op); +} + +void CodeGen_ISPC::visit(const Store* op) { + if (op->use_atomics) { + doIndent(); + stream << getAtomicPragma() << endl; + } + IRPrinter::visit(op); +} + +void CodeGen_ISPC::generateShim(const Stmt& func, stringstream &ret) { + const Function *funcPtr = func.as(); + + ret << "int _shim_" << funcPtr->name << "(void** parameterPack) {\n"; + ret << " return " << funcPtr->name << "("; + + size_t i=0; + string delimiter = ""; + + const auto returnType = funcPtr->getReturnType(); + if (returnType.second != Datatype()) { + ret << "(void**)(parameterPack[0]), "; + ret << "(char*)(parameterPack[1]), "; + ret << "(" << returnType.second << "*)(parameterPack[2]), "; + ret << "(int32_t*)(parameterPack[3])"; + + i = 4; + delimiter = ", "; + } + + for (auto output : funcPtr->outputs) { + auto var = output.as(); + auto cast_type = var->is_tensor ? "taco_tensor_t*" + : printCType(var->type, var->is_ptr); + + ret << delimiter << "(" << cast_type << ")(parameterPack[" << i++ << "])"; + delimiter = ", "; + } + for (auto input : funcPtr->inputs) { + auto var = input.as(); + auto cast_type = var->is_tensor ? 
"taco_tensor_t*" + : printCType(var->type, var->is_ptr); + ret << delimiter << "(" << cast_type << ")(parameterPack[" << i++ << "])"; + delimiter = ", "; + } + ret << ");\n"; + ret << "}\n"; +} +} +} diff --git a/src/codegen/codegen_ispc.h b/src/codegen/codegen_ispc.h new file mode 100644 index 000000000..e3c87ece5 --- /dev/null +++ b/src/codegen/codegen_ispc.h @@ -0,0 +1,63 @@ +#ifndef TACO_BACKEND_C_H +#define TACO_BACKEND_C_H +#include +#include + +#include "taco/ir/ir.h" +#include "taco/ir/ir_printer.h" +#include "codegen.h" + +namespace taco { +namespace ir { + + +class CodeGen_ISPC : public CodeGen { +public: + /// Initialize a code generator that generates code to an + /// output stream. + CodeGen_ISPC(std::ostream &dest, OutputKind outputKind, bool simplify=true); + ~CodeGen_ISPC(); + + /// Compile a lowered function + void compile(Stmt stmt, bool isFirst=false); + + /// Generate shims that unpack an array of pointers representing + /// a mix of taco_tensor_t* and scalars into a function call + static void generateShim(const Stmt& func, std::stringstream &stream); + +protected: + using IRPrinter::visit; + + void visit(const Function*); + void visit(const VarDecl*); + void visit(const Yield*); + void visit(const Var*); + void visit(const For*); + void visit(const While*); + void visit(const GetProperty*); + void visit(const Min*); + void visit(const Max*); + void visit(const Allocate*); + void visit(const Sqrt*); + void visit(const Store*); + void visit(const Assign*); + + std::map varMap; + std::vector localVars; + std::ostream &out; + + OutputKind outputKind; + + std::string funcName; + int labelCount; + bool emittingCoroutine; + + class FindVars; + +private: + virtual std::string restrictKeyword() const { return "restrict"; } +}; + +} // namespace ir +} // namespace taco +#endif From 7d4b8b66415709d996061a6311ea2d6fdba78cf5 Mon Sep 17 00:00:00 2001 From: Adhhitha Dias Date: Mon, 28 Jun 2021 17:36:53 -0400 Subject: [PATCH 02/16] minimal changes to support 
ispc exec --- .gitignore | 3 ++ CMakeLists.txt | 7 ++++ include/taco/cuda.h | 10 +++++ include/taco/version.h.in | 1 + src/codegen/codegen.cpp | 4 ++ src/codegen/codegen_ispc.h | 4 +- src/codegen/module.cpp | 7 ++++ src/cuda.cpp | 11 ++++++ test/tests-scheduling-eval.cpp | 70 +++++++++++++++++++++++++++++++++- tools/taco.cpp | 19 +++++++++ 10 files changed, 132 insertions(+), 4 deletions(-) diff --git a/.gitignore b/.gitignore index 16389f34e..9abc3adc7 100644 --- a/.gitignore +++ b/.gitignore @@ -12,3 +12,6 @@ CMakeCache.txt doc apps/tensor_times_vector/tensor_times_vector + +.cache +compile_commands.json diff --git a/CMakeLists.txt b/CMakeLists.txt index a6a80d9d1..7e9359e01 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -10,10 +10,12 @@ project(taco LANGUAGES C CXX ) option(CUDA "Build for NVIDIA GPU (CUDA must be preinstalled)" OFF) +option(ISPC "Build for Intel ISPC Compiler (ISPC Compiler must be preinstalled)" OFF) option(PYTHON "Build TACO for python environment" OFF) option(OPENMP "Build with OpenMP execution support" OFF) option(COVERAGE "Build with code coverage analysis" OFF) set(TACO_FEATURE_CUDA 0) +set(TACO_FEATURE_ISPC 0) set(TACO_FEATURE_OPENMP 0) set(TACO_FEATURE_PYTHON 0) if(CUDA) @@ -22,6 +24,11 @@ if(CUDA) add_definitions(-DCUDA_BUILT) set(TACO_FEATURE_CUDA 1) endif(CUDA) +if(ISPC) + message("-- Searching for ISPC Installation") + add_definitions(-DISPC_BUILT) + set(TACO_FEATURE_ISPC 1) +endif(ISPC) if(OPENMP) message("-- Will use OpenMP for parallel execution") add_definitions(-DUSE_OPENMP) diff --git a/include/taco/cuda.h b/include/taco/cuda.h index aad6b5229..7ed545c6d 100644 --- a/include/taco/cuda.h +++ b/include/taco/cuda.h @@ -9,7 +9,17 @@ #define CUDA_BUILT false #endif +#ifndef ISPC_BUILT + #define ISPC_BUILT false +#endif + namespace taco { + +/// Functions used by taco to interface with ISPC +bool should_use_ISPC_codegen(); +void set_ISPC_codegen_enabled(bool enabled); + + /// Functions used by taco to interface with CUDA 
(especially unified memory) /// Check if should use CUDA codegen bool should_use_CUDA_codegen(); diff --git a/include/taco/version.h.in b/include/taco/version.h.in index bc5559d7d..8ef507598 100644 --- a/include/taco/version.h.in +++ b/include/taco/version.h.in @@ -20,5 +20,6 @@ #define TACO_FEATURE_OPENMP @TACO_FEATURE_OPENMP@ #define TACO_FEATURE_PYTHON @TACO_FEATURE_PYTHON@ #define TACO_FEATURE_CUDA @TACO_FEATURE_CUDA@ +#define TACO_FEATURE_ISPC @TACO_FEATURE_ISPC@ #endif /* TACO_VERSION_H */ diff --git a/src/codegen/codegen.cpp b/src/codegen/codegen.cpp index f0c09d98a..f57f9950f 100644 --- a/src/codegen/codegen.cpp +++ b/src/codegen/codegen.cpp @@ -2,6 +2,7 @@ #include "taco/cuda.h" #include "codegen_cuda.h" #include "codegen_c.h" +#include "codegen_ispc.h" #include #include @@ -26,6 +27,9 @@ shared_ptr CodeGen::init_default(std::ostream &dest, OutputKind outputK if (should_use_CUDA_codegen()) { return make_shared(dest, outputKind); } + else if (should_use_ISPC_codegen()) { + return make_shared(dest, outputKind); + } else { return make_shared(dest, outputKind); } diff --git a/src/codegen/codegen_ispc.h b/src/codegen/codegen_ispc.h index e3c87ece5..35da5a01b 100644 --- a/src/codegen/codegen_ispc.h +++ b/src/codegen/codegen_ispc.h @@ -1,5 +1,5 @@ -#ifndef TACO_BACKEND_C_H -#define TACO_BACKEND_C_H +#ifndef TACO_BACKEND_ISPC_H +#define TACO_BACKEND_ISPC_H #include #include diff --git a/src/codegen/module.cpp b/src/codegen/module.cpp index bd0f487b1..409ed4a83 100644 --- a/src/codegen/module.cpp +++ b/src/codegen/module.cpp @@ -13,6 +13,7 @@ #include "taco/util/strings.h" #include "taco/util/env.h" #include "codegen/codegen_c.h" +#include "codegen/codegen_ispc.h" #include "codegen/codegen_cuda.h" #include "taco/cuda.h" @@ -89,6 +90,9 @@ void writeShims(vector funcs, string path, string prefix) { if (should_use_CUDA_codegen()) { CodeGen_CUDA::generateShim(func, shims); } + else if (should_use_ISPC_codegen()) { + CodeGen_ISPC::generateShim(func, shims); + } else { 
CodeGen_C::generateShim(func, shims); } @@ -98,6 +102,9 @@ void writeShims(vector funcs, string path, string prefix) { if (should_use_CUDA_codegen()) { shims_file.open(path+prefix+"_shims.cpp"); } + else if (should_use_ISPC_codegen()) { + shims_file.open(path+prefix+".ispc", ios::app); + } else { shims_file.open(path+prefix+".c", ios::app); } diff --git a/src/cuda.cpp b/src/cuda.cpp index 059c60105..85139f874 100644 --- a/src/cuda.cpp +++ b/src/cuda.cpp @@ -7,6 +7,17 @@ using namespace std; namespace taco { + +static bool ISPC_codegen_enabled = ISPC_BUILT; +bool should_use_ISPC_codegen() { + return ISPC_codegen_enabled; +} + +void set_ISPC_codegen_enabled(bool enabled) { + ISPC_codegen_enabled = enabled; +} + + /// Functions used by taco to interface with CUDA (especially unified memory) static bool CUDA_codegen_enabled = CUDA_BUILT; static bool CUDA_unified_memory_enabled = CUDA_BUILT; diff --git a/test/tests-scheduling-eval.cpp b/test/tests-scheduling-eval.cpp index 52bd74ab4..f59359081 100644 --- a/test/tests-scheduling-eval.cpp +++ b/test/tests-scheduling-eval.cpp @@ -1,5 +1,7 @@ +#include #include #include +#include #include #include #include "test.h" @@ -44,6 +46,14 @@ IndexStmt scheduleSpMVCPU(IndexStmt stmt, int CHUNK_SIZE=16) { .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces); } +IndexStmt scheduleSpMVISPC(IndexStmt stmt, int CHUNK_SIZE=16) { + IndexVar i0("i0"), i1("i1"), kpos("kpos"), kpos0("kpos0"), kpos1("kpos1"); + return stmt; + // return stmt.split(i, i0, i1, CHUNK_SIZE) + // .reorder({i0, i1, j}) + // .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces); +} + IndexStmt scheduleSpMMCPU(IndexStmt stmt, Tensor A, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { IndexVar i0("i0"), i1("i1"), kbounded("kbounded"), k0("k0"), k1("k1"), jpos("jpos"), jpos0("jpos0"), jpos1("jpos1"); return stmt.split(i, i0, i1, CHUNK_SIZE) @@ -1463,7 +1473,63 @@ TEST(scheduling_eval, mttkrpGPU) { ASSERT_TENSOR_EQ(expected, A); } 
-TEST(generate_evaluation_files, DISABLED_cpu) { + + +TEST(generate_ispc_evaluation_files, ispc) { + std::cout << "Hi Adhitha!\n" << std::endl ; + set_CUDA_codegen_enabled(false); + set_ISPC_codegen_enabled(true); + + vector> spmv_parameters = {{32}}; + vector> spmspv_parameters = {{8}}; + + // 4 to 512 and 4, 8, 16 + vector> spmm_dcsr_parameters = {{16, 8}}; + vector> spmm_parameters = {{16,4}}; + + vector> mttkrp_parameters = {}; + mttkrp_parameters.push_back({64,0}); + + vector> sddmm_parameters = {{8, 8}}; + vector> ttv_parameters = {{32}}; + + int NUM_I = 100; + int NUM_J = 100; + + string file_ending = ".ispc"; + string file_path = "eval_prepared_ispc/"; + mkdir(file_path.c_str(), 0777); + + // spmv + { + stringstream source; + std::shared_ptr codegen = ir::CodeGen::init_default(source, ir::CodeGen::ImplementationGen); + Tensor A("A", {NUM_I, NUM_J}, CSR); + Tensor x("x", {NUM_J}, {Dense}); + Tensor y("y", {NUM_I}, {Dense}); + y(i) = A(i, j) * x(j); + std::cout << "concretizing the assignment statement\n"; + IndexStmt stmt = y.getAssignment().concretize(); + std::cout << "Printing the original IndexStmt: " << stmt << std::endl; + for (auto paramSet : spmv_parameters) { + std::cout << "param set: " << paramSet[0] << std::endl; + IndexStmt scheduled = scheduleSpMVISPC(stmt, paramSet[0]); + std::cout << "scheduled IndexStmt: " << scheduled << std::endl; + ir::Stmt compute = lower(scheduled, "spmv_csr_ispc_taco", false, true); + std::cout << "computed statement: \n" << compute << std::endl; + codegen->compile(compute, false); + } + ofstream source_file; + source_file.open(file_path + "spmv_csr_ispc_taco.h"); + source_file << source.str(); + source_file.close(); + } + + + return; +} + +TEST(generate_evaluation_files, cpu) { if (should_use_CUDA_codegen()) { return; } @@ -1779,7 +1845,7 @@ TEST(generate_evaluation_files, DISABLED_cpu) { } } -TEST(generate_evaluation_files, DISABLED_gpu) { +TEST(generate_evaluation_files, gpu) { if (!should_use_CUDA_codegen()) { 
return; } diff --git a/tools/taco.cpp b/tools/taco.cpp index cd351a203..ce03b61e1 100644 --- a/tools/taco.cpp +++ b/tools/taco.cpp @@ -20,6 +20,7 @@ #include "taco/lower/lower.h" #include "taco/codegen/module.h" #include "codegen/codegen_c.h" +#include "codegen/codegen_ispc.h" #include "codegen/codegen_cuda.h" #include "codegen/codegen.h" #include "taco/util/strings.h" @@ -188,6 +189,8 @@ static void printUsageInfo() { cout << endl; printFlag("print-nocolor", "Print without colors."); cout << endl; + printFlag("ispc", "Generate ISPC code for Intel CPUs"); + cout << endl; printFlag("cuda", "Generate CUDA code for NVIDIA GPUs"); cout << endl; printFlag("schedule", "Specify parallel execution schedule"); @@ -279,6 +282,8 @@ static void printVersionInfo() { cout << "Built with Python support." << endl; if(TACO_FEATURE_CUDA) cout << "Built with CUDA support." << endl; + if(TACO_FEATURE_ISPC) + cout << "Built with ISPC support." << endl; cout << endl; cout << "Built on: " << TACO_BUILD_DATE << endl; cout << "CMake build type: " << TACO_BUILD_TYPE << endl; @@ -641,6 +646,7 @@ int main(int argc, char* argv[]) { bool color = true; bool readKernels = false; bool cuda = false; + bool ispc = false; bool setSchedule = false; @@ -949,6 +955,10 @@ int main(int argc, char* argv[]) { else if ("-cuda" == argName) { cuda = true; } + else if ("-ispc" == argName) { + std::cout << "ispc true\n"; + ispc = true; + } else if ("-schedule" == argName) { vector descriptor = util::split(argValue, ","); if (descriptor.size() > 2 || descriptor.empty()) { @@ -1129,9 +1139,18 @@ int main(int argc, char* argv[]) { return reportError("TACO must be built for CUDA (cmake -DCUDA=ON ..) to benchmark", 2); } set_CUDA_codegen_enabled(true); + set_ISPC_codegen_enabled(false); + } + else if (ispc) { + if (!ISPC_BUILT && benchmark) { + return reportError("TACO must be built for ISPC (cmake -DISPC=ON .. 
to benchmark", 2); + } + set_CUDA_codegen_enabled(false); + set_ISPC_codegen_enabled(true); } else { set_CUDA_codegen_enabled(false); + set_ISPC_codegen_enabled(false); } stmt = scalarPromote(stmt); From dd693feb9a56c0ab528fb602e0f30c3d014e3648 Mon Sep 17 00:00:00 2001 From: Adhhitha Dias Date: Mon, 12 Jul 2021 14:10:46 -0400 Subject: [PATCH 03/16] separate ispc code to another stream and smaller conversions to match ispc code --- include/taco/cuda.h | 2 + include/taco/ir/ir.h | 2 +- include/taco/ir/ir_printer.h | 3 + include/taco/util/strings.h | 22 + src/codegen/codegen.cpp | 141 ++++- src/codegen/codegen.h | 15 +- src/codegen/codegen_ispc.cpp | 278 ++++++--- src/codegen/codegen_ispc.h | 3 + src/codegen/module.cpp | 13 + src/cuda.cpp | 8 + src/ir/ir_printer.cpp | 824 +++++++++++++++++++------- src/ir/ir_rewriter.cpp | 1 + src/lower/lowerer_impl_imperative.cpp | 69 ++- src/tensor.cpp | 7 + test/tests-scheduling-eval.cpp | 79 ++- tools/taco.cpp | 2 + 16 files changed, 1127 insertions(+), 342 deletions(-) diff --git a/include/taco/cuda.h b/include/taco/cuda.h index 7ed545c6d..9c4a7aae9 100644 --- a/include/taco/cuda.h +++ b/include/taco/cuda.h @@ -18,6 +18,8 @@ namespace taco { /// Functions used by taco to interface with ISPC bool should_use_ISPC_codegen(); void set_ISPC_codegen_enabled(bool enabled); +bool is_ISPC_code_stream_enabled(); +void set_ISPC_code_stream_enabled(bool enabled); /// Functions used by taco to interface with CUDA (especially unified memory) diff --git a/include/taco/ir/ir.h b/include/taco/ir/ir.h index f852f26b1..cb46b5142 100644 --- a/include/taco/ir/ir.h +++ b/include/taco/ir/ir.h @@ -591,7 +591,7 @@ struct Switch : public StmtNode { static const IRNodeType _type_info = IRNodeType::Switch; }; -enum class LoopKind {Serial, Static, Dynamic, Runtime, Vectorized, Static_Chunked}; +enum class LoopKind {Serial, Static, Dynamic, Runtime, Vectorized, Static_Chunked, Foreach}; /** A for loop from start to end by increment. 
* A vectorized loop will require the increment to be 1 and the diff --git a/include/taco/ir/ir_printer.h b/include/taco/ir/ir_printer.h index 4e50764e9..c2c505bf5 100644 --- a/include/taco/ir/ir_printer.h +++ b/include/taco/ir/ir_printer.h @@ -16,6 +16,7 @@ class IRPrinter : public IRVisitorStrict { public: IRPrinter(std::ostream& stream); IRPrinter(std::ostream& stream, bool color, bool simplify); + IRPrinter(std::ostream& stream, std::ostream& stream2, bool color, bool simplify); virtual ~IRPrinter(); void setColor(bool color); @@ -72,6 +73,7 @@ class IRPrinter : public IRVisitorStrict { virtual void visit(const Break*); std::ostream &stream; + std::ostream &stream2; int indent; bool color; bool simplify; @@ -109,6 +111,7 @@ class IRPrinter : public IRVisitorStrict { void doIndent(); void printBinOp(Expr a, Expr b, std::string op, Precedence precedence); bool needsParentheses(Precedence precedence); + void sendToStream(std::stringstream &stream); std::string keywordString(std::string); std::string commentString(std::string); diff --git a/include/taco/util/strings.h b/include/taco/util/strings.h index 5dfb2f174..a3c3d863f 100644 --- a/include/taco/util/strings.h +++ b/include/taco/util/strings.h @@ -1,6 +1,7 @@ #ifndef TACO_UTIL_STRINGS_H #define TACO_UTIL_STRINGS_H +#include "taco/cuda.h" #include #include #include @@ -8,6 +9,8 @@ #include #include +#include "taco/type.h" + // To get the value of a compiler macro variable #define STRINGIFY(x) #x #define TO_STRING(x) STRINGIFY(x) @@ -15,6 +18,25 @@ namespace taco { namespace util { +// /// Turn anything except floating points that can be written to a stream +// /// into a string. 
+// template +// typename std::enable_if::value, std::string>::type +// toStringISPC(const T &val) { + +// std::stringstream sstream; +// if (val == Int32) { +// sstream << "int32"; +// } +// else if (val == Int64) { +// sstream << "int64"; +// } +// else { +// sstream << val; +// } +// return sstream.str(); +// } + /// Turn anything except floating points that can be written to a stream /// into a string. template diff --git a/src/codegen/codegen.cpp b/src/codegen/codegen.cpp index f57f9950f..750f33516 100644 --- a/src/codegen/codegen.cpp +++ b/src/codegen/codegen.cpp @@ -35,6 +35,18 @@ shared_ptr CodeGen::init_default(std::ostream &dest, OutputKind outputK } } +shared_ptr CodeGen::init_default(std::ostream &dest, std::ostream &dest2, OutputKind outputKind) { + if (should_use_CUDA_codegen()) { + return make_shared(dest, outputKind); + } + else if (should_use_ISPC_codegen()) { + return make_shared(dest, dest2, outputKind); + } + else { + return make_shared(dest, outputKind); + } +} + int CodeGen::countYields(const Function *func) { struct CountYields : public IRVisitor { int yields = 0; @@ -233,6 +245,49 @@ string CodeGen::printTensorProperty(string varname, const GetProperty* op, bool return ret.str(); } +string CodeGen::getUnpackedTensorArgument(string varname, const GetProperty* op, + bool is_output_prop) { + stringstream ret; + ret << ""; + + auto tensor = op->tensor.as(); + if (op->property == TensorProperty::Values) { + // for the values, it's in the last slot + ret << "uniform " << printType(tensor->type, false) << " " << varname << "[]"; + return ret.str(); + } else if (op->property == TensorProperty::ValuesSize) { + ret << "int32 " << varname; + return ret.str(); + } + + // for a Dense level, nnz is an int + // for a Fixed level, ptr is an int + // all others are int* + if (op->property == TensorProperty::Dimension) { + if (op->type == Int32) { + ret << "int32 "; + } else if (op->type == Int64) { + ret << "int64 "; + } else { + ret << "int "; + } + ret << 
varname; + + } else { + taco_iassert(op->property == TensorProperty::Indices); + if (op->type == Int32) { + ret << "uniform int32 "; + } else if (op->type == Int64) { + ret << "uniform int64 "; + } else { + ret << "uniform int "; + } + ret << varname << "[]"; + } + + return ret.str(); +} + string CodeGen::unpackTensorProperty(string varname, const GetProperty* op, bool is_output_prop) { stringstream ret; @@ -314,13 +369,9 @@ string CodeGen::pointTensorProperty(std::string varname) { return ret.str(); } -// helper to print declarations -string CodeGen::printDecls(map varMap, - vector inputs, vector outputs) { - stringstream ret; - unordered_set propsAlreadyGenerated; - - vector sortedProps; +void CodeGen::getSortedProps(map &varMap, + vector &sortedProps, vector &inputs, + vector &outputs) { for (auto const& p: varMap) { if (p.first.as()) @@ -359,6 +410,17 @@ string CodeGen::printDecls(map varMap, return a->index < b->index; }); +} + +// helper to print declarations +string CodeGen::printDecls(map varMap, + vector inputs, vector outputs) { + stringstream ret; + unordered_set propsAlreadyGenerated; + + vector sortedProps; + getSortedProps(varMap, sortedProps, inputs, outputs); + for (auto prop: sortedProps) { bool isOutputProp = (find(outputs.begin(), outputs.end(), prop->tensor) != outputs.end()); @@ -379,6 +441,71 @@ string CodeGen::printDecls(map varMap, return ret.str(); } +string CodeGen::printCallISPCFunc(const Function *func, map varMap, + vector &sortedProps) { + std::stringstream ret; + ret << " "; + unordered_set propsAlreadyGenerated; + + ret << "__" << func->name << "("; + + vector inputs = func->inputs; + vector outputs = func->outputs; + getSortedProps(varMap, sortedProps, inputs, outputs); + + for (unsigned long i=0; i < sortedProps.size(); i++) { + ret << varMap[sortedProps[i]]; + if (i != sortedProps.size()-1) { + ret << ", "; + } + propsAlreadyGenerated.insert(varMap[sortedProps[i]]); + } + + ret << ");\n"; + return ret.str(); +} + +string 
CodeGen::printISPCFunc(const Function *func, map varMap, + vector &sortedProps) { + std::stringstream ret; + ret << "export void "; + unordered_set propsAlreadyGenerated; + + ret << "__" << func->name << "("; + + vector inputs = func->inputs; + vector outputs = func->outputs; + // getSortedProps(varMap, sortedProps, inputs, outputs); + + for (unsigned long i=0; i < sortedProps.size(); i++) { + auto prop = sortedProps[i]; + bool isOutputProp = (find(outputs.begin(), outputs.end(), + prop->tensor) != outputs.end()); + + auto var = prop->tensor.as(); + if (var->is_parameter) { + if (isOutputProp) { + ret << " " << printTensorProperty(varMap[prop], prop, false) << ";" << endl; + } else { + break; + } + } else { + ret << getUnpackedTensorArgument(varMap[prop], prop, isOutputProp); + } + propsAlreadyGenerated.insert(varMap[prop]); + + if (i!=sortedProps.size()-1) { + ret << ", "; + } + if (i%2==0) { + ret << "\n\t"; + } + } + ret << ") {\n"; + + return ret.str(); +} + string CodeGen::printPack(map, string> outputProperties, vector outputs) { diff --git a/src/codegen/codegen.h b/src/codegen/codegen.h index cc25c80d6..641239834 100644 --- a/src/codegen/codegen.h +++ b/src/codegen/codegen.h @@ -16,9 +16,13 @@ class CodeGen : public IRPrinter { enum CodeGenType { C, CUDA }; CodeGen(std::ostream& stream, CodeGenType type) : IRPrinter(stream), codeGenType(type) {}; - CodeGen(std::ostream& stream, bool color, bool simplify, CodeGenType type) : IRPrinter(stream, color, simplify), codeGenType(type) {}; + CodeGen(std::ostream& stream, bool color, bool simplify, CodeGenType type) + : IRPrinter(stream, color, simplify), codeGenType(type) {}; + CodeGen(std::ostream& stream, std::ostream& stream2, bool color, bool simplify, CodeGenType type) + : IRPrinter(stream, stream2, color, simplify), codeGenType(type) {}; /// Initialize the default code generator static std::shared_ptr init_default(std::ostream &dest, OutputKind outputKind); + static std::shared_ptr init_default(std::ostream 
&dest, std::ostream &dest2, OutputKind outputKind); /// Compile a lowered function virtual void compile(Stmt stmt, bool isFirst=false) =0; @@ -26,6 +30,9 @@ class CodeGen : public IRPrinter { protected: static bool checkForAlloc(const Function *func); static int countYields(const Function *func); + void getSortedProps(std::map &varMap, + std::vector &sortedProps, std::vector &inputs, + std::vector &outputs); static std::string printCType(Datatype type, bool is_ptr); static std::string printCUDAType(Datatype type, bool is_ptr); @@ -42,6 +49,10 @@ class CodeGen : public IRPrinter { std::string printContextDeclAndInit(std::map varMap, std::vector localVars, int labels, std::string funcName); + std::string printCallISPCFunc(const Function *func, std::map varMap, + std::vector &sortedProps); + std::string printISPCFunc(const Function *func, std::map varMap, + std::vector &sortedProps); std::string printDecls(std::map varMap, std::vector inputs, std::vector outputs); std::string printPack(std::map, @@ -64,6 +75,8 @@ class CodeGen : public IRPrinter { std::string printTensorProperty(std::string varname, const GetProperty* op, bool is_ptr); std::string unpackTensorProperty(std::string varname, const GetProperty* op, bool is_output_prop); + std::string getUnpackedTensorArgument(std::string varname, const GetProperty* op, + bool is_output_prop); std::string packTensorProperty(std::string varname, Expr tnsr, TensorProperty property, int mode, int index); std::string pointTensorProperty(std::string varname); diff --git a/src/codegen/codegen_ispc.cpp b/src/codegen/codegen_ispc.cpp index 4b0e82903..f107728cc 100644 --- a/src/codegen/codegen_ispc.cpp +++ b/src/codegen/codegen_ispc.cpp @@ -5,6 +5,7 @@ #include #include +#include "taco/cuda.h" #include "taco/ir/ir_visitor.h" #include "codegen_ispc.h" #include "taco/error.h" @@ -240,7 +241,10 @@ class CodeGen_ISPC::FindVars : public IRVisitor { }; CodeGen_ISPC::CodeGen_ISPC(std::ostream &dest, OutputKind outputKind, bool simplify) - 
: CodeGen(dest, false, simplify, C), out(dest), outputKind(outputKind) {} + : CodeGen(dest, false, simplify, C), out(dest), out2(dest), outputKind(outputKind) {} + +CodeGen_ISPC::CodeGen_ISPC(std::ostream &dest, std::ostream &dest2, OutputKind outputKind, bool simplify) + : CodeGen(dest, dest2, false, simplify, C), out(dest), out2(dest2), outputKind(outputKind) {} CodeGen_ISPC::~CodeGen_ISPC() {} @@ -254,9 +258,19 @@ void CodeGen_ISPC::compile(Stmt stmt, bool isFirst) { } out << endl; // generate code for the Stmt + std::cout << "Compiling the code\n"; stmt.accept(this); } +void CodeGen_ISPC::sendToStream(std::stringstream &stream) { + if (is_ISPC_code_stream_enabled()) { + this->out2 << stream.str(); + } + else { + this->out << stream.str(); + } +} + void CodeGen_ISPC::visit(const Function* func) { // if generating a header, protect the function declaration with a guard if (outputKind == HeaderGen) { @@ -300,14 +314,14 @@ void CodeGen_ISPC::visit(const Function* func) { // Print variable declarations out << printDecls(varFinder.varDecls, func->inputs, func->outputs) << endl; + vector sortedProps; + out << printCallISPCFunc(func, varFinder.varDecls, sortedProps); + if (emittingCoroutine) { out << printContextDeclAndInit(varMap, localVars, numYields, func->name) << endl; } - // output body - print(func->body); - // output repack only if we allocated memory if (checkForAlloc(func)) out << endl << printPack(varFinder.outputProperties, func->outputs); @@ -321,21 +335,50 @@ void CodeGen_ISPC::visit(const Function* func) { indent--; doIndent(); - out << "}\n"; + out << "}\n\n"; + + set_ISPC_code_stream_enabled(true); + out2 << printISPCFunc(func, varFinder.varDecls, sortedProps); + indent++; + doIndent(); + // output body + print(func->body); + indent--; + out2 << "}\n"; + set_ISPC_code_stream_enabled(false); + } void CodeGen_ISPC::visit(const VarDecl* op) { - if (emittingCoroutine) { - doIndent(); - op->var.accept(this); - parentPrecedence = Precedence::TOP; - stream << 
" = "; - op->rhs.accept(this); - stream << ";"; - stream << endl; - } else { - IRPrinter::visit(op); + // std::stringstream stream; + if (is_ISPC_code_stream_enabled()) { + if (emittingCoroutine) { + doIndent(); + op->var.accept(this); + parentPrecedence = Precedence::TOP; + stream2 << " = "; + op->rhs.accept(this); + stream2 << ";"; + stream2 << endl; + } else { + IRPrinter::visit(op); + } } + else { + if (emittingCoroutine) { + doIndent(); + op->var.accept(this); + parentPrecedence = Precedence::TOP; + stream << " = "; + op->rhs.accept(this); + stream << ";"; + stream << endl; + } else { + IRPrinter::visit(op); + } + } + + // sendToStream(stream); } void CodeGen_ISPC::visit(const Yield* op) { @@ -345,14 +388,27 @@ void CodeGen_ISPC::visit(const Yield* op) { // For Vars, we replace their names with the generated name, // since we match by reference (not name) void CodeGen_ISPC::visit(const Var* op) { - taco_iassert(varMap.count(op) > 0) << - "Var " << op->name << " not found in varMap"; - if (emittingCoroutine) { -// out << "TACO_DEREF("; + if (is_ISPC_code_stream_enabled()) { + taco_iassert(varMap.count(op) > 0) << + "Var " << op->name << " not found in varMap"; + if (emittingCoroutine) { + // out << "TACO_DEREF("; + } + out2 << varMap[op]; + if (emittingCoroutine) { + // out << ")"; + } } - out << varMap[op]; - if (emittingCoroutine) { -// out << ")"; + else { + taco_iassert(varMap.count(op) > 0) << + "Var " << op->name << " not found in varMap"; + if (emittingCoroutine) { + // out << "TACO_DEREF("; + } + out << varMap[op]; + if (emittingCoroutine) { + // out << ")"; + } } } @@ -367,31 +423,31 @@ static string genVectorizePragma(int width) { return ret.str(); } -static string getParallelizePragma(LoopKind kind) { - stringstream ret; - ret << "#pragma omp parallel for schedule"; - switch (kind) { - case LoopKind::Static: - ret << "(static, 1)"; - break; - case LoopKind::Dynamic: - ret << "(dynamic, 1)"; - break; - case LoopKind::Runtime: - ret << "(runtime)"; - 
break; - case LoopKind::Static_Chunked: - ret << "(static)"; - break; - default: - break; - } - return ret.str(); -} - -static string getUnrollPragma(size_t unrollFactor) { - return "#pragma unroll " + std::to_string(unrollFactor); -} +// static string getParallelizePragma(LoopKind kind) { +// stringstream ret; +// ret << "#pragma omp parallel for schedule"; +// switch (kind) { +// case LoopKind::Static: +// ret << "(static, 1)"; +// break; +// case LoopKind::Dynamic: +// ret << "(dynamic, 1)"; +// break; +// case LoopKind::Runtime: +// ret << "(runtime)"; +// break; +// case LoopKind::Static_Chunked: +// ret << "(static)"; +// break; +// default: +// break; +// } +// return ret.str(); +// } + +// static string getUnrollPragma(size_t unrollFactor) { +// return "#pragma unroll " + std::to_string(unrollFactor); +// } static string getAtomicPragma() { return "#pragma omp atomic"; @@ -404,58 +460,75 @@ static string getAtomicPragma() { // http://clang.llvm.org/docs/LanguageExtensions.html#extensions-for-loop-hint-optimizations void CodeGen_ISPC::visit(const For* op) { switch (op->kind) { + // TODO - add ISPC based multi threaded execution handling case LoopKind::Vectorized: - doIndent(); - out << genVectorizePragma(op->vec_width); - out << "\n"; - break; case LoopKind::Static: case LoopKind::Dynamic: case LoopKind::Runtime: case LoopKind::Static_Chunked: - doIndent(); - out << getParallelizePragma(op->kind); - out << "\n"; - break; default: - if (op->unrollFactor > 0) { - doIndent(); - out << getUnrollPragma(op->unrollFactor) << endl; - } break; } doIndent(); - stream << keywordString("for") << " ("; - if (!emittingCoroutine) { - stream << keywordString(util::toString(op->var.type())) << " "; - } - op->var.accept(this); - stream << " = "; - op->start.accept(this); - stream << keywordString("; "); - op->var.accept(this); - stream << " < "; - parentPrecedence = BOTTOM; - op->end.accept(this); - stream << keywordString("; "); - op->var.accept(this); - auto lit = 
op->increment.as(); - if (lit != nullptr && ((lit->type.isInt() && lit->equalsScalar(1)) || - (lit->type.isUInt() && lit->equalsScalar(1)))) { - stream << "++"; - } - else { - stream << " += "; - op->increment.accept(this); + if (op->kind == LoopKind::Foreach) { + stream2 << keywordString("foreach") << " ("; + // if (!emittingCoroutine) { + // if (op->var.type() == Int32) { + // stream << "int32 "; + // } + // else if (op->var.type() == Int64) { + // stream << "int64 "; + // } + + // } + op->var.accept(this); + stream2 << " = "; + op->start.accept(this); + stream2 << keywordString(" ... "); + op->end.accept(this); + stream2 << ") {\n"; + + } else { + stream2 << keywordString("for") << " ("; + if (!emittingCoroutine) { + if (op->var.type() == Int32) { + stream2 << "int32 "; + } + else if (op->var.type() == Int64) { + stream2 << "int64 "; + } + + } + op->var.accept(this); + stream2 << " = "; + op->start.accept(this); + stream2 << keywordString("; "); + op->var.accept(this); + stream2 << " < "; + parentPrecedence = BOTTOM; + op->end.accept(this); + stream2 << keywordString("; "); + op->var.accept(this); + + auto lit = op->increment.as(); + if (lit != nullptr && ((lit->type.isInt() && lit->equalsScalar(1)) || + (lit->type.isUInt() && lit->equalsScalar(1)))) { + stream2 << "++"; + } + else { + stream2 << " += "; + op->increment.accept(this); + } + stream2 << ") {\n"; } - stream << ") {\n"; op->contents.accept(this); doIndent(); - stream << "}"; - stream << endl; + stream2 << "}"; + stream2 << endl; + } void CodeGen_ISPC::visit(const While* op) { @@ -474,7 +547,13 @@ void CodeGen_ISPC::visit(const While* op) { void CodeGen_ISPC::visit(const GetProperty* op) { taco_iassert(varMap.count(op) > 0) << "Property " << Expr(op) << " of " << op->tensor << " not found in varMap"; - out << varMap[op]; + if (is_ISPC_code_stream_enabled()) { + out2 << varMap[op]; + } + else { + out << varMap[op]; + } + } void CodeGen_ISPC::visit(const Min* op) { @@ -549,17 +628,34 @@ void 
CodeGen_ISPC::visit(const Sqrt* op) { } void CodeGen_ISPC::visit(const Assign* op) { - if (op->use_atomics) { - doIndent(); - stream << getAtomicPragma() << endl; + if (is_ISPC_code_stream_enabled()) { + if (op->use_atomics) { + doIndent(); + stream2 << getAtomicPragma() << endl; + } + } + else { + if (op->use_atomics) { + doIndent(); + stream << getAtomicPragma() << endl; + } } + IRPrinter::visit(op); } void CodeGen_ISPC::visit(const Store* op) { - if (op->use_atomics) { - doIndent(); - stream << getAtomicPragma() << endl; + if (is_ISPC_code_stream_enabled()) { + if (op->use_atomics) { + doIndent(); + stream2 << getAtomicPragma() << endl; + } + } + else { + if (op->use_atomics) { + doIndent(); + stream << getAtomicPragma() << endl; + } } IRPrinter::visit(op); } diff --git a/src/codegen/codegen_ispc.h b/src/codegen/codegen_ispc.h index 35da5a01b..8abd1cc09 100644 --- a/src/codegen/codegen_ispc.h +++ b/src/codegen/codegen_ispc.h @@ -16,6 +16,7 @@ class CodeGen_ISPC : public CodeGen { /// Initialize a code generator that generates code to an /// output stream. 
CodeGen_ISPC(std::ostream &dest, OutputKind outputKind, bool simplify=true); + CodeGen_ISPC(std::ostream &dest, std::ostream &dest2, OutputKind outputKind, bool simplify=true); ~CodeGen_ISPC(); /// Compile a lowered function @@ -45,6 +46,7 @@ class CodeGen_ISPC : public CodeGen { std::map varMap; std::vector localVars; std::ostream &out; + std::ostream &out2; OutputKind outputKind; @@ -56,6 +58,7 @@ class CodeGen_ISPC : public CodeGen { private: virtual std::string restrictKeyword() const { return "restrict"; } + void sendToStream(std::stringstream &stream); }; } // namespace ir diff --git a/src/codegen/module.cpp b/src/codegen/module.cpp index 409ed4a83..d9cbe2edc 100644 --- a/src/codegen/module.cpp +++ b/src/codegen/module.cpp @@ -116,6 +116,7 @@ void writeShims(vector funcs, string path, string prefix) { } // anonymous namespace string Module::compile() { + std::cout << "Module::compile\n"; string prefix = tmpdir+libname; string fullpath = prefix + ".so"; @@ -130,6 +131,12 @@ string Module::compile() { file_ending = ".cu"; shims_file = prefix + "_shims.cpp"; } + else if (should_use_ISPC_codegen()) { + cc = util::getFromEnv(target.compiler_env, target.compiler); + cflags = util::getFromEnv("TACO_CFLAGS", + "-O3 -ffast-math -std=c99") + " -shared -fPIC"; + + } else { cc = util::getFromEnv(target.compiler_env, target.compiler); cflags = util::getFromEnv("TACO_CFLAGS", @@ -150,6 +157,12 @@ string Module::compile() { // write out the shims writeShims(funcs, tmpdir, libname); + for (auto &statement : funcs) { + std::cout << "----- statement --------" << std::endl; + std::cout << statement; + std::cout << std::endl; + } + std::cout << tmpdir << std::endl << libname << std::endl; // now compile it int err = system(cmd.data()); diff --git a/src/cuda.cpp b/src/cuda.cpp index 85139f874..68e49fe98 100644 --- a/src/cuda.cpp +++ b/src/cuda.cpp @@ -9,14 +9,22 @@ using namespace std; namespace taco { static bool ISPC_codegen_enabled = ISPC_BUILT; +static bool 
ISPC_code_stream_enabled = false; bool should_use_ISPC_codegen() { return ISPC_codegen_enabled; } +bool is_ISPC_code_stream_enabled() { + return ISPC_code_stream_enabled; +} + void set_ISPC_codegen_enabled(bool enabled) { ISPC_codegen_enabled = enabled; } +void set_ISPC_code_stream_enabled(bool enabled) { + ISPC_code_stream_enabled = enabled; +} /// Functions used by taco to interface with CUDA (especially unified memory) static bool CUDA_codegen_enabled = CUDA_BUILT; diff --git a/src/ir/ir_printer.cpp b/src/ir/ir_printer.cpp index a1997a9b7..f96251c5a 100644 --- a/src/ir/ir_printer.cpp +++ b/src/ir/ir_printer.cpp @@ -1,6 +1,7 @@ #include #include +#include "taco/cuda.h" #include "taco/ir/ir.h" #include "taco/ir/ir_printer.h" #include "taco/ir/simplify.h" @@ -34,7 +35,11 @@ IRPrinter::IRPrinter(ostream &s) : IRPrinter(s, false, false) { } IRPrinter::IRPrinter(ostream &s, bool color, bool simplify) - : stream(s), indent(0), color(color), simplify(simplify) { + : stream(s), stream2(s), indent(0), color(color), simplify(simplify) { +} + +IRPrinter::IRPrinter(ostream &s, ostream &s2, bool color, bool simplify) + : stream(s), stream2(s2), indent(0), color(color), simplify(simplify) { } IRPrinter::~IRPrinter() { @@ -59,79 +64,169 @@ void IRPrinter::print(Stmt stmt) { } void IRPrinter::visit(const Literal* op) { - if (color) { - stream << blue ; - } - - switch (op->type.getKind()) { - case Datatype::Bool: - stream << op->getValue(); - break; - case Datatype::UInt8: - stream << static_cast(op->getValue()); - break; - case Datatype::UInt16: - stream << op->getValue(); - break; - case Datatype::UInt32: - stream << op->getValue(); - break; - case Datatype::UInt64: - stream << op->getValue(); - break; - case Datatype::UInt128: - taco_not_supported_yet; - break; - case Datatype::Int8: - stream << static_cast(op->getValue()); - break; - case Datatype::Int16: - stream << op->getValue(); - break; - case Datatype::Int32: - stream << op->getValue(); - break; - case Datatype::Int64: 
- stream << op->getValue(); - break; - case Datatype::Int128: - taco_not_supported_yet; - break; - case Datatype::Float32: - stream << ((op->getValue() != 0.0) - ? util::toString(op->getValue()) : "0.0"); - break; - case Datatype::Float64: - stream << ((op->getValue()!=0.0) - ? util::toString(op->getValue()) : "0.0"); - break; - case Datatype::Complex64: { - std::complex val = op->getValue>(); - stream << val.real() << " + I*" << val.imag(); - } - break; - case Datatype::Complex128: { - std::complex val = op->getValue>(); - stream << val.real() << " + I*" << val.imag(); - } - break; - case Datatype::Undefined: - taco_ierror << "Undefined type in IR"; - break; - } + if (is_ISPC_code_stream_enabled()) { + if (color) { + stream2 << blue ; + } - if (color) { - stream << nc; + // It seems this is where all the types get printed in the final code generation. + // Come up with a way to generate different values if stream2 is used to generate ispc code + switch (op->type.getKind()) { + case Datatype::Bool: + stream2 << op->getValue(); + break; + case Datatype::UInt8: + stream2 << static_cast(op->getValue()); + break; + case Datatype::UInt16: + stream2 << op->getValue(); + break; + case Datatype::UInt32: + stream2 << op->getValue(); + break; + case Datatype::UInt64: + stream2 << op->getValue(); + break; + case Datatype::UInt128: + taco_not_supported_yet; + break; + case Datatype::Int8: + stream2 << static_cast(op->getValue()); + break; + case Datatype::Int16: + stream2 << op->getValue(); + break; + case Datatype::Int32: + stream2 << op->getValue(); + break; + case Datatype::Int64: + stream2 << op->getValue(); + break; + case Datatype::Int128: + taco_not_supported_yet; + break; + case Datatype::Float32: + stream2 << ((op->getValue() != 0.0) + ? util::toString(op->getValue()) : "0.0"); + break; + case Datatype::Float64: + stream2 << ((op->getValue()!=0.0) + ? 
util::toString(op->getValue()) : "0.0"); + break; + case Datatype::Complex64: { + std::complex val = op->getValue>(); + stream2 << val.real() << " + I*" << val.imag(); + } + break; + case Datatype::Complex128: { + std::complex val = op->getValue>(); + stream2 << val.real() << " + I*" << val.imag(); + } + break; + case Datatype::Undefined: + taco_ierror << "Undefined type in IR"; + break; + } + + if (color) { + stream2 << nc; + } + } + + + + else { + + if (color) { + stream << blue ; + } + + // It seems this is where all the types get printed in the final code generation. + // Come up with a way to generate different values if stream2 is used to generate ispc code + switch (op->type.getKind()) { + case Datatype::Bool: + stream << op->getValue(); + break; + case Datatype::UInt8: + stream << static_cast(op->getValue()); + break; + case Datatype::UInt16: + stream << op->getValue(); + break; + case Datatype::UInt32: + stream << op->getValue(); + break; + case Datatype::UInt64: + stream << op->getValue(); + break; + case Datatype::UInt128: + taco_not_supported_yet; + break; + case Datatype::Int8: + stream << static_cast(op->getValue()); + break; + case Datatype::Int16: + stream << op->getValue(); + break; + case Datatype::Int32: + stream << op->getValue(); + break; + case Datatype::Int64: + stream << op->getValue(); + break; + case Datatype::Int128: + taco_not_supported_yet; + break; + case Datatype::Float32: + stream << ((op->getValue() != 0.0) + ? util::toString(op->getValue()) : "0.0"); + break; + case Datatype::Float64: + stream << ((op->getValue()!=0.0) + ? 
util::toString(op->getValue()) : "0.0"); + break; + case Datatype::Complex64: { + std::complex val = op->getValue>(); + stream << val.real() << " + I*" << val.imag(); + } + break; + case Datatype::Complex128: { + std::complex val = op->getValue>(); + stream << val.real() << " + I*" << val.imag(); + } + break; + case Datatype::Undefined: + taco_ierror << "Undefined type in IR"; + break; + } + + if (color) { + stream << nc; + } + + } + } void IRPrinter::visit(const Var* op) { - if (varNames.contains(op)) { - stream << varNames.get(op); + if (is_ISPC_code_stream_enabled()) { + if (varNames.contains(op)) { + stream2 << varNames.get(op); + } + else { + stream2 << op->name; + } } else { - stream << op->name; + if (varNames.contains(op)) { + stream << varNames.get(op); + } + else { + stream << op->name; + } } + } void IRPrinter::visit(const Neg* op) { @@ -248,41 +343,83 @@ void IRPrinter::visit(const IfThenElse* op) { taco_iassert(op->cond.defined()); taco_iassert(op->then.defined()); doIndent(); - stream << keywordString("if "); - stream << "("; - parentPrecedence = Precedence::TOP; - op->cond.accept(this); - stream << ")"; + if (is_ISPC_code_stream_enabled()) { + stream2 << keywordString("if "); + stream2 << "("; + parentPrecedence = Precedence::TOP; + op->cond.accept(this); + stream2 << ")"; + + Stmt scopedStmt = Stmt(to(op->then)->scopedStmt); + if (isa(scopedStmt)) { + stream2 << " {" << endl; + op->then.accept(this); + doIndent(); + stream2 << "}"; + } + else if (isa(scopedStmt)) { + int tmp = indent; + indent = 0; + stream2 << " "; + scopedStmt.accept(this); + indent = tmp; + } + else { + stream2 << endl; + op->then.accept(this); + } - Stmt scopedStmt = Stmt(to(op->then)->scopedStmt); - if (isa(scopedStmt)) { - stream << " {" << endl; - op->then.accept(this); - doIndent(); - stream << "}"; - } - else if (isa(scopedStmt)) { - int tmp = indent; - indent = 0; - stream << " "; - scopedStmt.accept(this); - indent = tmp; + if (op->otherwise.defined()) { + stream2 << 
"\n"; + doIndent(); + stream2 << keywordString("else"); + stream2 << " {\n"; + op->otherwise.accept(this); + doIndent(); + stream2 << "}"; + } + stream2 << endl; } + + else { - stream << endl; - op->then.accept(this); - } + stream << keywordString("if "); + stream << "("; + parentPrecedence = Precedence::TOP; + op->cond.accept(this); + stream << ")"; - if (op->otherwise.defined()) { - stream << "\n"; - doIndent(); - stream << keywordString("else"); - stream << " {\n"; - op->otherwise.accept(this); - doIndent(); - stream << "}"; + Stmt scopedStmt = Stmt(to(op->then)->scopedStmt); + if (isa(scopedStmt)) { + stream << " {" << endl; + op->then.accept(this); + doIndent(); + stream << "}"; + } + else if (isa(scopedStmt)) { + int tmp = indent; + indent = 0; + stream << " "; + scopedStmt.accept(this); + indent = tmp; + } + else { + stream << endl; + op->then.accept(this); + } + + if (op->otherwise.defined()) { + stream << "\n"; + doIndent(); + stream << keywordString("else"); + stream << " {\n"; + op->otherwise.accept(this); + doIndent(); + stream << "}"; + } + stream << endl; } - stream << endl; + } void IRPrinter::visit(const Case* op) { @@ -345,12 +482,22 @@ void IRPrinter::visit(const Switch* op) { } void IRPrinter::visit(const Load* op) { - parentPrecedence = Precedence::LOAD; - op->arr.accept(this); - stream << "["; - parentPrecedence = Precedence::LOAD; - op->loc.accept(this); - stream << "]"; + if (is_ISPC_code_stream_enabled()) { + parentPrecedence = Precedence::LOAD; + op->arr.accept(this); + stream2 << "["; + parentPrecedence = Precedence::LOAD; + op->loc.accept(this); + stream2 << "]"; + } + else { + parentPrecedence = Precedence::LOAD; + op->arr.accept(this); + stream << "["; + parentPrecedence = Precedence::LOAD; + op->loc.accept(this); + stream << "]"; + } } void IRPrinter::visit(const Malloc* op) { @@ -367,66 +514,149 @@ void IRPrinter::visit(const Sizeof* op) { } void IRPrinter::visit(const Store* op) { - doIndent(); - op->arr.accept(this); - stream << 
"["; - parentPrecedence = Precedence::TOP; - op->loc.accept(this); - stream << "] = "; - parentPrecedence = Precedence::TOP; - op->data.accept(this); - stream << ";"; - stream << endl; + if (is_ISPC_code_stream_enabled()) { + doIndent(); + op->arr.accept(this); + stream2 << "["; + parentPrecedence = Precedence::TOP; + op->loc.accept(this); + stream2 << "] = "; + parentPrecedence = Precedence::TOP; + op->data.accept(this); + stream2 << ";"; + stream2 << endl; + } + else { + doIndent(); + op->arr.accept(this); + stream << "["; + parentPrecedence = Precedence::TOP; + op->loc.accept(this); + stream << "] = "; + parentPrecedence = Precedence::TOP; + op->data.accept(this); + stream << ";"; + stream << endl; + } + } void IRPrinter::visit(const For* op) { - doIndent(); - stream << keywordString("for") << " (" - << keywordString(util::toString(op->var.type())) << " "; - op->var.accept(this); - stream << " = "; - op->start.accept(this); - stream << keywordString("; "); - op->var.accept(this); - stream << " < "; - parentPrecedence = BOTTOM; - op->end.accept(this); - stream << keywordString("; "); - op->var.accept(this); + std::cout << "This is IRPrinter::visit For op method\n"; + if (is_ISPC_code_stream_enabled()) { + doIndent(); + stream2 << keywordString("for") << " (" + << keywordString(util::toString(op->var.type())) << " "; + op->var.accept(this); + stream2 << " = "; + op->start.accept(this); + stream2 << keywordString("; "); + op->var.accept(this); + stream2 << " < "; + parentPrecedence = BOTTOM; + op->end.accept(this); + stream2 << keywordString("; "); + op->var.accept(this); + + auto lit = op->increment.as(); + if (lit != nullptr && ((lit->type.isInt() && lit->equalsScalar(1)) || + (lit->type.isUInt() && lit->equalsScalar(1)))) { + stream2 << "++"; + } + else { + stream2 << " += "; + op->increment.accept(this); + } + stream2 << ") {\n"; - auto lit = op->increment.as(); - if (lit != nullptr && ((lit->type.isInt() && lit->equalsScalar(1)) || - (lit->type.isUInt() && 
lit->equalsScalar(1)))) { - stream << "++"; + op->contents.accept(this); + doIndent(); + stream2 << "}"; + stream2 << endl; } + + else { - stream << " += "; - op->increment.accept(this); + doIndent(); + stream << keywordString("for") << " (" + << keywordString(util::toString(op->var.type())) << " "; + op->var.accept(this); + stream << " = "; + op->start.accept(this); + stream << keywordString("; "); + op->var.accept(this); + stream << " < "; + parentPrecedence = BOTTOM; + op->end.accept(this); + stream << keywordString("; "); + op->var.accept(this); + + auto lit = op->increment.as(); + if (lit != nullptr && ((lit->type.isInt() && lit->equalsScalar(1)) || + (lit->type.isUInt() && lit->equalsScalar(1)))) { + stream << "++"; + } + else { + stream << " += "; + op->increment.accept(this); + } + stream << ") {\n"; + + op->contents.accept(this); + doIndent(); + stream << "}"; + stream << endl; } - stream << ") {\n"; - op->contents.accept(this); - doIndent(); - stream << "}"; - stream << endl; +} + +void IRPrinter::sendToStream(std::stringstream &stream) { + if (is_ISPC_code_stream_enabled()) { + this->stream2 << stream.str(); + } + else { + this->stream << stream.str(); + } } void IRPrinter::visit(const While* op) { - doIndent(); - stream << keywordString("while "); - stream << "("; - parentPrecedence = Precedence::TOP; - op->cond.accept(this); - stream << ")"; - stream << " {\n"; - op->contents.accept(this); - doIndent(); - stream << "}"; - stream << endl; + // std::stringstream stream; + if (is_ISPC_code_stream_enabled()) { + doIndent(); + stream2 << keywordString("while "); + stream2 << "("; + parentPrecedence = Precedence::TOP; + op->cond.accept(this); + stream2 << ")"; + stream2 << " {\n"; + op->contents.accept(this); + doIndent(); + stream2 << "}"; + stream2 << endl; + } + else { + doIndent(); + stream << keywordString("while "); + stream << "("; + parentPrecedence = Precedence::TOP; + op->cond.accept(this); + stream << ")"; + stream << " {\n"; + 
op->contents.accept(this); + doIndent(); + stream << "}"; + stream << endl; + } + // sendToStream(stream); } void IRPrinter::visit(const Block* op) { - acceptJoin(this, stream, op->contents, ""); + if (is_ISPC_code_stream_enabled()) { + acceptJoin(this, stream2, op->contents, ""); + } + else { + acceptJoin(this, stream, op->contents, ""); + } } void IRPrinter::visit(const Scope* op) { @@ -438,85 +668,183 @@ void IRPrinter::visit(const Scope* op) { } void IRPrinter::visit(const Function* op) { - stream << keywordString("void ") << op->name; - stream << "("; - if (op->outputs.size() > 0) stream << "Tensor "; - acceptJoin(this, stream, op->outputs, ", Tensor "); - if (op->outputs.size() > 0 && op->inputs.size()) stream << ", "; - if (op->inputs.size() > 0) stream << "Tensor "; - acceptJoin(this, stream, op->inputs, ", Tensor "); - stream << ") {" << endl; + if (is_ISPC_code_stream_enabled()) { + stream2 << keywordString("void ") << op->name; + stream2 << "("; + if (op->outputs.size() > 0) stream2 << "Tensor "; + acceptJoin(this, stream2, op->outputs, ", Tensor "); + if (op->outputs.size() > 0 && op->inputs.size()) stream2 << ", "; + if (op->inputs.size() > 0) stream2 << "Tensor "; + acceptJoin(this, stream2, op->inputs, ", Tensor "); + stream2 << ") {" << endl; + + resetNameCounters(); + op->body.accept(this); + + doIndent(); + stream2 << "}"; + } + else { + stream << keywordString("void ") << op->name; + stream << "("; + if (op->outputs.size() > 0) stream << "Tensor "; + acceptJoin(this, stream, op->outputs, ", Tensor "); + if (op->outputs.size() > 0 && op->inputs.size()) stream << ", "; + if (op->inputs.size() > 0) stream << "Tensor "; + acceptJoin(this, stream, op->inputs, ", Tensor "); + stream << ") {" << endl; - resetNameCounters(); - op->body.accept(this); + resetNameCounters(); + op->body.accept(this); + + doIndent(); + stream << "}"; + } - doIndent(); - stream << "}"; } void IRPrinter::visit(const VarDecl* op) { - doIndent(); - stream << 
keywordString(util::toString(op->var.type())); - taco_iassert(isa(op->var)); - if (to(op->var)->is_ptr) { - stream << "* restrict"; - } - stream << " "; - string varName = varNameGenerator.getUniqueName(util::toString(op->var)); - varNames.insert({op->var, varName}); - op->var.accept(this); - parentPrecedence = Precedence::TOP; - stream << " = "; - op->rhs.accept(this); - stream << ";"; - stream << endl; + if (is_ISPC_code_stream_enabled()) { + doIndent(); + if (op->var.type() == Int32) { + stream2 << keywordString("int32"); + } + else if (op->var.type() == Int64) { + stream2 << keywordString("int64"); + } else { + stream2 << keywordString(util::toString(op->var.type())); + } + taco_iassert(isa(op->var)); + if (to(op->var)->is_ptr) { + stream2 << "* restrict"; + } + stream2 << " "; + string varName = varNameGenerator.getUniqueName(util::toString(op->var)); + varNames.insert({op->var, varName}); + op->var.accept(this); + parentPrecedence = Precedence::TOP; + stream2 << " = "; + op->rhs.accept(this); + stream2 << ";"; + stream2 << endl; + } + else { + doIndent(); + stream << keywordString(util::toString(op->var.type())); + taco_iassert(isa(op->var)); + if (to(op->var)->is_ptr) { + stream << "* restrict"; + } + stream << " "; + string varName = varNameGenerator.getUniqueName(util::toString(op->var)); + varNames.insert({op->var, varName}); + op->var.accept(this); + parentPrecedence = Precedence::TOP; + stream << " = "; + op->rhs.accept(this); + stream << ";"; + stream << endl; + } + } void IRPrinter::visit(const Assign* op) { - doIndent(); - op->lhs.accept(this); - parentPrecedence = Precedence::TOP; - bool printed = false; - if (simplify) { - if (isa(op->rhs)) { - auto add = to(op->rhs); - if (add->a == op->lhs) { - const Literal* lit = add->b.as(); - if (lit != nullptr && ((lit->type.isInt() && lit->equalsScalar(1)) || - (lit->type.isUInt() && lit->equalsScalar(1)))) { - stream << "++"; + if (is_ISPC_code_stream_enabled()) { + doIndent(); + op->lhs.accept(this); + 
parentPrecedence = Precedence::TOP; + bool printed = false; + if (simplify) { + if (isa(op->rhs)) { + auto add = to(op->rhs); + if (add->a == op->lhs) { + const Literal* lit = add->b.as(); + if (lit != nullptr && ((lit->type.isInt() && lit->equalsScalar(1)) || + (lit->type.isUInt() && lit->equalsScalar(1)))) { + stream2 << "++"; + } + else { + stream2 << " += "; + add->b.accept(this); + } + printed = true; } - else { - stream << " += "; - add->b.accept(this); + } + else if (isa(op->rhs)) { + auto mul = to(op->rhs); + if (mul->a == op->lhs) { + stream2 << " *= "; + mul->b.accept(this); + printed = true; } - printed = true; } - } - else if (isa(op->rhs)) { - auto mul = to(op->rhs); - if (mul->a == op->lhs) { - stream << " *= "; - mul->b.accept(this); - printed = true; + else if (isa(op->rhs)) { + auto bitOr = to(op->rhs); + if (bitOr->a == op->lhs) { + stream2 << " |= "; + bitOr->b.accept(this); + printed = true; + } } } - else if (isa(op->rhs)) { - auto bitOr = to(op->rhs); - if (bitOr->a == op->lhs) { - stream << " |= "; - bitOr->b.accept(this); - printed = true; - } + if (!printed) { + stream2 << " = "; + op->rhs.accept(this); } + + stream2 << ";"; + stream2 << endl; } - if (!printed) { - stream << " = "; - op->rhs.accept(this); + + + + else { + doIndent(); + op->lhs.accept(this); + parentPrecedence = Precedence::TOP; + bool printed = false; + if (simplify) { + if (isa(op->rhs)) { + auto add = to(op->rhs); + if (add->a == op->lhs) { + const Literal* lit = add->b.as(); + if (lit != nullptr && ((lit->type.isInt() && lit->equalsScalar(1)) || + (lit->type.isUInt() && lit->equalsScalar(1)))) { + stream << "++"; + } + else { + stream << " += "; + add->b.accept(this); + } + printed = true; + } + } + else if (isa(op->rhs)) { + auto mul = to(op->rhs); + if (mul->a == op->lhs) { + stream << " *= "; + mul->b.accept(this); + printed = true; + } + } + else if (isa(op->rhs)) { + auto bitOr = to(op->rhs); + if (bitOr->a == op->lhs) { + stream << " |= "; + bitOr->b.accept(this); 
+ printed = true; + } + } + } + if (!printed) { + stream << " = "; + op->rhs.accept(this); + } + + stream << ";"; + stream << endl; } - stream << ";"; - stream << endl; } void IRPrinter::visit(const Yield* op) { @@ -559,17 +887,32 @@ void IRPrinter::visit(const Comment* op) { } void IRPrinter::visit(const BlankLine*) { - stream << endl; + if (is_ISPC_code_stream_enabled()) { + stream2 << endl; + } + else { + stream << endl; + } } void IRPrinter::visit(const Continue*) { doIndent(); - stream << "continue;" << endl; + if (!is_ISPC_code_stream_enabled()) { + stream << "continue;" << endl; + } + else { + stream2 << "continue;" << endl; + } } void IRPrinter::visit(const Break*) { doIndent(); - stream << "break;" << endl; + if (!is_ISPC_code_stream_enabled()) { + stream << "break;" << endl; + } + else { + stream2 << "break;" << endl; + } } void IRPrinter::visit(const Print* op) { @@ -585,7 +928,12 @@ void IRPrinter::visit(const Print* op) { } void IRPrinter::visit(const GetProperty* op) { - stream << op->name; + if (is_ISPC_code_stream_enabled()) { + stream2 << op->name; + } + else { + stream << op->name; + } } void IRPrinter::visit(const Sort* op) { @@ -643,23 +991,47 @@ void IRPrinter::resetNameCounters() { } void IRPrinter::doIndent() { - for (int i=0; ivar); Expr start = rewrite(op->start); Expr end = rewrite(op->end); diff --git a/src/lower/lowerer_impl_imperative.cpp b/src/lower/lowerer_impl_imperative.cpp index b4c9ea710..53ffd936f 100644 --- a/src/lower/lowerer_impl_imperative.cpp +++ b/src/lower/lowerer_impl_imperative.cpp @@ -1,4 +1,5 @@ #include +#include "taco/cuda.h" #include "taco/lower/lowerer_impl_imperative.h" #include "taco/lower/lowerer_impl.h" @@ -26,6 +27,7 @@ class LowererImplImperative::Visitor : public IndexNotationVisitorStrict { public: Visitor(LowererImplImperative* impl) : impl(impl) {} Stmt lower(IndexStmt stmt) { + std::cout << "lowering IndexStmt to ir:Stmt - IndexStmt: " << stmt << std::endl; this->stmt = Stmt(); 
impl->accessibleIterators.scope(); IndexStmtVisitorStrict::visit(stmt); @@ -200,6 +202,7 @@ static std::set hasSparseInserts(IndexStmt stmt, Iterators iterators, return ret; } + Stmt LowererImplImperative::lower(IndexStmt stmt, string name, bool assemble, bool compute, bool pack, bool unpack) @@ -586,19 +589,27 @@ LowererImplImperative::splitAppenderAndInserters(const vector& results } +// important function +/* +* This is the for loop lowering part +*/ Stmt LowererImplImperative::lowerForall(Forall forall) { + std::cout << "doing lowerForall: " << forall << std::endl; bool hasExactBound = provGraph.hasExactBound(forall.getIndexVar()); bool forallNeedsUnderivedGuards = !hasExactBound && emitUnderivedGuards; if (!ignoreVectorize && forallNeedsUnderivedGuards && (forall.getParallelUnit() == ParallelUnit::CPUVector || forall.getUnrollFactor() > 0)) { + std::cout << "calling lowerForallCloned(forall)\n"; return lowerForallCloned(forall); } + std::cout << "inParallelLoopDepth: " << inParallelLoopDepth << "========================\n"; if (forall.getParallelUnit() != ParallelUnit::NotParallel) { inParallelLoopDepth++; } + std::cout << "inParallelLoopDepth: " << inParallelLoopDepth << "========================\n"; // Recover any available parents that were not recoverable previously vector recoverySteps; @@ -786,19 +797,23 @@ Stmt LowererImplImperative::lowerForall(Forall forall) } if (!isWhereProducer && hasPosDescendant && underivedAncestors.size() > 1 && provGraph.isPosVariable(iterator.getIndexVar()) && posDescendant == forall.getIndexVar()) { + std::cout << "calling lowerForallFusedPosition(forall\n"; loops = lowerForallFusedPosition(forall, iterator, locators, inserters, appenders, reducedAccesses, recoveryStmt); } else if (canAccelWithSparseIteration) { + std::cout << "calling lowerForallDenseAcceleration(forall\n"; loops = lowerForallDenseAcceleration(forall, locators, inserters, appenders, reducedAccesses, recoveryStmt); } // Emit dimension coordinate iteration 
loop else if (iterator.isDimensionIterator()) { + std::cout << "calling lowerForallDimension(forall\n"; loops = lowerForallDimension(forall, point.locators(), inserters, appenders, reducedAccesses, recoveryStmt); } // Emit position iteration loop else if (iterator.hasPosIter()) { + std::cout << "calling lowerForallPosition(forall\n"; loops = lowerForallPosition(forall, iterator, locators, inserters, appenders, reducedAccesses, recoveryStmt); } @@ -816,6 +831,10 @@ Stmt LowererImplImperative::lowerForall(Forall forall) loops = lowerMergeLattice(lattice, underivedAncestors[0], forall.getStmt(), reducedAccesses); } + + std::cout << "printing loops ----------------------------------------------------------------------------------------------\n"; + std::cout << loops << std::endl; + std::cout << "loops printed -----------------------------------------------------------------------------------------------\n"; // taco_iassert(loops.defined()); if (!generateComputeCode() && !hasStores(loops)) { @@ -832,6 +851,7 @@ Stmt LowererImplImperative::lowerForall(Forall forall) parallelUnitIndexVars.erase(forall.getParallelUnit()); parallelUnitSizes.erase(forall.getParallelUnit()); } + return Block::blanks(preInitValues, temporaryValuesInitFree[0], loops, @@ -1136,6 +1156,7 @@ Stmt LowererImplImperative::lowerForallDimension(Forall forall, set reducedAccesses, ir::Stmt recoveryStmt) { + std::cout << "1 Stmt LowererImplImperative::lowerForallDimension\n"; Expr coordinate = getCoordinateVar(forall.getIndexVar()); if (forall.getParallelUnit() != ParallelUnit::NotParallel && forall.getOutputRaceStrategy() == OutputRaceStrategy::Atomics) { @@ -1143,6 +1164,8 @@ Stmt LowererImplImperative::lowerForallDimension(Forall forall, atomicParallelUnit = forall.getParallelUnit(); } + std::cout << "original forall : " << forall << std::endl; + std::cout << "inside IndexStmt: " << forall.getStmt() << std::endl; Stmt body = lowerForallBody(coordinate, forall.getStmt(), locators, inserters, appenders, 
reducedAccesses); @@ -1158,7 +1181,13 @@ Stmt LowererImplImperative::lowerForallDimension(Forall forall, std::vector bounds = provGraph.deriveIterBounds(forall.getIndexVar(), definedIndexVarsOrdered, underivedBounds, indexVarToExprMap, iterators); LoopKind kind = LoopKind::Serial; - if (forall.getParallelUnit() == ParallelUnit::CPUVector && !ignoreVectorize) { + if (should_use_ISPC_codegen()) { + std::cout << "Foreach compatible loop\n"; + if (forall.getParallelUnit() == ParallelUnit::CPUVector && !ignoreVectorize) { + kind = LoopKind::Foreach; + } + } + else if (forall.getParallelUnit() == ParallelUnit::CPUVector && !ignoreVectorize) { kind = LoopKind::Vectorized; } else if (forall.getParallelUnit() != ParallelUnit::NotParallel @@ -1166,6 +1195,7 @@ Stmt LowererImplImperative::lowerForallDimension(Forall forall, kind = LoopKind::Runtime; } + std::cout << "2 Stmt LowererImplImperative::lowerForallDimension\n"; return Block::blanks(For::make(coordinate, bounds[0], bounds[1], 1, body, kind, ignoreVectorize ? ParallelUnit::NotParallel : forall.getParallelUnit(), ignoreVectorize ? 
0 : forall.getUnrollFactor()), @@ -1179,6 +1209,7 @@ Stmt LowererImplImperative::lowerForallDimension(Forall forall, set reducedAccesses, ir::Stmt recoveryStmt) { + std::cout << "1 Stmt LowererImplImperative::lowerForallDenseAcceleration\n"; taco_iassert(locators.size() == 1) << "Optimizing a dense workspace is only supported when the consumer is the only RHS tensor"; taco_iassert(provGraph.isFullyDerived(forall.getIndexVar())) << "Sparsely accelerating a dense workspace only works with fully derived index vars"; taco_iassert(forall.getParallelUnit() == ParallelUnit::NotParallel) << "Sparsely accelerating a dense workspace only works within serial loops"; @@ -1204,6 +1235,8 @@ Stmt LowererImplImperative::lowerForallDimension(Forall forall, } Stmt declareVar = VarDecl::make(coordinate, Load::make(indexList, loopVar)); + std::cout << "original forall : " << forall << std::endl; + std::cout << "inside IndexStmt: " << forall.getStmt() << std::endl; Stmt body = lowerForallBody(coordinate, forall.getStmt(), locators, inserters, appenders, reducedAccesses); Stmt resetGuard = ir::Store::make(bitGuard, coordinate, ir::Literal::make(false), markAssignsAtomicDepth > 0, atomicParallelUnit); @@ -1216,7 +1249,12 @@ Stmt LowererImplImperative::lowerForallDimension(Forall forall, Stmt posAppend = generateAppendPositions(appenders); LoopKind kind = LoopKind::Serial; - if (forall.getParallelUnit() == ParallelUnit::CPUVector && !ignoreVectorize) { + if (should_use_ISPC_codegen()) { + if (forall.getParallelUnit() == ParallelUnit::CPUVector && !ignoreVectorize) { + kind = LoopKind::Foreach; + } + } + else if (forall.getParallelUnit() == ParallelUnit::CPUVector && !ignoreVectorize) { kind = LoopKind::Vectorized; } else if (forall.getParallelUnit() != ParallelUnit::NotParallel @@ -1224,6 +1262,7 @@ Stmt LowererImplImperative::lowerForallDimension(Forall forall, kind = LoopKind::Runtime; } + std::cout << "2 Stmt LowererImplImperative::lowerForallDenseAcceleration\n"; return 
Block::blanks(For::make(loopVar, 0, indexListSize, 1, body, kind, ignoreVectorize ? ParallelUnit::NotParallel : forall.getParallelUnit(), ignoreVectorize ? 0 : forall.getUnrollFactor()), @@ -1247,6 +1286,8 @@ Stmt LowererImplImperative::lowerForallPosition(Forall forall, Iterator iterator set reducedAccesses, ir::Stmt recoveryStmt) { + std::cout << "1 Stmt LowererImplImperative::lowerForallPosition\n" << std::endl; + Expr coordinate = getCoordinateVar(forall.getIndexVar()); Stmt declareCoordinate = Stmt(); Stmt strideGuard = Stmt(); @@ -1278,6 +1319,11 @@ Stmt LowererImplImperative::lowerForallPosition(Forall forall, Iterator iterator markAssignsAtomicDepth++; } + // see we are inside a forall. ex: forall(i, forall(j, y(i) += A(i,j) * x(j))) + // when you call forall.getStmt it returns forall(j, y(i) += A(i,j) * x(j)) which is the + // IndexStmt inside the forall IndexStmt + std::cout << "original forall : " << forall << std::endl; + std::cout << "inside IndexStmt: " << forall.getStmt() << std::endl; Stmt body = lowerForallBody(coordinate, forall.getStmt(), locators, inserters, appenders, reducedAccesses); @@ -1339,6 +1385,7 @@ Stmt LowererImplImperative::lowerForallPosition(Forall forall, Iterator iterator kind = LoopKind::Runtime; } + std::cout << "2 Stmt LowererImplImperative::lowerForallPosition\n" << std::endl; // Loop with preamble and postamble return Block::blanks( boundsCompute, @@ -1357,6 +1404,7 @@ Stmt LowererImplImperative::lowerForallFusedPosition(Forall forall, Iterator ite set reducedAccesses, ir::Stmt recoveryStmt) { + std::cout << "1 Stmt LowererImplImperative::lowerForallFusedPosition\n" << std::endl; Expr coordinate = getCoordinateVar(forall.getIndexVar()); Stmt declareCoordinate = Stmt(); if (provGraph.isCoordVariable(forall.getIndexVar())) { @@ -1447,6 +1495,8 @@ Stmt LowererImplImperative::lowerForallFusedPosition(Forall forall, Iterator ite markAssignsAtomicDepth++; } + std::cout << "original forall : " << forall << std::endl; + std::cout << 
"inside IndexStmt: " << forall.getStmt() << std::endl; Stmt body = lowerForallBody(coordinate, forall.getStmt(), locators, inserters, appenders, reducedAccesses); @@ -1503,6 +1553,8 @@ Stmt LowererImplImperative::lowerForallFusedPosition(Forall forall, Iterator ite && forall.getOutputRaceStrategy() != OutputRaceStrategy::ParallelReduction && !ignoreVectorize) { kind = LoopKind::Runtime; } + + std::cout << "2 Stmt LowererImplImperative::lowerForallFusedPosition\n" << std::endl; // Loop with preamble and postamble return Block::blanks(boundsCompute, Block::make(Block::make(searchForUnderivedStart), @@ -1765,6 +1817,9 @@ Stmt LowererImplImperative::lowerForallBody(Expr coordinate, IndexStmt stmt, vector inserters, vector appenders, const set& reducedAccesses) { + + std::cout << "lowering a forall body----------------------------------------------------\n"; + Stmt initVals = resizeAndInitValues(appenders, reducedAccesses); // Inserter positions @@ -1780,6 +1835,7 @@ Stmt LowererImplImperative::lowerForallBody(Expr coordinate, IndexStmt stmt, // Code of loop body statement Stmt body = lower(stmt); + std::cout << "\nBefore: [" << stmt << "]\nAfter : [" << body << "]\n"; // Code to append coordinates Stmt appendCoords = appendCoordinate(appenders, coordinate); @@ -1889,6 +1945,7 @@ vector LowererImplImperative::codeToInitializeDenseAcceleratorArrays(Where Expr p = Var::make("p" + temporary.getName(), Int()); Stmt guardZeroInit = Store::make(alreadySetArr, p, ir::Literal::zero(bitGuardType)); + std::cout << "vector LowererImplImperative::codeToInitializeDenseAcceleratorArrays\n" << std::endl; Stmt zeroInitLoop = For::make(p, 0, bitGuardSize, 1, guardZeroInit, LoopKind::Serial); Stmt inits = Block::make(alreadySetDecl, indexListDecl, allocateAlreadySet, allocateIndexList, zeroInitLoop); return {inits, freeTemps}; @@ -2203,6 +2260,7 @@ Stmt LowererImplImperative::lowerWhere(Where where) { true, false); Expr size = getTemporarySize(where); Stmt zeroInit = Store::make(values, 
p, ir::Literal::zero(temporary.getType().getDataType())); + std::cout << "Stmt LowererImplImperative::lowerWhere\n"; Stmt loopInit = For::make(p, 0, size, 1, zeroInit, LoopKind::Serial); initializeTemporary = Block::make(initializeTemporary, loopInit); } @@ -2334,6 +2392,7 @@ Stmt LowererImplImperative::lowerAssemble(Assemble assemble) { resultModeOrdering[iter.getMode().getLevel() - 1]); Expr pos = iter.getPosVar(); Stmt initPos = VarDecl::make(pos, iter.locate(locateCoords)[0]); + std::cout << "Stmt LowererImplImperative::lowerAssemble\n"; insertEdgeLoop = For::make(coords.back(), 0, dim, 1, Block::make(initPos, insertEdgeLoop)); } else { @@ -2415,6 +2474,7 @@ Stmt LowererImplImperative::lowerMulti(Multi multi) { } Stmt LowererImplImperative::lowerSuchThat(SuchThat suchThat) { + std::cout << "lowering such that statement\n"; Stmt stmt = lower(suchThat.getStmt()); return Block::make(stmt); } @@ -2942,6 +3002,7 @@ Stmt LowererImplImperative::resizeAndInitValues(const std::vector& app Stmt LowererImplImperative::zeroInitValues(Expr tensor, Expr begin, Expr size) { + std::cout << "1 Stmt LowererImplImperative::zeroInitValues\n"; Expr lower = simplify(ir::Mul::make(begin, size)); Expr upper = simplify(ir::Mul::make(ir::Add::make(begin, 1), size)); Expr p = Var::make("p" + util::toString(tensor), Int()); @@ -2954,6 +3015,10 @@ Stmt LowererImplImperative::zeroInitValues(Expr tensor, Expr begin, Expr size) { return ir::VarDecl::make(ir::Var::make("status", Int()), ir::Call::make("cudaMemset", {values, ir::Literal::make(0, Int()), ir::Mul::make(ir::Sub::make(upper, lower), ir::Literal::make(values.type().getNumBytes()))}, Int())); } + std::cout << "2 Stmt LowererImplImperative::zeroInitValues\n"; + if (should_use_ISPC_codegen()) { + return For::make(p, lower, upper, 1, zeroInit, LoopKind::Foreach); + } return For::make(p, lower, upper, 1, zeroInit, parallel); } diff --git a/src/tensor.cpp b/src/tensor.cpp index fab437ff1..3519456c9 100644 --- a/src/tensor.cpp +++ 
b/src/tensor.cpp @@ -278,6 +278,7 @@ static size_t unpackTensorData(const taco_tensor_t& tensorData, /// Pack coordinates into a data structure given by the tensor format. void TensorBase::pack() { + std::cout << "TensorBase::Pack() method\n"; if (!needsPack()) { return; } @@ -346,6 +347,7 @@ void TensorBase::pack() { taco_iassert((content->coordinateBufferUsed % content->coordinateSize) == 0); const size_t numCoordinates = content->coordinateBufferUsed / content->coordinateSize; + std::cout << "call helperFuncs\n"; const auto helperFuncs = getHelperFunctions(getFormat(), getComponentType(), dimensions); @@ -623,6 +625,7 @@ void TensorBase::compile() { compile(stmt, content->assembleWhileCompute); } void TensorBase::compile(taco::IndexStmt stmt, bool assembleWhileCompute) { + std::cout << "TensorBase::compile\n"; if (!needsCompile()) { return; } @@ -934,6 +937,7 @@ TensorBase::getHelperFunctions(const Format& format, Datatype ctype, }; const auto dims = util::map(dimensions, getDim); + set_ISPC_code_stream_enabled(false); if (format.getOrder() > 0) { const Format bufferFormat = COO(format.getOrder(), false, true, false, format.getModeOrdering()); @@ -951,6 +955,7 @@ TensorBase::getHelperFunctions(const Format& format, Datatype ctype, } // Lower packing and iterator code. + std::cout << "1 Lower packing and iterator code\n"; helperModule->addFunction(lower(packStmt, "pack", true, true)); helperModule->addFunction(lower(iterateStmt, "iterate", false, true)); } else { @@ -964,12 +969,14 @@ TensorBase::getHelperFunctions(const Format& format, Datatype ctype, IndexVar indexVar; IndexStmt assignment = (packedScalar() = bufferVector(indexVar)); IndexStmt packStmt= makeConcreteNotation(makeReductionNotation(assignment)); + std::cout << "2 Lower packing and iterator code\n"; helperModule->addFunction(lower(packStmt, "pack", true, true)); // Define and lower iterator code. 
IndexStmt iterateStmt = Yield({}, packedScalar()); helperModule->addFunction(lower(iterateStmt, "iterate", false, true)); } + std::cout << "Compiling the helperModule\n"; helperModule->compile(); helperFunctionsMutex.lock(); diff --git a/test/tests-scheduling-eval.cpp b/test/tests-scheduling-eval.cpp index f59359081..6a228f38b 100644 --- a/test/tests-scheduling-eval.cpp +++ b/test/tests-scheduling-eval.cpp @@ -4,6 +4,7 @@ #include #include #include +#include "taco/cuda.h" #include "test.h" #include "test_tensors.h" #include "taco/tensor.h" @@ -48,10 +49,10 @@ IndexStmt scheduleSpMVCPU(IndexStmt stmt, int CHUNK_SIZE=16) { IndexStmt scheduleSpMVISPC(IndexStmt stmt, int CHUNK_SIZE=16) { IndexVar i0("i0"), i1("i1"), kpos("kpos"), kpos0("kpos0"), kpos1("kpos1"); - return stmt; - // return stmt.split(i, i0, i1, CHUNK_SIZE) - // .reorder({i0, i1, j}) - // .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces); + // return stmt; + return stmt.split(i, i0, i1, CHUNK_SIZE) + .reorder({i0, i1, j}) + .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces); } IndexStmt scheduleSpMMCPU(IndexStmt stmt, Tensor A, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { @@ -64,6 +65,16 @@ IndexStmt scheduleSpMMCPU(IndexStmt stmt, Tensor A, int CHUNK_SIZE=16, i .parallelize(k, ParallelUnit::CPUVector, OutputRaceStrategy::IgnoreRaces); } +IndexStmt scheduleSpMMISPC(IndexStmt stmt, Tensor A, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { + IndexVar i0("i0"), i1("i1"), kbounded("kbounded"), k0("k0"), k1("k1"), jpos("jpos"), jpos0("jpos0"), jpos1("jpos1"); + return stmt.split(i, i0, i1, CHUNK_SIZE) + .pos(j, jpos, A(i,j)) + .split(jpos, jpos0, jpos1, UNROLL_FACTOR) + .reorder({i0, i1, jpos0, k, jpos1}) + .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces) + .parallelize(k, ParallelUnit::CPUVector, OutputRaceStrategy::IgnoreRaces); +} + IndexStmt scheduleSpGEMMCPU(IndexStmt stmt, bool doPrecompute) { Assignment assign = stmt.as().getStmt().as().getStmt() 
.as().getStmt().as(); @@ -1473,8 +1484,6 @@ TEST(scheduling_eval, mttkrpGPU) { ASSERT_TENSOR_EQ(expected, A); } - - TEST(generate_ispc_evaluation_files, ispc) { std::cout << "Hi Adhitha!\n" << std::endl ; set_CUDA_codegen_enabled(false); @@ -1495,15 +1504,18 @@ TEST(generate_ispc_evaluation_files, ispc) { int NUM_I = 100; int NUM_J = 100; + int NUM_K = 100; + string c_file_ending = ".h"; string file_ending = ".ispc"; string file_path = "eval_prepared_ispc/"; mkdir(file_path.c_str(), 0777); // spmv { - stringstream source; - std::shared_ptr codegen = ir::CodeGen::init_default(source, ir::CodeGen::ImplementationGen); + stringstream source1; + stringstream source2; + std::shared_ptr codegen = ir::CodeGen::init_default(source1, source2, ir::CodeGen::ImplementationGen); Tensor A("A", {NUM_I, NUM_J}, CSR); Tensor x("x", {NUM_J}, {Dense}); Tensor y("y", {NUM_I}, {Dense}); @@ -1511,18 +1523,53 @@ TEST(generate_ispc_evaluation_files, ispc) { std::cout << "concretizing the assignment statement\n"; IndexStmt stmt = y.getAssignment().concretize(); std::cout << "Printing the original IndexStmt: " << stmt << std::endl; + for (auto paramSet : spmv_parameters) { std::cout << "param set: " << paramSet[0] << std::endl; IndexStmt scheduled = scheduleSpMVISPC(stmt, paramSet[0]); std::cout << "scheduled IndexStmt: " << scheduled << std::endl; - ir::Stmt compute = lower(scheduled, "spmv_csr_ispc_taco", false, true); + ir::Stmt compute = lower(scheduled, string("compute_") + util::join(paramSet, "_"), false, true); std::cout << "computed statement: \n" << compute << std::endl; codegen->compile(compute, false); } ofstream source_file; - source_file.open(file_path + "spmv_csr_ispc_taco.h"); - source_file << source.str(); + source_file.open(file_path + "spmv_csr_ispc_taco" + c_file_ending); + source_file << source1.str(); + source_file.close(); + + ofstream ispc_source_file; + ispc_source_file.open(file_path + "__spmv_csr_ispc_taco" + file_ending); + ispc_source_file << source2.str(); + 
ispc_source_file.close(); + + } + + // spmm + { + stringstream source1; + stringstream source2; + std::shared_ptr codegen = ir::CodeGen::init_default(source1, source2, ir::CodeGen::ImplementationGen); + Tensor A("A", {NUM_I, NUM_J}, CSR); + Tensor B("B", {NUM_J, NUM_K}, {Dense, Dense}); + Tensor C("C", {NUM_I, NUM_K}, {Dense, Dense}); + C(i, k) = A(i, j) * B(j, k); + IndexStmt stmt = C.getAssignment().concretize(); + bool isFirst = true; + for (auto paramSet : spmm_parameters) { + IndexStmt scheduled = scheduleSpMMISPC(stmt, A, paramSet[0], paramSet[1]); + ir::Stmt compute = lower(scheduled, string("compute_") + util::join(paramSet, "_"), false, true); + codegen->compile(compute, isFirst); + isFirst = false; + } + ofstream source_file; + source_file.open(file_path + "spmm_csr_ispc_taco" + c_file_ending); + source_file << source1.str(); source_file.close(); + + ofstream ispc_source_file; + ispc_source_file.open(file_path + "__spmm_csr_ispc_taco" + file_ending); + ispc_source_file << source2.str(); + ispc_source_file.close(); } @@ -1846,9 +1893,13 @@ TEST(generate_evaluation_files, cpu) { } TEST(generate_evaluation_files, gpu) { - if (!should_use_CUDA_codegen()) { - return; - } + // if (!should_use_CUDA_codegen()) { + // return; + // } + set_CUDA_codegen_enabled(true); + set_ISPC_codegen_enabled(false); + + std::cout << "executing generate_evaluation_file.gpu\n"; vector> spmv_parameters = {}; // {NNZ_PER_THREAD, BLOCK_SIZE} for (int i = 3; i <= 20; i++) { diff --git a/tools/taco.cpp b/tools/taco.cpp index ce03b61e1..9a864a699 100644 --- a/tools/taco.cpp +++ b/tools/taco.cpp @@ -1297,6 +1297,7 @@ int main(int argc, char* argv[]) { } bool hasPrinted = false; + std::shared_ptr codegen = ir::CodeGen::init_default(cout, ir::CodeGen::ImplementationGen); codegen->setColor(color); if (printAssemble) { @@ -1317,6 +1318,7 @@ int main(int argc, char* argv[]) { } if (compute.defined()) { + std::cout << "Code generation\n"; codegen->compile(compute, false); } else { From 
4e7bd6879c5f7ca1f43397dff5cc92259a7e1eda Mon Sep 17 00:00:00 2001 From: Adhhitha Dias Date: Mon, 19 Jul 2021 15:13:47 -0400 Subject: [PATCH 04/16] add CPUSpmd directive partially --- include/taco/index_notation/transformations.h | 2 + include/taco/ir/ir.h | 2 +- include/taco/ir_tags.h | 2 +- include/taco/lower/lowerer_impl_imperative.h | 3 + src/codegen/codegen.cpp | 66 ----- src/codegen/codegen.h | 13 +- src/codegen/codegen_cuda.cpp | 1 + src/codegen/codegen_ispc.cpp | 257 +++++++++++++++++- src/codegen/codegen_ispc.h | 7 + src/index_notation/index_notation_printer.cpp | 4 +- src/index_notation/transformations.cpp | 62 ++++- src/ir/ir_printer.cpp | 43 --- src/ir_tags.cpp | 2 +- src/lower/lowerer_impl_imperative.cpp | 76 +++++- src/tensor.cpp | 1 + test/tests-scheduling-eval.cpp | 207 +++++++++++++- tools/taco.cpp | 47 +++- 17 files changed, 647 insertions(+), 148 deletions(-) diff --git a/include/taco/index_notation/transformations.h b/include/taco/index_notation/transformations.h index 7aa2579ad..6bf277d5c 100644 --- a/include/taco/index_notation/transformations.h +++ b/include/taco/index_notation/transformations.h @@ -223,6 +223,8 @@ IndexStmt parallelizeOuterLoop(IndexStmt stmt); */ IndexStmt reorderLoopsTopologically(IndexStmt stmt); +IndexStmt justTraverseThroughTheIndexStmt(IndexStmt stmt); + /** * Performs scalar promotion so that reductions are done by accumulating into * scalar temporaries whenever possible. diff --git a/include/taco/ir/ir.h b/include/taco/ir/ir.h index cb46b5142..651faff4e 100644 --- a/include/taco/ir/ir.h +++ b/include/taco/ir/ir.h @@ -591,7 +591,7 @@ struct Switch : public StmtNode { static const IRNodeType _type_info = IRNodeType::Switch; }; -enum class LoopKind {Serial, Static, Dynamic, Runtime, Vectorized, Static_Chunked, Foreach}; +enum class LoopKind {Serial, Static, Dynamic, Runtime, Vectorized, Static_Chunked, Foreach, Mul_Thread}; /** A for loop from start to end by increment. 
* A vectorized loop will require the increment to be 1 and the diff --git a/include/taco/ir_tags.h b/include/taco/ir_tags.h index 5858a13e3..6a74be173 100644 --- a/include/taco/ir_tags.h +++ b/include/taco/ir_tags.h @@ -9,7 +9,7 @@ namespace taco { /// ParallelUnit::GPUWarp can be optionally used to allow for GPU warp-level primitives /// ParallelUnit::GPUThread causes for every iteration to be executed on a separate GPU thread enum class ParallelUnit { - NotParallel, DefaultUnit, GPUBlock, GPUWarp, GPUThread, CPUThread, CPUVector, CPUThreadGroupReduction, GPUBlockReduction, GPUWarpReduction + NotParallel, DefaultUnit, GPUBlock, GPUWarp, GPUThread, CPUThread, CPUVector, CPUThreadGroupReduction, GPUBlockReduction, GPUWarpReduction, CPUSimd, CPUSpmd }; extern const char *ParallelUnit_NAMES[]; diff --git a/include/taco/lower/lowerer_impl_imperative.h b/include/taco/lower/lowerer_impl_imperative.h index 65f069fda..d743f5875 100644 --- a/include/taco/lower/lowerer_impl_imperative.h +++ b/include/taco/lower/lowerer_impl_imperative.h @@ -499,10 +499,13 @@ class LowererImplImperative : public LowererImpl { bool emitUnderivedGuards = true; + int loopDepth = 0; int inParallelLoopDepth = 0; std::map parallelUnitSizes; std::map parallelUnitIndexVars; + std::map forUnits; // + std::map whereTempsWithLoopDepth; /// Keep track of what IndexVars have already been defined std::set definedIndexVars; diff --git a/src/codegen/codegen.cpp b/src/codegen/codegen.cpp index 750f33516..7081bc195 100644 --- a/src/codegen/codegen.cpp +++ b/src/codegen/codegen.cpp @@ -441,72 +441,6 @@ string CodeGen::printDecls(map varMap, return ret.str(); } -string CodeGen::printCallISPCFunc(const Function *func, map varMap, - vector &sortedProps) { - std::stringstream ret; - ret << " "; - unordered_set propsAlreadyGenerated; - - ret << "__" << func->name << "("; - - vector inputs = func->inputs; - vector outputs = func->outputs; - getSortedProps(varMap, sortedProps, inputs, outputs); - - for (unsigned long 
i=0; i < sortedProps.size(); i++) { - ret << varMap[sortedProps[i]]; - if (i != sortedProps.size()-1) { - ret << ", "; - } - propsAlreadyGenerated.insert(varMap[sortedProps[i]]); - } - - ret << ");\n"; - return ret.str(); -} - -string CodeGen::printISPCFunc(const Function *func, map varMap, - vector &sortedProps) { - std::stringstream ret; - ret << "export void "; - unordered_set propsAlreadyGenerated; - - ret << "__" << func->name << "("; - - vector inputs = func->inputs; - vector outputs = func->outputs; - // getSortedProps(varMap, sortedProps, inputs, outputs); - - for (unsigned long i=0; i < sortedProps.size(); i++) { - auto prop = sortedProps[i]; - bool isOutputProp = (find(outputs.begin(), outputs.end(), - prop->tensor) != outputs.end()); - - auto var = prop->tensor.as(); - if (var->is_parameter) { - if (isOutputProp) { - ret << " " << printTensorProperty(varMap[prop], prop, false) << ";" << endl; - } else { - break; - } - } else { - ret << getUnpackedTensorArgument(varMap[prop], prop, isOutputProp); - } - propsAlreadyGenerated.insert(varMap[prop]); - - if (i!=sortedProps.size()-1) { - ret << ", "; - } - if (i%2==0) { - ret << "\n\t"; - } - } - ret << ") {\n"; - - return ret.str(); -} - - string CodeGen::printPack(map, string> outputProperties, vector outputs) { stringstream ret; diff --git a/src/codegen/codegen.h b/src/codegen/codegen.h index 641239834..db891f995 100644 --- a/src/codegen/codegen.h +++ b/src/codegen/codegen.h @@ -49,10 +49,6 @@ class CodeGen : public IRPrinter { std::string printContextDeclAndInit(std::map varMap, std::vector localVars, int labels, std::string funcName); - std::string printCallISPCFunc(const Function *func, std::map varMap, - std::vector &sortedProps); - std::string printISPCFunc(const Function *func, std::map varMap, - std::vector &sortedProps); std::string printDecls(std::map varMap, std::vector inputs, std::vector outputs); std::string printPack(std::map, @@ -63,6 +59,10 @@ class CodeGen : public IRPrinter { std::string 
printFuncName(const Function *func, std::map inputMap={}, std::map outputMap={}); + + std::string printTensorProperty(std::string varname, const GetProperty* op, bool is_ptr); + std::string getUnpackedTensorArgument(std::string varname, const GetProperty* op, + bool is_output_prop); void resetUniqueNameCounters(); std::string genUniqueName(std::string name); @@ -72,11 +72,8 @@ class CodeGen : public IRPrinter { private: virtual std::string restrictKeyword() const { return ""; } - std::string printTensorProperty(std::string varname, const GetProperty* op, bool is_ptr); std::string unpackTensorProperty(std::string varname, const GetProperty* op, - bool is_output_prop); - std::string getUnpackedTensorArgument(std::string varname, const GetProperty* op, - bool is_output_prop); + bool is_output_prop); std::string packTensorProperty(std::string varname, Expr tnsr, TensorProperty property, int mode, int index); std::string pointTensorProperty(std::string varname); diff --git a/src/codegen/codegen_cuda.cpp b/src/codegen/codegen_cuda.cpp index 77cf0cd88..14505f740 100644 --- a/src/codegen/codegen_cuda.cpp +++ b/src/codegen/codegen_cuda.cpp @@ -646,6 +646,7 @@ void CodeGen_CUDA::printDeviceFunctions(const Function* func) { // Collect device functions resetUniqueNameCounters(); deviceFunctionLoopDepth = 0; + // here they calculate the device FunctionCollecor DeviceFunctionCollector deviceFunctionCollector(func->inputs, func->outputs, this); func->body.accept(&deviceFunctionCollector); deviceFunctions = deviceFunctionCollector.blockFors; diff --git a/src/codegen/codegen_ispc.cpp b/src/codegen/codegen_ispc.cpp index f107728cc..c8480cd25 100644 --- a/src/codegen/codegen_ispc.cpp +++ b/src/codegen/codegen_ispc.cpp @@ -7,6 +7,9 @@ #include "taco/cuda.h" #include "taco/ir/ir_visitor.h" +#include "taco/ir/ir_rewriter.h" +#include "taco/ir/simplify.h" + #include "codegen_ispc.h" #include "taco/error.h" #include "taco/util/strings.h" @@ -240,6 +243,121 @@ class CodeGen_ISPC::FindVars 
: public IRVisitor { } }; + +// Finds all for loops tagged with accelerator and adds statements to deviceFunctions +// Also tracks scope of when device function is called and +// tracks which variables must be passed to function. +class CodeGen_ISPC::DeviceFunctionCollector : public IRVisitor { +public: + vector blockFors; + vector threadFors; // contents is device function + vector warpFors; + map scopeMap; + + // the variables to pass to each device function + vector>> functionParameters; + vector> currentParameters; // keep as vector so code generation is deterministic + set currentParameterSet; + + set variablesDeclaredInKernel; + + vector> threadIDVars; + vector> blockIDVars; + vector> warpIDVars; + vector numThreads; + vector numWarps; + + CodeGen_ISPC *codeGen; + // copy inputs and outputs into the map + DeviceFunctionCollector(vector inputs, vector outputs, CodeGen_ISPC *codeGen) : codeGen(codeGen) { + inDeviceFunction = false; + for (auto v: inputs) { + auto var = v.as(); + taco_iassert(var) << "Inputs must be vars in codegen"; + taco_iassert(scopeMap.count(var) == 0) << + "Duplicate input found in codegen"; + scopeMap[var] = var->name; + } + for (auto v: outputs) { + auto var = v.as(); + taco_iassert(var) << "Outputs must be vars in codegen"; + taco_iassert(scopeMap.count(var) == 0) << + "Duplicate output found in codegen"; + + scopeMap[var] = var->name; + } + } + +protected: + bool inDeviceFunction; + using IRVisitor::visit; + + virtual void visit(const For *op) { + if (op->parallel_unit == ParallelUnit::CPUSpmd) { + std::cout << "ParallelUnit::CPUSpmd directive found\n"; + inDeviceFunction = false; + op->var.accept(this); + inDeviceFunction = true; + + threadFors.push_back(op); + std::cout << "scopeMap: [" << scopeMap[op->var] << "], varExpr: [" << op->var << "]\n"; + threadIDVars.push_back(pair(scopeMap[op->var], op->var)); + Expr blockSize = ir::simplify(ir::Div::make(ir::Sub::make(op->end, op->start), op->increment)); + 
numThreads.push_back(blockSize); + + } + else if (op->parallel_unit == ParallelUnit::CPUSimd) { + + } + else{ + op->var.accept(this); + } + op->start.accept(this); + op->end.accept(this); + op->increment.accept(this); + op->contents.accept(this); + } + + virtual void visit(const Var *op) { + if (scopeMap.count(op) == 0) { + string name = codeGen->genUniqueName(op->name); + if (!inDeviceFunction) { + scopeMap[op] = name; + } + } + else if (scopeMap.count(op) == 1 && inDeviceFunction && currentParameterSet.count(op) == 0 + && (threadIDVars.empty() || op != threadIDVars.back().second) + && !variablesDeclaredInKernel.count(op)) { + currentParameters.push_back(pair(scopeMap[op], op)); + currentParameterSet.insert(op); + } + } + + virtual void visit(const VarDecl *op) { + if (inDeviceFunction) { + variablesDeclaredInKernel.insert(op->var); + } + op->var.accept(this); + op->rhs.accept(this); + } + + virtual void visit(const GetProperty *op) { + if (scopeMap.count(op->tensor) == 0 && !inDeviceFunction) { + auto key = + tuple(op->tensor,op->property, + (size_t)op->mode, + (size_t)op->index); + auto unique_name = codeGen->genUniqueName(op->name); + scopeMap[op->tensor] = unique_name; + } + else if (scopeMap.count(op->tensor) == 1 && inDeviceFunction && currentParameterSet.count(op->tensor) == 0) { + currentParameters.push_back(pair(op->tensor.as()->name, op->tensor)); + currentParameterSet.insert(op->tensor); + } + } +}; + + CodeGen_ISPC::CodeGen_ISPC(std::ostream &dest, OutputKind outputKind, bool simplify) : CodeGen(dest, false, simplify, C), out(dest), out2(dest), outputKind(outputKind) {} @@ -262,6 +380,76 @@ void CodeGen_ISPC::compile(Stmt stmt, bool isFirst) { stmt.accept(this); } +string CodeGen_ISPC::printCallISPCFunc(const Function *func, map varMap, + vector &sortedProps) { + std::stringstream ret; + ret << " "; + unordered_set propsAlreadyGenerated; + + ret << "__" << func->name << "("; + + vector inputs = func->inputs; + vector outputs = func->outputs; + 
getSortedProps(varMap, sortedProps, inputs, outputs); + + for (unsigned long i=0; i < sortedProps.size(); i++) { + ret << varMap[sortedProps[i]]; + if (i != sortedProps.size()-1) { + ret << ", "; + } + propsAlreadyGenerated.insert(varMap[sortedProps[i]]); + } + + ret << ");\n"; + return ret.str(); +} + +string CodeGen_ISPC::printISPCFunc(const Function *func, map varMap, + vector &sortedProps) { + + DeviceFunctionCollector deviceFunctionCollector(func->inputs, func->outputs, this); + func->body.accept(&deviceFunctionCollector); + + + std::stringstream ret; + ret << "export void "; + unordered_set propsAlreadyGenerated; + + ret << "__" << func->name << "("; + + vector inputs = func->inputs; + vector outputs = func->outputs; + // getSortedProps(varMap, sortedProps, inputs, outputs); + + for (unsigned long i=0; i < sortedProps.size(); i++) { + auto prop = sortedProps[i]; + bool isOutputProp = (find(outputs.begin(), outputs.end(), + prop->tensor) != outputs.end()); + + auto var = prop->tensor.as(); + if (var->is_parameter) { + if (isOutputProp) { + ret << " " << printTensorProperty(varMap[prop], prop, false) << ";" << endl; + } else { + break; + } + } else { + ret << getUnpackedTensorArgument(varMap[prop], prop, isOutputProp); + } + propsAlreadyGenerated.insert(varMap[prop]); + + if (i!=sortedProps.size()-1) { + ret << ", "; + } + if (i%2==0) { + ret << "\n\t"; + } + } + ret << "\n) {\n\n"; + + return ret.str(); +} + void CodeGen_ISPC::sendToStream(std::stringstream &stream) { if (is_ISPC_code_stream_enabled()) { this->out2 << stream.str(); @@ -466,6 +654,21 @@ void CodeGen_ISPC::visit(const For* op) { case LoopKind::Dynamic: case LoopKind::Runtime: case LoopKind::Static_Chunked: + case LoopKind::Mul_Thread: + op->start.accept(this); + stream2 << std::endl; + op->start.accept(this); + stream2 << std::endl; + op->start.accept(this); + stream2 << std::endl; + op->start.accept(this); + stream2 << std::endl; + op->end.accept(this); + stream2 << std::endl; + 
op->end.accept(this); + stream2 << std::endl; + op->end.accept(this); + stream2 << std::endl; default: break; } @@ -629,10 +832,58 @@ void CodeGen_ISPC::visit(const Sqrt* op) { void CodeGen_ISPC::visit(const Assign* op) { if (is_ISPC_code_stream_enabled()) { - if (op->use_atomics) { - doIndent(); - stream2 << getAtomicPragma() << endl; + doIndent(); + op->lhs.accept(this); + parentPrecedence = Precedence::TOP; + bool printed = false; + if (simplify) { + if (isa(op->rhs)) { + auto add = to(op->rhs); + if (add->a == op->lhs) { + const Literal* lit = add->b.as(); + if (lit != nullptr && ((lit->type.isInt() && lit->equalsScalar(1)) || + (lit->type.isUInt() && lit->equalsScalar(1)))) { + stream2 << "++"; + } + else { + if (op->use_atomics) { + stream2 << " += reduce_add("; + add->b.accept(this); + stream2 << ")"; + } + else { + stream2 << " += "; + add->b.accept(this); + } + } + printed = true; + } + } + else if (isa(op->rhs)) { + auto mul = to(op->rhs); + if (mul->a == op->lhs) { + stream2 << " *= "; + mul->b.accept(this); + printed = true; + } + } + else if (isa(op->rhs)) { + auto bitOr = to(op->rhs); + if (bitOr->a == op->lhs) { + stream2 << " |= "; + bitOr->b.accept(this); + printed = true; + } + } + } + if (!printed) { + stream2 << " = "; + op->rhs.accept(this); } + + stream2 << ";"; + stream2 << endl; + } else { if (op->use_atomics) { diff --git a/src/codegen/codegen_ispc.h b/src/codegen/codegen_ispc.h index 8abd1cc09..279d0db7a 100644 --- a/src/codegen/codegen_ispc.h +++ b/src/codegen/codegen_ispc.h @@ -43,6 +43,12 @@ class CodeGen_ISPC : public CodeGen { void visit(const Store*); void visit(const Assign*); + Stmt simplifyFunctionBodies(Stmt stmt); + std::string printCallISPCFunc(const Function *func, std::map varMap, + std::vector &sortedProps); + std::string printISPCFunc(const Function *func, std::map varMap, + std::vector &sortedProps); + std::map varMap; std::vector localVars; std::ostream &out; @@ -55,6 +61,7 @@ class CodeGen_ISPC : public CodeGen { bool 
emittingCoroutine; class FindVars; + class DeviceFunctionCollector; private: virtual std::string restrictKeyword() const { return "restrict"; } diff --git a/src/index_notation/index_notation_printer.cpp b/src/index_notation/index_notation_printer.cpp index 0b41615ad..d7ee998ae 100644 --- a/src/index_notation/index_notation_printer.cpp +++ b/src/index_notation/index_notation_printer.cpp @@ -224,9 +224,9 @@ void IndexNotationPrinter::visit(const YieldNode* op) { void IndexNotationPrinter::visit(const ForallNode* op) { os << "forall(" << op->indexVar << ", "; op->stmt.accept(this); - if (op->parallel_unit != ParallelUnit::NotParallel) { + // if (op->parallel_unit != ParallelUnit::NotParallel) { os << ", " << ParallelUnit_NAMES[(int) op->parallel_unit] << ", " << OutputRaceStrategy_NAMES[(int) op->output_race_strategy]; - } + // } os << ")"; } diff --git a/src/index_notation/transformations.cpp b/src/index_notation/transformations.cpp index 47fc1dd55..011779caf 100644 --- a/src/index_notation/transformations.cpp +++ b/src/index_notation/transformations.cpp @@ -1,8 +1,10 @@ #include "taco/index_notation/transformations.h" +#include "taco/cuda.h" #include "taco/index_notation/index_notation.h" #include "taco/index_notation/index_notation_rewriter.h" #include "taco/index_notation/index_notation_nodes.h" +#include "taco/index_notation/index_notation_printer.h" #include "taco/error/error_messages.h" #include "taco/util/collections.h" #include "taco/lower/iterator.h" @@ -592,7 +594,10 @@ IndexStmt Parallelize::apply(IndexStmt stmt, std::string* reason) const { std::string reason = ""; IndexStmt rewriteParallel(IndexStmt stmt) { + std::cout << "1 rewriting IndexStmt to support parallelize schedule directive\n--------------------------------------------\n"; + std::cout << stmt << std::endl; provGraph = ProvenanceGraph(stmt); + std::cout << "2 rewriting IndexStmt to support parallelize schedule directive\n--------------------------------------------\n"; const auto reductionVars 
= getReductionVars(stmt); reductionIndexVars.clear(); @@ -607,15 +612,22 @@ IndexStmt Parallelize::apply(IndexStmt stmt, std::string* reason) const { tensorVars = createIRTensorVars(stmt); assembledByUngroupedInsert.clear(); + std::cout << "3 rewriting IndexStmt to support parallelize schedule directive\n--------------------------------------------\n"; for (const auto& result : getAssembledByUngroupedInsertion(stmt)) { assembledByUngroupedInsert.push_back(tensorVars[result]); } + std::cout << "4 rewriting IndexStmt to support parallelize schedule directive\n--------------------------------------------\n"; + std::cout << stmt << std::endl; return rewrite(stmt); } void visit(const ForallNode* node) { + std::cout << "transformations.cpp void visit(const ForallNode* node)\n"; + std::cout << "node: \n" << node << std::endl; Forall foralli(node); + std::cout << "foralli: \n" << foralli << std::endl; + std::cout << "before stmt update stmt: \n" << stmt << std::endl; IndexVar i = parallelize.geti(); definedIndexVars.insert(foralli.getIndexVar()); @@ -632,6 +644,7 @@ IndexStmt Parallelize::apply(IndexStmt stmt, std::string* reason) const { Iterators iterators(foralli, tensorVars); MergeLattice lattice = MergeLattice::make(foralli, iterators, provGraph, definedIndexVars); + std::cout << "iter: " << i << ", lattice: \n" << lattice << std::endl; // Precondition 2: No coiteration of modes (i.e., merge lattice has // only one iterator) @@ -660,6 +673,7 @@ IndexStmt Parallelize::apply(IndexStmt stmt, std::string* reason) const { MergeLattice underivedLattice = MergeLattice::make(underivedForall, iterators, provGraph, definedIndexVars); + std::cout << "iter: " << i << ", underivedLattice: \n" << lattice << std::endl; // Precondition 3: Every result iterator must have insert capability for (Iterator iterator : underivedLattice.results()) { @@ -721,6 +735,7 @@ IndexStmt Parallelize::apply(IndexStmt stmt, std::string* reason) const { // build consumer that writes from temporary to 
output, mark consumer as parallel reduction ParallelUnit reductionUnit = ParallelUnit::CPUThreadGroupReduction; if (should_use_CUDA_codegen()) { + std::cout << "should_use_CUDA_codegen() true\n"; if (parentParallelUnits.count(ParallelUnit::GPUWarp)) { reductionUnit = ParallelUnit::GPUWarpReduction; } @@ -728,6 +743,9 @@ IndexStmt Parallelize::apply(IndexStmt stmt, std::string* reason) const { reductionUnit = ParallelUnit::GPUBlockReduction; } } + else { + std::cout << "should_use_CUDA_codegen() false\n"; + } IndexStmt consumer = forall(i, Assignment(assignment->lhs, w(i), assignment->op), reductionUnit, OutputRaceStrategy::ParallelReduction); precomputed_stmt = where(consumer, producer); } @@ -746,8 +764,9 @@ IndexStmt Parallelize::apply(IndexStmt stmt, std::string* reason) const { return; } - + std::cout << "updated stmt: \n"; stmt = forall(i, foralli.getStmt(), parallelize.getParallelUnit(), parallelize.getOutputRaceStrategy(), foralli.getUnrollFactor()); + std::cout << stmt << std::endl; return; } @@ -1181,6 +1200,7 @@ std::ostream& operator<<(std::ostream& os, IndexStmt parallelizeOuterLoop(IndexStmt stmt) { // get outer ForAll + std::cout << "get outer ForAll ----------------- \n"; Forall forall; bool matched = false; match(stmt, @@ -1215,7 +1235,19 @@ IndexStmt parallelizeOuterLoop(IndexStmt stmt) { } return parallelized256; } + else if (should_use_ISPC_codegen()) { + std::cout << "outer loop parallelization for ISPC codegen\n"; + // IndexStmt parallelized = Parallelize(forall.getIndexVar(), ParallelUnit::CPUSpmd, OutputRaceStrategy::NoRaces).apply(stmt, &reason); + // if (parallelized == IndexStmt()) { + // // can't parallelize + // return stmt; + // } + // return parallelized; + + return stmt; + } else { + std::cout << "outer loop parallelization for CPU codgen index statement\n"; IndexStmt parallelized = Parallelize(forall.getIndexVar(), ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces).apply(stmt, &reason); if (parallelized == IndexStmt()) { // can't 
parallelize @@ -1320,8 +1352,25 @@ topologicallySort(map> hardDeps, return sortedVars; } +IndexStmt justTraverseThroughTheIndexStmt(IndexStmt stmt) { + struct IndexStatementTraverse : public IndexNotationPrinter { + IndexStatementTraverse(std::ostream& os) : IndexNotationPrinter(os) {}; + using IndexNotationPrinter::visit; + map forallParallelUnit; + map forallOutputRaceStrategy; + }; + + std::cout << "traversing through the index statement\n"; + IndexNotationPrinter printer(std::cout); + std::cout << std::endl; + stmt.accept(&printer); + return stmt; + +} + IndexStmt reorderLoopsTopologically(IndexStmt stmt) { + std::cout << "executing reorderLoopsTopologically\n"; // Collect tensorLevelVars which stores the pairs of IndexVar and tensor // level that each tensor is accessed at struct DAGBuilder : public IndexNotationVisitor { @@ -1384,6 +1433,8 @@ IndexStmt reorderLoopsTopologically(IndexStmt stmt) { Iterators iterators(stmt); DAGBuilder dagBuilder(iterators); stmt.accept(&dagBuilder); + std::cout << "After DAGBuilder\n"; + std::cout << stmt << std::endl; // Construct tensor dependencies (sorted list of IndexVars) from tensorLevelVars map>> tensorVarOrders; @@ -1414,6 +1465,8 @@ IndexStmt reorderLoopsTopologically(IndexStmt stmt) { }; CollectSoftDependencies collectSoftDeps; stmt.accept(&collectSoftDeps); + std::cout << "After CollectSoftDependencies\n"; + std::cout << stmt << std::endl; const auto sortedVars = topologicallySort(hardDeps, collectSoftDeps.softDeps, dagBuilder.indexVarOriginalOrder); @@ -1450,7 +1503,11 @@ IndexStmt reorderLoopsTopologically(IndexStmt stmt) { }; TopoReorderRewriter rewriter(sortedVars, dagBuilder.innerBody, dagBuilder.forallParallelUnit, dagBuilder.forallOutputRaceStrategy); - return rewriter.rewrite(stmt); + IndexStmt stmtChanged = rewriter.rewrite(stmt); + std::cout << "After TopoReorderRewriter\n"; + std::cout << stmtChanged << std::endl; + + return stmtChanged; } IndexStmt scalarPromote(IndexStmt stmt, ProvenanceGraph provGraph, 
@@ -1478,6 +1535,7 @@ IndexStmt scalarPromote(IndexStmt stmt, ProvenanceGraph provGraph, void visit(const ForallNode* node) { Forall foralli(node); + std::cout << "scalar promote: " << foralli << std::endl; IndexVar i = foralli.getIndexVar(); // Don't allow hoisting out of forall's for GPU warp and block reduction diff --git a/src/ir/ir_printer.cpp b/src/ir/ir_printer.cpp index f96251c5a..ba2bc894b 100644 --- a/src/ir/ir_printer.cpp +++ b/src/ir/ir_printer.cpp @@ -750,50 +750,7 @@ void IRPrinter::visit(const VarDecl* op) { void IRPrinter::visit(const Assign* op) { if (is_ISPC_code_stream_enabled()) { - doIndent(); - op->lhs.accept(this); - parentPrecedence = Precedence::TOP; - bool printed = false; - if (simplify) { - if (isa(op->rhs)) { - auto add = to(op->rhs); - if (add->a == op->lhs) { - const Literal* lit = add->b.as(); - if (lit != nullptr && ((lit->type.isInt() && lit->equalsScalar(1)) || - (lit->type.isUInt() && lit->equalsScalar(1)))) { - stream2 << "++"; - } - else { - stream2 << " += "; - add->b.accept(this); - } - printed = true; - } - } - else if (isa(op->rhs)) { - auto mul = to(op->rhs); - if (mul->a == op->lhs) { - stream2 << " *= "; - mul->b.accept(this); - printed = true; - } - } - else if (isa(op->rhs)) { - auto bitOr = to(op->rhs); - if (bitOr->a == op->lhs) { - stream2 << " |= "; - bitOr->b.accept(this); - printed = true; - } - } - } - if (!printed) { - stream2 << " = "; - op->rhs.accept(this); - } - stream2 << ";"; - stream2 << endl; } diff --git a/src/ir_tags.cpp b/src/ir_tags.cpp index af3dbd775..e7365d6c2 100644 --- a/src/ir_tags.cpp +++ b/src/ir_tags.cpp @@ -2,7 +2,7 @@ namespace taco { -const char *ParallelUnit_NAMES[] = {"NotParallel", "DefaultUnit", "GPUBlock", "GPUWarp", "GPUThread", "CPUThread", "CPUVector", "CPUThreadGroupReduction", "GPUBlockReduction", "GPUWarpReduction"}; +const char *ParallelUnit_NAMES[] = {"NotParallel", "DefaultUnit", "GPUBlock", "GPUWarp", "GPUThread", "CPUThread", "CPUVector", "CPUThreadGroupReduction", 
"GPUBlockReduction", "GPUWarpReduction", "CPUSimd", "CPUSpmd"}; const char *OutputRaceStrategy_NAMES[] = {"IgnoreRaces", "NoRaces", "Atomics", "Temporary", "ParallelReduction"}; const char *BoundType_NAMES[] = {"MinExact", "MinConstraint", "MaxExact", "MaxConstraint"}; const char *AssembleStrategy_NAMES[] = {"Append", "Insert"}; diff --git a/src/lower/lowerer_impl_imperative.cpp b/src/lower/lowerer_impl_imperative.cpp index 53ffd936f..28bd6c7c2 100644 --- a/src/lower/lowerer_impl_imperative.cpp +++ b/src/lower/lowerer_impl_imperative.cpp @@ -1,5 +1,6 @@ #include #include "taco/cuda.h" +#include "taco/ir_tags.h" #include "taco/lower/lowerer_impl_imperative.h" #include "taco/lower/lowerer_impl.h" @@ -417,6 +418,7 @@ LowererImplImperative::lower(IndexStmt stmt, string name, Stmt LowererImplImperative::lowerAssignment(Assignment assignment) { + std::cout << "\n\n converting assignment IndexStmt============================================ Assignment\n"; taco_iassert(generateAssembleCode() || generateComputeCode()); Stmt computeStmt; @@ -424,7 +426,7 @@ Stmt LowererImplImperative::lowerAssignment(Assignment assignment) Expr var = getTensorVar(result); const bool needComputeAssign = util::contains(needCompute, result); - + std::cout << "does assignment need compute assign: " << needComputeAssign << std::endl; Expr rhs; if (needComputeAssign) { rhs = lower(assignment.getRhs()); @@ -432,20 +434,51 @@ Stmt LowererImplImperative::lowerAssignment(Assignment assignment) // Assignment to scalar variables. 
if (isScalar(result.getType())) { + std::cout << "assignment to scalar variables\n"; if (needComputeAssign) { + std::cout << "compute assign\n"; if (!assignment.getOperator().defined()) { + std::cout << "assignment operator is not defined\n"; + std::cout << "var: " << var << ", rhs, : " << rhs << std::endl; computeStmt = Assign::make(var, rhs); } else { taco_iassert(isa(assignment.getOperator())); - bool useAtomics = markAssignsAtomicDepth > 0 && - !util::contains(whereTemps, result); + + std::cout << "assignment depth -- loopDepth: " << loopDepth << std::endl; + std::cout << "is markAssignsAtomicDepth > 0: " << (markAssignsAtomicDepth > 0) << std::endl; + for (auto &tensors_ : whereTemps) { + std::cout << tensors_ << ", "; + } + std::cout << std::endl; + std::cout << result << std::endl; + int tempVarInitLoopDepth = whereTempsWithLoopDepth.find(result)->second; + std::cout << "tempInitLoopDepth: " << tempVarInitLoopDepth << std::endl; + + bool reduction = false; + std::map::iterator itr; + for (itr = forUnits.begin(); itr!=forUnits.end(); ++itr) { + if (itr->first<=loopDepth && itr->first>tempVarInitLoopDepth && itr->second == ParallelUnit::CPUSimd) { + reduction = true; + } + std::cout << itr->first << "\t" << ParallelUnit_NAMES[(int) itr->second] << std::endl; + } + + // less than or equal to loopDepth but greater than temp variable initialized loop depth + bool useAtomics = markAssignsAtomicDepth > 0 && (!util::contains(whereTemps, result) || reduction); + std::cout << "whereTemps and result: " << !util::contains(whereTemps, result) << std::endl; + std::cout << "assignment to scalar variables useAtomics: " << useAtomics << std::endl; computeStmt = compoundAssign(var, rhs, useAtomics, atomicParallelUnit); + std::cout << "computeStatment: " << computeStmt << std::endl; } } + else { + std::cout << "not compute assign\n"; + } } // Assignments to tensor variables (non-scalar). 
else { + std::cout << "assignment to tensor variables\n"; Expr values = getValuesArray(result); Expr loc = generateValueLocExpr(assignment.getLhs()); @@ -479,6 +512,7 @@ Stmt LowererImplImperative::lowerAssignment(Assignment assignment) } if (needComputeAssign && values.defined()) { + std::cout << "assign compute statement\n"; if (!assignment.getOperator().defined()) { computeStmt = Store::make(values, loc, rhs); } @@ -595,9 +629,20 @@ LowererImplImperative::splitAppenderAndInserters(const vector& results */ Stmt LowererImplImperative::lowerForall(Forall forall) { + loopDepth++; + forUnits.insert(std::pair(loopDepth,forall.getParallelUnit())); std::cout << "doing lowerForall: " << forall << std::endl; bool hasExactBound = provGraph.hasExactBound(forall.getIndexVar()); bool forallNeedsUnderivedGuards = !hasExactBound && emitUnderivedGuards; + + + std::cout << "printing temporary variables with their atomic depths\n"; + map::iterator itr; + for (itr = whereTempsWithLoopDepth.begin(); itr != whereTempsWithLoopDepth.end(); ++itr) { + std::cout << itr->first << "\t" << itr->second << "\n"; + } + + if (!ignoreVectorize && forallNeedsUnderivedGuards && (forall.getParallelUnit() == ParallelUnit::CPUVector || forall.getUnrollFactor() > 0)) { @@ -852,6 +897,8 @@ Stmt LowererImplImperative::lowerForall(Forall forall) parallelUnitSizes.erase(forall.getParallelUnit()); } + forUnits.erase(loopDepth); + loopDepth--; return Block::blanks(preInitValues, temporaryValuesInitFree[0], loops, @@ -1157,12 +1204,18 @@ Stmt LowererImplImperative::lowerForallDimension(Forall forall, ir::Stmt recoveryStmt) { std::cout << "1 Stmt LowererImplImperative::lowerForallDimension\n"; + std::cout << "1 Stmt LowererImplImperative::lowerForallDimension markAssignsAtomicDepth: " << markAssignsAtomicDepth << std::endl; Expr coordinate = getCoordinateVar(forall.getIndexVar()); if (forall.getParallelUnit() != ParallelUnit::NotParallel && forall.getOutputRaceStrategy() == OutputRaceStrategy::Atomics) { 
markAssignsAtomicDepth++; + std::cout << "1 Stmt LowererImplImperative::lowerForallDimension getParallelUnit() is Not NotParallel and outputRaceStrategy is Atomics\n"; + std::cout << "markAssignsAtomicDepth: " << markAssignsAtomicDepth << std::endl; atomicParallelUnit = forall.getParallelUnit(); } + else { + std::cout << "1 Stmt LowererImplImperative::lowerForallDimension getParallelUnit() is NotParallel or outputRaceStrategy is not Atomics\n"; + } std::cout << "original forall : " << forall << std::endl; std::cout << "inside IndexStmt: " << forall.getStmt() << std::endl; @@ -1183,9 +1236,14 @@ Stmt LowererImplImperative::lowerForallDimension(Forall forall, LoopKind kind = LoopKind::Serial; if (should_use_ISPC_codegen()) { std::cout << "Foreach compatible loop\n"; - if (forall.getParallelUnit() == ParallelUnit::CPUVector && !ignoreVectorize) { + if (forall.getParallelUnit() == ParallelUnit::CPUSimd) { kind = LoopKind::Foreach; } + else if (forall.getParallelUnit() == ParallelUnit::CPUSpmd + && forall.getOutputRaceStrategy() != OutputRaceStrategy::ParallelReduction + ) { + kind = LoopKind::Mul_Thread; + } } else if (forall.getParallelUnit() == ParallelUnit::CPUVector && !ignoreVectorize) { kind = LoopKind::Vectorized; @@ -1250,7 +1308,7 @@ Stmt LowererImplImperative::lowerForallDimension(Forall forall, LoopKind kind = LoopKind::Serial; if (should_use_ISPC_codegen()) { - if (forall.getParallelUnit() == ParallelUnit::CPUVector && !ignoreVectorize) { + if (forall.getParallelUnit() == ParallelUnit::CPUSimd) { kind = LoopKind::Foreach; } } @@ -2201,6 +2259,7 @@ vector LowererImplImperative::codeToInitializeTemporary(Where where) { } Stmt LowererImplImperative::lowerWhere(Where where) { + std::cout << "\n--------------------------------------- lowering where statement: " << where << "\n\n\n"; TensorVar temporary = where.getTemporary(); bool accelerateDenseWorkSpace, sortAccelerator; std::tie(accelerateDenseWorkSpace, sortAccelerator) = @@ -2237,6 +2296,7 @@ Stmt 
LowererImplImperative::lowerWhere(Where where) { }) ); + std::cout << "\ninitiating lowering of where consumer: " << where.getConsumer() << std::endl; Stmt consumer = lower(where.getConsumer()); if (accelerateDenseWorkSpace && sortAccelerator) { // We need to sort the indices array @@ -2266,6 +2326,7 @@ Stmt LowererImplImperative::lowerWhere(Where where) { } whereConsumers.push_back(consumer); + std::cout << "\nwhere temporaries: " << where.getTemporary() << std::endl; whereTemps.push_back(where.getTemporary()); captureNextLocatePos = true; @@ -2276,6 +2337,9 @@ Stmt LowererImplImperative::lowerWhere(Where where) { restoreAtomicDepth = true; } + whereTempsWithLoopDepth.insert(std::pair(where.getTemporary(), loopDepth)); + + std::cout << "\ninitiating lowering of where producer: " << where.getConsumer() << std::endl; Stmt producer = lower(where.getProducer()); if (accelerateDenseWorkSpace) { const Expr indexListSizeExpr = tempToIndexListSize.at(temporary); @@ -2283,6 +2347,8 @@ Stmt LowererImplImperative::lowerWhere(Where where) { initializeTemporary = Block::make(indexListSizeDecl, initializeTemporary); } + whereTempsWithLoopDepth.erase(where.getTemporary()); + if (restoreAtomicDepth) { markAssignsAtomicDepth++; } diff --git a/src/tensor.cpp b/src/tensor.cpp index 3519456c9..dac2c3fd2 100644 --- a/src/tensor.cpp +++ b/src/tensor.cpp @@ -621,6 +621,7 @@ void TensorBase::compile() { IndexStmt stmt = makeConcreteNotation(makeReductionNotation(assignment)); stmt = reorderLoopsTopologically(stmt); stmt = insertTemporaries(stmt); + std::cout << "calling parallelizeOuterLoop(stmt)\n"; stmt = parallelizeOuterLoop(stmt); compile(stmt, content->assembleWhileCompute); } diff --git a/test/tests-scheduling-eval.cpp b/test/tests-scheduling-eval.cpp index 6a228f38b..93ba7b01e 100644 --- a/test/tests-scheduling-eval.cpp +++ b/test/tests-scheduling-eval.cpp @@ -65,14 +65,31 @@ IndexStmt scheduleSpMMCPU(IndexStmt stmt, Tensor A, int CHUNK_SIZE=16, i .parallelize(k, 
ParallelUnit::CPUVector, OutputRaceStrategy::IgnoreRaces); } -IndexStmt scheduleSpMMISPC(IndexStmt stmt, Tensor A, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { +IndexStmt scheduleSpMMISPC1(IndexStmt stmt, Tensor A, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { IndexVar i0("i0"), i1("i1"), kbounded("kbounded"), k0("k0"), k1("k1"), jpos("jpos"), jpos0("jpos0"), jpos1("jpos1"); return stmt.split(i, i0, i1, CHUNK_SIZE) .pos(j, jpos, A(i,j)) .split(jpos, jpos0, jpos1, UNROLL_FACTOR) .reorder({i0, i1, jpos0, k, jpos1}) .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces) - .parallelize(k, ParallelUnit::CPUVector, OutputRaceStrategy::IgnoreRaces); + .parallelize(k, ParallelUnit::CPUSimd, OutputRaceStrategy::IgnoreRaces); +} + +IndexStmt scheduleSpMMISPC2(IndexStmt stmt, Tensor A, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { + IndexVar i0("i0"), i1("i1"), kbounded("kbounded"), k0("k0"), k1("k1"), jpos("jpos"), jpos0("jpos0"), jpos1("jpos1"); + return stmt + .parallelize(k, ParallelUnit::CPUSimd, OutputRaceStrategy::IgnoreRaces); +} + +IndexStmt scheduleSpMMISPC3(IndexStmt stmt, Tensor A, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { + IndexVar i0("i0"), i1("i1"), kbounded("kbounded"), k0("k0"), k1("k1"), jpos("jpos"), jpos0("jpos0"), jpos1("jpos1"); + return stmt + // .split(i, i0, i1, CHUNK_SIZE) + // .pos(j, jpos, A(i,j)) + // .split(jpos, jpos0, jpos1, UNROLL_FACTOR) + .reorder({j, k}) + .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces) + .parallelize(k, ParallelUnit::CPUSimd, OutputRaceStrategy::IgnoreRaces); } IndexStmt scheduleSpGEMMCPU(IndexStmt stmt, bool doPrecompute) { @@ -128,6 +145,27 @@ IndexStmt scheduleSDDMMCPU(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, .parallelize(kpos1, ParallelUnit::CPUVector, OutputRaceStrategy::ParallelReduction); } +IndexStmt scheduleSDDMMISPC1(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { + IndexVar i0("i0"), i1("i1"), kpos("kpos"), kpos0("kpos0"), kpos1("kpos1"); + return 
stmt.split(i, i0, i1, CHUNK_SIZE) + .pos(k, kpos, B(i,k)) + .split(kpos, kpos0, kpos1, UNROLL_FACTOR) + .reorder({i0, i1, kpos0, j, kpos1}) + .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces) + .parallelize(kpos1, ParallelUnit::CPUSimd, OutputRaceStrategy::ParallelReduction); +} + +IndexStmt scheduleSDDMMISPC2(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { + IndexVar i0("i0"), i1("i1"), kpos("kpos"), kpos0("kpos0"), kpos1("kpos1"); + return stmt; + // .split(i, i0, i1, CHUNK_SIZE) + // .pos(k, kpos, B(i,k)) + // .split(kpos, kpos0, kpos1, UNROLL_FACTOR) + // .reorder({i0, i1, kpos0, j, kpos1}) + // .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces) + // .parallelize(kpos1, ParallelUnit::CPUSimd, OutputRaceStrategy::ParallelReduction); +} + IndexStmt scheduleTTVCPU(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16) { IndexVar f("f"), fpos("fpos"), chunk("chunk"), fpos2("fpos2"); return stmt.fuse(i, j, f) @@ -1550,24 +1588,80 @@ TEST(generate_ispc_evaluation_files, ispc) { stringstream source2; std::shared_ptr codegen = ir::CodeGen::init_default(source1, source2, ir::CodeGen::ImplementationGen); Tensor A("A", {NUM_I, NUM_J}, CSR); - Tensor B("B", {NUM_J, NUM_K}, {Dense, Dense}); - Tensor C("C", {NUM_I, NUM_K}, {Dense, Dense}); - C(i, k) = A(i, j) * B(j, k); - IndexStmt stmt = C.getAssignment().concretize(); + Tensor X("X", {NUM_J, NUM_K}, {Dense, Dense}); + Tensor Y("Y", {NUM_I, NUM_K}, {Dense, Dense}); + Y(i, k) = A(i, j) * X(j, k); + IndexStmt stmt = Y.getAssignment().concretize(); bool isFirst = true; for (auto paramSet : spmm_parameters) { - IndexStmt scheduled = scheduleSpMMISPC(stmt, A, paramSet[0], paramSet[1]); - ir::Stmt compute = lower(scheduled, string("compute_") + util::join(paramSet, "_"), false, true); + IndexStmt scheduled = scheduleSpMMISPC1(stmt, A, paramSet[0], paramSet[1]); + ir::Stmt compute = lower(scheduled, string("compute1_") + util::join(paramSet, "_"), false, true); + 
codegen->compile(compute, isFirst); + isFirst = false; + } + ofstream source_file; + source_file.open(file_path + "spmm_csr_ispc_taco1" + c_file_ending); + source_file << source1.str(); + source_file.close(); + + ofstream ispc_source_file; + ispc_source_file.open(file_path + "__spmm_csr_ispc_taco1" + file_ending); + ispc_source_file << source2.str(); + ispc_source_file.close(); + } + + // spmm + { + stringstream source1; + stringstream source2; + std::shared_ptr codegen = ir::CodeGen::init_default(source1, source2, ir::CodeGen::ImplementationGen); + Tensor A("A", {NUM_I, NUM_J}, CSR); + Tensor X("X", {NUM_J, NUM_K}, {Dense, Dense}); + Tensor Y("Y", {NUM_I, NUM_K}, {Dense, Dense}); + Y(i, k) = A(i, j) * X(j, k); + IndexStmt stmt = Y.getAssignment().concretize(); + bool isFirst = true; + for (auto paramSet : spmm_parameters) { + IndexStmt scheduled = scheduleSpMMISPC2(stmt, A, paramSet[0], paramSet[1]); + ir::Stmt compute = lower(scheduled, string("compute2_") + util::join(paramSet, "_"), false, true); + codegen->compile(compute, isFirst); + isFirst = false; + } + ofstream source_file; + source_file.open(file_path + "spmm_csr_ispc_taco2" + c_file_ending); + source_file << source1.str(); + source_file.close(); + + ofstream ispc_source_file; + ispc_source_file.open(file_path + "__spmm_csr_ispc_taco2" + file_ending); + ispc_source_file << source2.str(); + ispc_source_file.close(); + } + + // spmm + { + stringstream source1; + stringstream source2; + std::shared_ptr codegen = ir::CodeGen::init_default(source1, source2, ir::CodeGen::ImplementationGen); + Tensor A("A", {NUM_I, NUM_J}, CSR); + Tensor X("X", {NUM_J, NUM_K}, {Dense, Dense}); + Tensor Y("Y", {NUM_I, NUM_K}, {Dense, Dense}); + Y(i, k) = A(i, j) * X(j, k); + IndexStmt stmt = Y.getAssignment().concretize(); + bool isFirst = true; + for (auto paramSet : spmm_parameters) { + IndexStmt scheduled = scheduleSpMMISPC3(stmt, A, paramSet[0], paramSet[1]); + ir::Stmt compute = lower(scheduled, string("compute3_") + 
util::join(paramSet, "_"), false, true); codegen->compile(compute, isFirst); isFirst = false; } ofstream source_file; - source_file.open(file_path + "spmm_csr_ispc_taco" + c_file_ending); + source_file.open(file_path + "spmm_csr_ispc_taco3" + c_file_ending); source_file << source1.str(); source_file.close(); ofstream ispc_source_file; - ispc_source_file.open(file_path + "__spmm_csr_ispc_taco" + file_ending); + ispc_source_file.open(file_path + "__spmm_csr_ispc_taco3" + file_ending); ispc_source_file << source2.str(); ispc_source_file.close(); } @@ -1576,6 +1670,99 @@ TEST(generate_ispc_evaluation_files, ispc) { return; } + + +TEST(generate_ispc_sddmm_evaluation_files, ispc) { + std::cout << "Hi Adhitha!\n" << std::endl ; + set_CUDA_codegen_enabled(false); + set_ISPC_codegen_enabled(true); + + vector> spmv_parameters = {{32}}; + vector> spmspv_parameters = {{8}}; + + // 4 to 512 and 4, 8, 16 + vector> spmm_dcsr_parameters = {{16, 8}}; + vector> spmm_parameters = {{16,4}}; + + vector> mttkrp_parameters = {}; + mttkrp_parameters.push_back({64,0}); + + vector> sddmm_parameters = {{8, 8}}; + vector> ttv_parameters = {{32}}; + + int NUM_I = 100; + int NUM_J = 100; + int NUM_K = 100; + + string c_file_ending = ".h"; + string file_ending = ".ispc"; + string file_path = "eval_prepared_ispc/sddmm/"; + mkdir(file_path.c_str(), 0777); + + // sddmm + { + stringstream source1; + stringstream source2; + std::shared_ptr codegen = ir::CodeGen::init_default(source1, source2, ir::CodeGen::ImplementationGen); + Tensor A("A", {NUM_I, NUM_K}, {Dense, Dense}); + Tensor B("B", {NUM_I, NUM_K}, CSR); + Tensor C("C", {NUM_I, NUM_J}, {Dense, Dense}); + Tensor D("D", {NUM_J, NUM_K}, {Dense, Dense}); + A(i,k) = B(i,k) * C(i,j) * D(j,k); + IndexStmt stmt = A.getAssignment().concretize(); + bool isFirst = true; + for (auto paramSet : sddmm_parameters) { + IndexStmt scheduled = scheduleSDDMMISPC1(stmt, B, paramSet[0], paramSet[1]); + ir::Stmt compute = lower(scheduled, string("compute1_") + 
util::join(paramSet, "_"), false, true); + codegen->compile(compute, isFirst); + isFirst = false; + } + ofstream source_file; + source_file.open(file_path + "sddmm_cpu_ispc_taco1" + file_ending); + source_file << source1.str(); + source_file.close(); + + ofstream ispc_source_file; + ispc_source_file.open(file_path + "__sddmm_cpu_ispc_taco1" + file_ending); + ispc_source_file << source2.str(); + ispc_source_file.close(); + } + + + // sddmm + { + stringstream source1; + stringstream source2; + std::shared_ptr codegen = ir::CodeGen::init_default(source1, source2, ir::CodeGen::ImplementationGen); + Tensor Y("Y", {NUM_I, NUM_K}, {Dense, Dense}); + Tensor A("A", {NUM_I, NUM_K}, CSR); + Tensor X("X", {NUM_I, NUM_J}, {Dense, Dense}); + Y(i,j) = A(i,j) * X(i,k) * X(j,k); + IndexStmt stmt = Y.getAssignment().concretize(); + bool isFirst = true; + for (auto paramSet : sddmm_parameters) { + IndexStmt scheduled = scheduleSDDMMISPC2(stmt, A, paramSet[0], paramSet[1]); + ir::Stmt compute = lower(scheduled, string("compute2_") + util::join(paramSet, "_"), false, true); + codegen->compile(compute, isFirst); + isFirst = false; + } + ofstream source_file; + source_file.open(file_path + "sddmm_cpu_ispc_taco2" + file_ending); + source_file << source1.str(); + source_file.close(); + + ofstream ispc_source_file; + ispc_source_file.open(file_path + "__sddmm_cpu_ispc_taco2" + file_ending); + ispc_source_file << source2.str(); + ispc_source_file.close(); + } + + + return; +} + + + TEST(generate_evaluation_files, cpu) { if (should_use_CUDA_codegen()) { return; diff --git a/tools/taco.cpp b/tools/taco.cpp index 9a864a699..bf7e7c9dc 100644 --- a/tools/taco.cpp +++ b/tools/taco.cpp @@ -265,7 +265,7 @@ static void printSchedulingHelp() { "an output race strategy `strat`. Since the other transformations " "expect serial code, parallelize must come last in a series of " "transformations. Possible parallel hardware units are: " - "NotParallel, GPUBlock, GPUWarp, GPUThread, CPUThread, CPUVector. 
" + "NotParallel, GPUBlock, GPUWarp, GPUThread, CPUThread, CPUVector, CPUSimd, CPUSimd. " "Possible output race strategies are: " "IgnoreRaces, NoRaces, Atomics, Temporary, ParallelReduction."); } @@ -313,7 +313,8 @@ static void printCommandLine(ostream& os, int argc, char* argv[]) { } } -static bool setSchedulingCommands(vector> scheduleCommands, parser::Parser& parser, IndexStmt& stmt) { +static int setSchedulingCommands(vector> scheduleCommands, parser::Parser& parser, IndexStmt& stmt) { + std::cout << "setting scheduling commands\n"; auto findVar = [&stmt](string name) { ProvenanceGraph graph(stmt); for (auto v : graph.getAllIndexVars()) { @@ -326,9 +327,15 @@ static bool setSchedulingCommands(vector> scheduleCommands, parse abort(); // to silence a warning: control reaches end of non-void function }; - bool isGPU = false; + int isGPU = 0; + int isISPC = 0; for(vector scheduleCommand : scheduleCommands) { + std::cout << "running schedluing command: "; + for (auto &command : scheduleCommand) { + std::cout << command << " "; + } + std::cout << std::endl; string command = scheduleCommand[0]; scheduleCommand.erase(scheduleCommand.begin()); @@ -541,7 +548,15 @@ static bool setSchedulingCommands(vector> scheduleCommands, parse parallel_unit = ParallelUnit::CPUThread; } else if (unit == "CPUVector") { parallel_unit = ParallelUnit::CPUVector; - } else { + } else if (unit == "CPUSimd") { + isISPC = true; + parallel_unit = ParallelUnit::CPUSimd; + } + else if (unit == "CPUSpmd") { + parallel_unit = ParallelUnit::CPUSpmd; + isISPC = true; + } + else { taco_uerror << "Parallel hardware not defined."; goto end; } @@ -562,6 +577,8 @@ static bool setSchedulingCommands(vector> scheduleCommands, parse goto end; } + std::cout << "stmt before parallelizing the statement: " << stmt << endl; + std::cout << "ParallelUnit: " << ParallelUnit_NAMES[(int) parallel_unit] << ", outputRaceStrategy: " << OutputRaceStrategy_NAMES[(int) output_race_strategy] << std::endl; stmt = 
stmt.parallelize(findVar(i), parallel_unit, output_race_strategy); } else if (command == "assemble") { @@ -617,7 +634,13 @@ static bool setSchedulingCommands(vector> scheduleCommands, parse end:; } - return isGPU; + if (isGPU) { + return 1; + } + else if (isISPC) { + return 2; + } + return 0; } int main(int argc, char* argv[]) { @@ -1011,6 +1034,8 @@ int main(int argc, char* argv[]) { } } + std::cout << "cuda: " << cuda << ", ispc: " << ispc << std::endl; + // Print compute is the default if nothing else was asked for if (!printAssemble && !printEvaluate && !printIterationGraph && !writeCompute && !writeAssemble && !writeKernels && !readKernels && @@ -1019,6 +1044,7 @@ int main(int argc, char* argv[]) { } // pre-parse expression, to determine existence and order of loaded tensors + std::cout << "pre-parse expression, to determine existence and order of loaded tensors\n"; map loadedTensors; TensorBase temp_tensor; parser::Parser temp_parser(exprStr, formats, dataTypes, tensorsDimensions, loadedTensors, 42); @@ -1124,15 +1150,22 @@ int main(int argc, char* argv[]) { IndexStmt stmt = makeConcreteNotation(makeReductionNotation(tensor.getAssignment())); + std::cout << "concrete index statement: " << stmt << std::endl; + stmt = justTraverseThroughTheIndexStmt(stmt); stmt = reorderLoopsTopologically(stmt); + std::cout << "topologically reordered loops statement: " << stmt << std::endl; if (setSchedule) { - cuda |= setSchedulingCommands(scheduleCommands, parser, stmt); + int val = setSchedulingCommands(scheduleCommands, parser, stmt); + cuda |= (val==1); + ispc |= (val==2); } else { stmt = insertTemporaries(stmt); stmt = parallelizeOuterLoop(stmt); } + std::cout << "after setting the scheduling commands\n"; + std::cout << stmt << std::endl; if (cuda) { if (!CUDA_BUILT && benchmark) { @@ -1153,6 +1186,7 @@ int main(int argc, char* argv[]) { set_ISPC_codegen_enabled(false); } + std::cout << "running scalar promote\n" << std::endl; stmt = scalarPromote(stmt); if 
(printConcrete) { cout << stmt << endl; @@ -1240,6 +1274,7 @@ int main(int argc, char* argv[]) { } } else { + std::cout << "lowering stmt: " << stmt << std::endl; compute = lower(stmt, prefix+"compute", computeWithAssemble, true); assemble = lower(stmt, prefix+"assemble", true, false); evaluate = lower(stmt, prefix+"evaluate", true, true); From 0a4169728d9d6bcdfc1b1dabc40a0daf7e7e1e0a Mon Sep 17 00:00:00 2001 From: Adhhitha Dias Date: Mon, 26 Jul 2021 19:43:37 -0400 Subject: [PATCH 05/16] add tests and ispc compilation --- include/taco/codegen/module.h | 1 + src/codegen/codegen.cpp | 4 +- src/codegen/codegen_ispc.cpp | 249 ++++++++++---- src/codegen/codegen_ispc.h | 4 +- src/codegen/module.cpp | 79 ++++- src/tensor.cpp | 6 +- taco-uml.wsd | 411 +++++++++++++++++++++++ test/test.cpp | 14 + test/test.h | 1 + test/tests-scheduling-eval.cpp | 575 ++++++++++++++++++++++++++++++++- 10 files changed, 1263 insertions(+), 81 deletions(-) create mode 100644 taco-uml.wsd diff --git a/include/taco/codegen/module.h b/include/taco/codegen/module.h index 36eb34f1a..3df7c8e0f 100644 --- a/include/taco/codegen/module.h +++ b/include/taco/codegen/module.h @@ -68,6 +68,7 @@ class Module { private: std::stringstream source; + std::stringstream additional_source; std::stringstream header; std::string libname; std::string tmpdir; diff --git a/src/codegen/codegen.cpp b/src/codegen/codegen.cpp index 7081bc195..6ec54a2f8 100644 --- a/src/codegen/codegen.cpp +++ b/src/codegen/codegen.cpp @@ -265,9 +265,9 @@ string CodeGen::getUnpackedTensorArgument(string varname, const GetProperty* op, // all others are int* if (op->property == TensorProperty::Dimension) { if (op->type == Int32) { - ret << "int32 "; + ret << "uniform int32 "; } else if (op->type == Int64) { - ret << "int64 "; + ret << "uniform int64 "; } else { ret << "int "; } diff --git a/src/codegen/codegen_ispc.cpp b/src/codegen/codegen_ispc.cpp index c8480cd25..237bc822d 100644 --- a/src/codegen/codegen_ispc.cpp +++ 
b/src/codegen/codegen_ispc.cpp @@ -6,10 +6,12 @@ #include #include "taco/cuda.h" +#include "taco/ir/ir_printer.h" #include "taco/ir/ir_visitor.h" #include "taco/ir/ir_rewriter.h" #include "taco/ir/simplify.h" +#include "codegen_c.h" #include "codegen_ispc.h" #include "taco/error.h" #include "taco/util/strings.h" @@ -295,6 +297,7 @@ class CodeGen_ISPC::DeviceFunctionCollector : public IRVisitor { virtual void visit(const For *op) { if (op->parallel_unit == ParallelUnit::CPUSpmd) { std::cout << "ParallelUnit::CPUSpmd directive found\n"; + inDeviceFunction = false; op->var.accept(this); inDeviceFunction = true; @@ -380,6 +383,8 @@ void CodeGen_ISPC::compile(Stmt stmt, bool isFirst) { stmt.accept(this); } + + string CodeGen_ISPC::printCallISPCFunc(const Function *func, map varMap, vector &sortedProps) { std::stringstream ret; @@ -388,9 +393,6 @@ string CodeGen_ISPC::printCallISPCFunc(const Function *func, mapname << "("; - vector inputs = func->inputs; - vector outputs = func->outputs; - getSortedProps(varMap, sortedProps, inputs, outputs); for (unsigned long i=0; i < sortedProps.size(); i++) { ret << varMap[sortedProps[i]]; @@ -404,50 +406,123 @@ string CodeGen_ISPC::printCallISPCFunc(const Function *func, map varMap, +// varMap is already sorted <- make sure to pass the sorted varMap +void CodeGen_ISPC::printISPCFunc(const Function *func, map varMap, vector &sortedProps) { DeviceFunctionCollector deviceFunctionCollector(func->inputs, func->outputs, this); func->body.accept(&deviceFunctionCollector); - - std::stringstream ret; - ret << "export void "; - unordered_set propsAlreadyGenerated; - - ret << "__" << func->name << "("; - + std::stringstream variables; vector inputs = func->inputs; vector outputs = func->outputs; - // getSortedProps(varMap, sortedProps, inputs, outputs); + unordered_set propsAlreadyGenerated; - for (unsigned long i=0; i < sortedProps.size(); i++) { - auto prop = sortedProps[i]; - bool isOutputProp = (find(outputs.begin(), outputs.end(), - 
prop->tensor) != outputs.end()); - - auto var = prop->tensor.as(); - if (var->is_parameter) { - if (isOutputProp) { - ret << " " << printTensorProperty(varMap[prop], prop, false) << ";" << endl; + for (unsigned long i=0; i < sortedProps.size(); i++) { + auto prop = sortedProps[i]; + bool isOutputProp = (find(outputs.begin(), outputs.end(), + prop->tensor) != outputs.end()); + + auto var = prop->tensor.as(); + if (var->is_parameter) { + if (isOutputProp) { + variables << " " << printTensorProperty(varMap[prop], prop, false) << ";" << endl; + } else { + break; + } } else { - break; + variables << getUnpackedTensorArgument(varMap[prop], prop, isOutputProp); } - } else { - ret << getUnpackedTensorArgument(varMap[prop], prop, isOutputProp); - } - propsAlreadyGenerated.insert(varMap[prop]); + propsAlreadyGenerated.insert(varMap[prop]); - if (i!=sortedProps.size()-1) { - ret << ", "; - } - if (i%2==0) { - ret << "\n\t"; + if (i!=sortedProps.size()-1) { + variables << ", "; + } + if (i%2==0) { + variables << "\n\t"; + } } + + resetUniqueNameCounters(); + for (size_t i = 0; i < deviceFunctionCollector.threadFors.size(); i++) { + + const For *threadloop = to(deviceFunctionCollector.threadFors[i]); + taco_iassert(threadloop->parallel_unit == ParallelUnit::CPUSpmd); + Stmt function = threadloop->contents; + std::cout << "threadloop function: " << function << std::endl; + + out2 << "static task void __" << func->name << "__ ("; + out2 << variables.str(); + out2 << "\n) {\n\n"; + + indent++; + doIndent(); + // output body + print(threadloop); + indent--; + out2 << "}\n"; + + out2 << "export void __" << func->name << "("; + out2 << variables.str(); + out2 << "\n) {\n\n"; + indent++; + doIndent(); + out2 << "launch[4] " << printCallISPCFunc(func, varMap, sortedProps) << "\n"; + indent--; + out2 << "}\n"; + } - ret << "\n) {\n\n"; - return ret.str(); + if (deviceFunctionCollector.threadFors.size()==0) { + out2 << "export void __" << func->name << " ("; + out2 << variables.str(); + 
out2 << "\n) {\n\n"; + + indent++; + doIndent(); + // output body + print(func->body); + indent--; + out2 << "}\n"; + } + + // out2 << "export void "; + + // out2 << "__" << func->name << "("; + + // for (unsigned long i=0; i < sortedProps.size(); i++) { + // auto prop = sortedProps[i]; + // bool isOutputProp = (find(outputs.begin(), outputs.end(), + // prop->tensor) != outputs.end()); + + // auto var = prop->tensor.as(); + // if (var->is_parameter) { + // if (isOutputProp) { + // out2 << " " << printTensorProperty(varMap[prop], prop, false) << ";" << endl; + // } else { + // break; + // } + // } else { + // out2 << getUnpackedTensorArgument(varMap[prop], prop, isOutputProp); + // } + // propsAlreadyGenerated.insert(varMap[prop]); + + // if (i!=sortedProps.size()-1) { + // out2 << ", "; + // } + // if (i%2==0) { + // out2 << "\n\t"; + // } + // } + // out2 << "\n) {\n\n"; + + // indent++; + // doIndent(); + // // output body + // print(func->body); + // indent--; + // out2 << "}\n"; + } void CodeGen_ISPC::sendToStream(std::stringstream &stream) { @@ -461,6 +536,75 @@ void CodeGen_ISPC::sendToStream(std::stringstream &stream) { void CodeGen_ISPC::visit(const Function* func) { // if generating a header, protect the function declaration with a guard + if (func->name == "assemble") { + if (outputKind == HeaderGen) { + out << "#ifndef TACO_GENERATED_" << func->name << "\n"; + out << "#define TACO_GENERATED_" << func->name << "\n"; + } + + int numYields = countYields(func); + emittingCoroutine = (numYields > 0); + funcName = func->name; + labelCount = 0; + + resetUniqueNameCounters(); + FindVars inputVarFinder(func->inputs, {}, this); + func->body.accept(&inputVarFinder); + FindVars outputVarFinder({}, func->outputs, this); + func->body.accept(&outputVarFinder); + + // output function declaration + doIndent(); + out << printFuncName(func, inputVarFinder.varDecls, outputVarFinder.varDecls); + + // if we're just generating a header, this is all we need to do + if 
(outputKind == HeaderGen) { + out << ";\n"; + out << "#endif\n"; + return; + } + + out << " {\n"; + + indent++; + + // find all the vars that are not inputs or outputs and declare them + resetUniqueNameCounters(); + FindVars varFinder(func->inputs, func->outputs, this); + func->body.accept(&varFinder); + varMap = varFinder.varMap; + localVars = varFinder.localVars; + + // Print variable declarations + out << printDecls(varFinder.varDecls, func->inputs, func->outputs) << endl; + + if (emittingCoroutine) { + out << printContextDeclAndInit(varMap, localVars, numYields, func->name) + << endl; + } + + // output body + print(func->body); + + // output repack only if we allocated memory + if (checkForAlloc(func)) + out << endl << printPack(varFinder.outputProperties, func->outputs); + + if (emittingCoroutine) { + out << printCoroutineFinish(numYields, funcName); + } + + doIndent(); + out << "return 0;\n"; + indent--; + + doIndent(); + out << "}\n"; + return; + + } + + if (outputKind == HeaderGen) { out << "#ifndef TACO_GENERATED_" << func->name << "\n"; out << "#define TACO_GENERATED_" << func->name << "\n"; @@ -503,6 +647,9 @@ void CodeGen_ISPC::visit(const Function* func) { out << printDecls(varFinder.varDecls, func->inputs, func->outputs) << endl; vector sortedProps; + vector inputs = func->inputs; + vector outputs = func->outputs; + getSortedProps(varFinder.varDecls, sortedProps, inputs, outputs); out << printCallISPCFunc(func, varFinder.varDecls, sortedProps); if (emittingCoroutine) { @@ -526,13 +673,7 @@ void CodeGen_ISPC::visit(const Function* func) { out << "}\n\n"; set_ISPC_code_stream_enabled(true); - out2 << printISPCFunc(func, varFinder.varDecls, sortedProps); - indent++; - doIndent(); - // output body - print(func->body); - indent--; - out2 << "}\n"; + printISPCFunc(func, varFinder.varDecls, sortedProps); set_ISPC_code_stream_enabled(false); } @@ -655,20 +796,20 @@ void CodeGen_ISPC::visit(const For* op) { case LoopKind::Runtime: case 
LoopKind::Static_Chunked: case LoopKind::Mul_Thread: - op->start.accept(this); - stream2 << std::endl; - op->start.accept(this); - stream2 << std::endl; - op->start.accept(this); - stream2 << std::endl; - op->start.accept(this); - stream2 << std::endl; - op->end.accept(this); - stream2 << std::endl; - op->end.accept(this); - stream2 << std::endl; - op->end.accept(this); - stream2 << std::endl; + // op->start.accept(this); + // stream2 << std::endl; + // op->start.accept(this); + // stream2 << std::endl; + // op->start.accept(this); + // stream2 << std::endl; + // op->start.accept(this); + // stream2 << std::endl; + // op->end.accept(this); + // stream2 << std::endl; + // op->end.accept(this); + // stream2 << std::endl; + // op->end.accept(this); + // stream2 << std::endl; default: break; } diff --git a/src/codegen/codegen_ispc.h b/src/codegen/codegen_ispc.h index 279d0db7a..08e73b252 100644 --- a/src/codegen/codegen_ispc.h +++ b/src/codegen/codegen_ispc.h @@ -5,7 +5,7 @@ #include "taco/ir/ir.h" #include "taco/ir/ir_printer.h" -#include "codegen.h" +#include "codegen_c.h" namespace taco { namespace ir { @@ -46,7 +46,7 @@ class CodeGen_ISPC : public CodeGen { Stmt simplifyFunctionBodies(Stmt stmt); std::string printCallISPCFunc(const Function *func, std::map varMap, std::vector &sortedProps); - std::string printISPCFunc(const Function *func, std::map varMap, + void printISPCFunc(const Function *func, std::map varMap, std::vector &sortedProps); std::map varMap; diff --git a/src/codegen/module.cpp b/src/codegen/module.cpp index d9cbe2edc..82b736a13 100644 --- a/src/codegen/module.cpp +++ b/src/codegen/module.cpp @@ -43,6 +43,7 @@ void Module::addFunction(Stmt func) { void Module::compileToSource(string path, string prefix) { if (!moduleFromUserSource) { + std::cout << "module not from user source\n"; // create a codegen instance and add all the funcs bool didGenRuntime = false; @@ -51,11 +52,13 @@ void Module::compileToSource(string path, string prefix) { 
header.clear(); source.str(""); source.clear(); + additional_source.str(""); + additional_source.clear(); taco_tassert(target.arch == Target::C99) << "Only C99 codegen supported currently"; std::shared_ptr sourcegen = - CodeGen::init_default(source, CodeGen::ImplementationGen); + CodeGen::init_default(source, additional_source, CodeGen::ImplementationGen); std::shared_ptr headergen = CodeGen::init_default(header, CodeGen::HeaderGen); @@ -69,8 +72,17 @@ void Module::compileToSource(string path, string prefix) { ofstream source_file; string file_ending = should_use_CUDA_codegen() ? ".cu" : ".c"; source_file.open(path+prefix+file_ending); + if (should_use_ISPC_codegen()) { + source_file << "#include \"" << path+prefix+"_ispc.h\"\n"; + } source_file << source.str(); source_file.close(); + + ofstream additional_source_file; + string file_ending2 = ".ispc"; + additional_source_file.open(path+prefix+file_ending2); + additional_source_file << additional_source.str(); + additional_source_file.close(); ofstream header_file; header_file.open(path+prefix+".h"); @@ -90,9 +102,9 @@ void writeShims(vector funcs, string path, string prefix) { if (should_use_CUDA_codegen()) { CodeGen_CUDA::generateShim(func, shims); } - else if (should_use_ISPC_codegen()) { - CodeGen_ISPC::generateShim(func, shims); - } + // else if (should_use_ISPC_codegen()) { + // CodeGen_ISPC::generateShim(func, shims); + // } else { CodeGen_C::generateShim(func, shims); } @@ -102,9 +114,9 @@ void writeShims(vector funcs, string path, string prefix) { if (should_use_CUDA_codegen()) { shims_file.open(path+prefix+"_shims.cpp"); } - else if (should_use_ISPC_codegen()) { - shims_file.open(path+prefix+".ispc", ios::app); - } + // else if (should_use_ISPC_codegen()) { + // shims_file.open(path+prefix+".c", ios::app); + // } else { shims_file.open(path+prefix+".c", ios::app); } @@ -131,12 +143,13 @@ string Module::compile() { file_ending = ".cu"; shims_file = prefix + "_shims.cpp"; } - else if 
(should_use_ISPC_codegen()) { - cc = util::getFromEnv(target.compiler_env, target.compiler); - cflags = util::getFromEnv("TACO_CFLAGS", - "-O3 -ffast-math -std=c99") + " -shared -fPIC"; - - } + // else if (should_use_ISPC_codegen()) { + // cc = util::getFromEnv("TACO_ISPC", "ispc"); + // cflags = util::getFromEnv("TACO_ISPC_FLAGS", + // " --target=sse2-i32x4,sse4-i32x8,avx1-i32x8,avx2-i32x8,avx512knl-i32x16,avx512skx-i32x16 --pic -O3 --addressing=64 --arch=x86-64" + // ) + " "; + + // } else { cc = util::getFromEnv(target.compiler_env, target.compiler); cflags = util::getFromEnv("TACO_CFLAGS", @@ -151,9 +164,15 @@ string Module::compile() { string cmd = cc + " " + cflags + " " + prefix + file_ending + " " + shims_file + " " + "-o " + fullpath + " -lm"; + std::cout << "--------------------------------------------------------------------------------tmpdir: " << tmpdir << std::endl; + std::cout << "--------------------------------------------------------------------------------libname: " << libname << std::endl; + std::cout << "--------------------------------------------------------------------------------prefix: " << prefix << std::endl; + std::cout << "--------------------------------------------------------------------------------fullpath: " << fullpath << std::endl; + std::cout << "--------------------------------------------------------------------------------cmd: " << cmd << std::endl; // open the output file & write out the source compileToSource(tmpdir, libname); + // write out the shims writeShims(funcs, tmpdir, libname); @@ -164,10 +183,36 @@ string Module::compile() { } std::cout << tmpdir << std::endl << libname << std::endl; - // now compile it - int err = system(cmd.data()); - taco_uassert(err == 0) << "Compilation command failed:\n" << cmd - << "\nreturned " << err; + if (should_use_ISPC_codegen()) { + string ispc = util::getFromEnv("TACO_ISPC", "ispc"); + string ispcflags = util::getFromEnv("TACO_ISPC_FLAGS", + " 
--target=sse2-i32x4,sse4-i32x8,avx1-i32x8,avx2-i32x8,avx512knl-i32x16,avx512skx-i32x16 --pic -O3 --addressing=64 --arch=x86-64" + ) + " "; + string cmd = ispc + " " + ispcflags + " -o " + prefix + ".ispc.o " + " --emit-obj " + prefix + ".ispc " + "-h " + prefix + "_ispc.h"; + + // now compile the ispc file to generate the object file and the ispc header file + std::cout << "--------------------------------------------------------------------------------cmd: " << cmd << std::endl; + int err = system(cmd.data()); + taco_uassert(err == 0) << "Compilation command failed:\n" << cmd + << "\nreturned " << err; + + string ispc_object_file = " " + prefix + ".ispc.o "; + string ispc_object_files_for_diff_targets = " " + prefix + ".ispc_* "; + cmd = cc + " " + cflags + " " + + prefix + file_ending + " " + ispc_object_file + ispc_object_files_for_diff_targets + shims_file + " " + + "-o " + fullpath + " -lm -lrt "; + + // now compile the c file linking the ispc object file. ispc header is added to the top of the c file + std::cout << "--------------------------------------------------------------------------------cmd: " << cmd << std::endl; + err = system(cmd.data()); + taco_uassert(err == 0) << "Compilation command failed:\n" << cmd + << "\nreturned " << err; + } else { + // now compile it + int err = system(cmd.data()); + taco_uassert(err == 0) << "Compilation command failed:\n" << cmd + << "\nreturned " << err; + } // use dlsym() to open the compiled library if (lib_handle) { diff --git a/src/tensor.cpp b/src/tensor.cpp index dac2c3fd2..5e02d2660 100644 --- a/src/tensor.cpp +++ b/src/tensor.cpp @@ -808,9 +808,9 @@ void TensorBase::assemble() { void TensorBase::compute() { taco_uassert(!needsCompile()) << error::compute_without_compile; - if (!needsCompute()) { - return; - } + // if (!needsCompute()) { + // return; + // } setNeedsCompute(false); // Sync operand tensors if needed. 
auto operands = getTensors(getAssignment().getRhs()); diff --git a/taco-uml.wsd b/taco-uml.wsd new file mode 100644 index 000000000..4b8e39802 --- /dev/null +++ b/taco-uml.wsd @@ -0,0 +1,411 @@ +@startuml taco +scale 1 + + +class IntrusivePtr { + +T *ptr +} +class Uncopyable {} + +class IRNode { + +virtual void accept(IRVisitorStrict *v) const = 0 + +virtual IRNodeType type_info() const = 0; +} + +class BaseStmtNode {} +class BaseExprNode { + +Datatype type +} + +class StmtNode { + +void accept(IRVisitorStrict *v) const +} +class ExprNode { + +void accept(IRVisitorStrict *v) const +} + +Uncopyable <|-- IRNode +IRNode <|-- BaseStmtNode +IRNode <|-- BaseExprNode +BaseStmtNode <|-- StmtNode +BaseExprNode <|-- ExprNode + +class IRHandle { + +void accept(IRVisitorStrict *v) const +} +class Expr {} +class Stmt {} + +IntrusivePtr <|-- IRHandle +IRHandle <|-- Expr +IRHandle <|-- Stmt + +IRHandle "1" *-- "1" IRNode : contains + + + +' this class is abstract but plantuml version does not support interface keyword +interface IRVisitorStrict { + +virtual void visit(const IRNode*) const = 0 +} + +/' +IRVisitor is not an interface or abstract because it +has not pure virtual methods +'/ +class IRVisitor { + +virtual void visit(const IRNode*) +} + +class IRRewriter { + ' protected fields and methods + #Expr expr + #Stmt stmt + + #virtual void visit(const ExprNode* op) + #virtual void visit(const StmtNode* op) + + ' public fields and methods + +Expr rewrite(Expr) + +Stmt rewrite(Stmt) +} +class IRPrinter { + #std::ostream &stream + #std::ostream &stream2 + #int indent + #bool color + #bool simplify + #enum Precedence + #Precedence parentPrecedence = BOTTOM + #NameGenerator varNameGenerator + #scopedMap varNames + + #void doIndent() + #void printBinOp(Expr a, Expr b, std::string op, Precedence precedence) + #void fewMoreMethods() + + #virtual void visit(const ExprNode*) + #virtual void visit(const StmtNode*) + + +setColor(bool color) + +print(Stmt) +} +class IRVerifier {} + 
+IRVisitorStrict <|-- IRVisitor +IRVisitorStrict <|-- IRPrinter +IRVisitorStrict <|-- IRRewriter +IRVisitor <|-- IRVerifier + +' Inheritance from IRRewriter +' simplifier for ir::Expr +class ExpressionSimplifier {} +IRRewriter <|-- ExpressionSimplifier + +' simplifiers for ir::Stmt +class RemoveRedundantStatements {} +class RemoveRedundantLoops {} +class RemoveDuplicateBody {} + +IRRewriter <|-- RemoveRedundantStatements +IRRewriter <|-- RemoveRedundantLoops +IRRewriter <|-- RemoveDuplicateBody + + +' Inheritance from IRPrinter +class CodeGen {} +class CodeGen_C {} +class CodeGen_CUDA {} +class CodeGen_ISPC { + -class FindVars +} + +class FindVars {} + +IRPrinter <|-- CodeGen +CodeGen <|-- CodeGen_C +CodeGen <|-- CodeGen_ISPC +CodeGen <|-- CodeGen_CUDA + +IRVisitor <|-- FindVars +CodeGen_ISPC +-- FindVars + +class Manageable {} +class IndexStmtNode { + -virtual void accept(IndexStmtVisitorStrict*) const = 0 +} +class IndexExprNode { + -virtual void accept(IndexStmtVisitorStrict*) const = 0 +} + + +Manageable <|-- IndexStmtNode +Uncopyable <|-- IndexStmtNode +Manageable <|-- IndexExprNode +Uncopyable <|-- IndexExprNode + +class IndexStmt {} +class IndexExpr {} + +IntrusivePtr <|-- IndexStmt +IndexStmt "1" *-- "1" IndexStmtNode +IntrusivePtr <|-- IndexExpr +IndexExpr "1" *-- "1" IndexExprNode + + +abstract class IndexExprVisitorStrict { + +void visit(const IndexStmt&) + +virtual void visit(const AccessNode*) = 0 + +virtual void visit(const LiteralNode*) = 0 + +virtual void visit(const NegNode*) = 0 + +virtual void visit(const AddNode*) = 0 + +virtual void visit(const SubNode*) = 0 + +virtual void visit(const MulNode*) = 0 + +virtual void visit(const DivNode*) = 0 + +virtual void visit(const SqrtNode*) = 0 + +virtual void visit(const CastNode*) = 0 + +virtual void visit(const CallIntrinsicNode*) = 0 + +virtual void visit(const ReductionNode*) = 0 +} +abstract class IndexStmtVisitorStrict { + +void visit(const IndexStmt&) + +virtual void visit(const AssignmentNode*) = 
0 + +virtual void visit(const YieldNode*) = 0 + +virtual void visit(const ForallNode*) = 0 + +virtual void visit(const WhereNode*) = 0 + +virtual void visit(const SequenceNode*) = 0 + +virtual void visit(const AssembleNode*) = 0 + +virtual void visit(const MultiNode*) = 0 + +virtual void visit(const SuchThatNode*) = 0 +} + +abstract class IndexNotationVisitorStrict {} +class IndexNotationPrinter { + +void print(const IndexExpr& expr) + +void print(const IndexStmt& expr) + + ' Index Expressions visit() + +void visit(const AccessNode* node) + +void visit(const LiteralNode* node) + + void visit(const NegNode* node) + + void visit(const AddNode* node) + + void visit(const SubNode* node) + + void visit(const MulNode* node) + + void visit(const DivNode* node) + + void visit(const SqrtNode* node) + + void visit(const CastNode* node) + + void visit(const CallIntrinsicNode* node) + + void visit(const UnaryExprNode* node) + + void visit(const BinaryExprNode* node) + + void visit(const ReductionNode* node) + + ' Index Statement visit() + + void visit(const AssignmentNode* node) + + void visit(const YieldNode* node) + + void visit(const ForallNode* node) + + void visit(const WhereNode* node) + + void visit(const SequenceNode* node) + + void visit(const AssembleNode* node) + + void visit(const MultiNode* node) + + void visit(const SuchThatNode* node) +} +class IndexNotationVisitor { + ' Index Expressions visit() + +virtual void visit(const AccessNode* node) + +virtual void visit(const LiteralNode* node) + +virtual void visit(const NegNode* node) + +virtual void visit(const AddNode* node) + +virtual void visit(const SubNode* node) + +virtual void visit(const MulNode* node) + +virtual void visit(const DivNode* node) + +virtual void visit(const SqrtNode* node) + +virtual void visit(const CastNode* node) + +virtual void visit(const CallIntrinsicNode* node) + +virtual void visit(const UnaryExprNode* node) + +virtual void visit(const BinaryExprNode* node) + +virtual void visit(const 
ReductionNode* node) + + ' Index Statement visit() + +virtual void visit(const AssignmentNode* node) + +virtual void visit(const YieldNode* node) + +virtual void visit(const ForallNode* node) + +virtual void visit(const WhereNode* node) + +virtual void visit(const SequenceNode* node) + +virtual void visit(const AssembleNode* node) + +virtual void visit(const MultiNode* node) + +virtual void visit(const SuchThatNode* node) +} +class Matcher { + +} + +abstract class IndexExprRewriterStrict { + +IndexExpr rewrite(IndexExpr) + + #IndexExpr expr + + #virtual void visit(const AccessNode* op) = 0 + #virtual void visit(const LiteralNode* op) = 0 + #virtual void visit(const NegNode* op) = 0 + #virtual void visit(const SqrtNode* op) = 0 + #virtual void visit(const AddNode* op) = 0 + #virtual void visit(const SubNode* op) = 0 + #virtual void visit(const MulNode* op) = 0 + #virtual void visit(const DivNode* op) = 0 + #virtual void visit(const CastNode* op) = 0 + #virtual void visit(const CallIntrinsicNode* op) = 0 + #virtual void visit(const ReductionNode* op) = 0 +} +abstract class IndexStmtRewriterStrict { + +IndexStmt rewrite(IndexStmt) + + #IndexStmt stmt + + #virtual void visit(const AssignmentNode* op) = 0 + #virtual void visit(const YieldNode* op) = 0 + #virtual void visit(const ForallNode* op) = 0 + #virtual void visit(const WhereNode* op) = 0 + #virtual void visit(const SequenceNode* op) = 0 + #virtual void visit(const AssembleNode* op) = 0 + #virtual void visit(const MultiNode* op) = 0 + #virtual void visit(const SuchThatNode* op) = 0 +} +abstract class IndexNotationRewriterStrict {} +class IndexNotationRewriter { + ' Index Expressions visit() + +virtual void visit(const AccessNode* node) + +virtual void visit(const LiteralNode* node) + +virtual void visit(const NegNode* node) + +virtual void visit(const AddNode* node) + +virtual void visit(const SubNode* node) + +virtual void visit(const MulNode* node) + +virtual void visit(const DivNode* node) + +virtual void 
visit(const SqrtNode* node) + +virtual void visit(const CastNode* node) + +virtual void visit(const CallIntrinsicNode* node) + +virtual void visit(const UnaryExprNode* node) + +virtual void visit(const BinaryExprNode* node) + +virtual void visit(const ReductionNode* node) + + ' Index Statement visit() + +virtual void visit(const AssignmentNode* node) + +virtual void visit(const YieldNode* node) + +virtual void visit(const ForallNode* node) + +virtual void visit(const WhereNode* node) + +virtual void visit(const SequenceNode* node) + +virtual void visit(const AssembleNode* node) + +virtual void visit(const MultiNode* node) + +virtual void visit(const SuchThatNode* node) +} + + +IndexExprVisitorStrict <|-- IndexNotationVisitorStrict +IndexStmtVisitorStrict <|-- IndexNotationVisitorStrict +IndexNotationVisitorStrict <|-- IndexNotationVisitor +IndexNotationVisitorStrict <|-- IndexNotationPrinter +IndexNotationVisitor <|-- Matcher + +IndexExprVisitorStrict <|-- IndexExprRewriterStrict +IndexStmtVisitorStrict <|-- IndexStmtRewriterStrict +IndexExprRewriterStrict <|-- IndexNotationRewriterStrict +IndexStmtRewriterStrict <|-- IndexNotationRewriterStrict + +IndexNotationRewriterStrict <|-- IndexNotationRewriter + +' - private +' # protected +' ~ package private +' + public + +' {static} +' {abstract} virtual methods + +' lowering part -- convertion from IndexExpr and IndexStmt to ir::Expr and ir::Stmt +class Lowerer { + +std::shared_ptr impl; +} +abstract class LowererImpl { + ' protected fields and methods + #class Visitor; + #friend class Visitor; + #std::shared_ptr visitor; + + #virtual ir::Stmt lower(IndexStmt stmt); + #virtual ir::Expr lower(IndexExpr expr); + + #virtual ir::Expr lowerExpr(IndexExpr expr) = 0; + #virtual ir::Stmt lowerStmt(IndexStmt stmt) = 0; + + ' public fields and methods + +virtual ir::Stmt lower(IndexStmt stmt, std::string name, + bool assemble, bool compute, bool pack, bool unpack) = 0; +} + +class LowererImplImperative { + ' private fields and 
methods + -class Visitor + -fiend class Visitor + -std::shared_ptr visitor + -bool assemble + -bool compute + -vars a_bunch_of_other_fields + + ' protected fields and methods + #virtual ir::Stmt lowerExpr(IndexExpr expr); + #virtual ir::Stmt lowerStmt(IndexStmt stmt); + + ' public fields and methods + +ir::Stmt lower(IndexStmt stmt, std::string name, + bool assemble, bool compute, bool pack, bool unpack) + +} +note bottom of LowererImplImperative : Stmt LowererImplImperative::lower(IndexStmt stmt) {\n return visitor->lower(stmt);\n} + +Uncopyable <|-- LowererImpl +Lowerer "1" *-- "1" LowererImpl : contains + + +' visitor that does the lowering +class Visitor { + ' private fields and methods + -LowererImpl* impl + -Expr expr + -Stmt stmt + + -void visit(const AssignmentNode* node) + -void visit(const YieldNode* node) + -void visit(const ForallNode* node) + -void visit(const WhereNode* node) + -void visit(const MultiNode* node) + -void visit(const SuchThatNode* node) + -void visit(const SequenceNode* node) + -void visit(const AssembleNode* node) + -void visit(const AccessNode* node) + -void visit(const LiteralNode* node) + -void visit(const NegNode* node) + -void visit(const AddNode* node) + -void visit(const SubNode* node) + -void visit(const MulNode* node) + -void visit(const DivNode* node) + -void visit(const SqrtNode* node) + -void visit(const CastNode* node) + -void visit(const CallIntrinsicNode* node) + -void visit(const ReductionNode* node) + + ' public fields and methods + +Visitor(LowererImplImperative* impl) + +Stmt lower(IndexStmt stmt) + +Expr lower(IndexExpr expr) +} + +note bottom of Visitor: Stmt lower(IndexStmt stmt) {\n this->stmt = Stmt();\n impl->accessibleIterators.scope();\n IndexStmtVisitorStrict::visit(stmt);\n impl->accessibleIterators.unscope();\n return this->stmt;\n} + +IndexNotationVisitorStrict <|-- Visitor +LowererImpl "1" +-- "1" Visitor : contains +Visitor "1" *-- "1" LowererImpl : contains + +LowererImpl <|-- LowererImplImperative 
+LowererImplImperative "1" +-- "1" Visitor : contains +Visitor "1" *-- "1" LowererImplImperative : contains + +@enduml \ No newline at end of file diff --git a/test/test.cpp b/test/test.cpp index a49f10ff7..851493b7f 100644 --- a/test/test.cpp +++ b/test/test.cpp @@ -38,6 +38,20 @@ void ASSERT_TENSOR_EQ(TensorBase expected, TensorBase actual) { ASSERT_TRUE(equals(expected, actual)); } +// void ASSERT_TENSOR_VAL(TensorBase expected, TensorBase actual) { +// std::cout << "order: " << expected.getOrder(); +// std::vector modes{}; +// for (int mode = 0; mode < expected.getOrder(); mode++) { +// if (expected.getDimension(mode) != actual.getDimension(mode)) { +// ASSERT_TRUE(false); +// } + +// for (int i=0; i expected, void ASSERT_STORAGE_EQ(TensorStorage expected, TensorStorage actual); void ASSERT_TENSOR_EQ(TensorBase expected, TensorBase actual); +// void ASSERT_TENSOR_VAL(TensorBase expected, TensorBase actual); template void ASSERT_COMPONENTS_EQUALS(vector>> expectedIndices, diff --git a/test/tests-scheduling-eval.cpp b/test/tests-scheduling-eval.cpp index 93ba7b01e..4957418e0 100644 --- a/test/tests-scheduling-eval.cpp +++ b/test/tests-scheduling-eval.cpp @@ -12,6 +12,23 @@ #include "taco/index_notation/transformations.h" #include "codegen/codegen.h" #include "taco/lower/lower.h" +#include "taco/util/timers.h" + + +#define TOOL_BENCHMARK_TIMER(CODE,NAME,TIMER) { \ + if (time) { \ + taco::util::Timer timer; \ + timer.start(); \ + CODE; \ + timer.stop(); \ + taco::util::TimeResults result = timer.getResult(); \ + cout << NAME << " " << result << " ms" << endl; \ + TIMER=result; \ + } \ + else { \ + CODE; \ + } \ +} using namespace taco; const IndexVar i("i"), j("j"), k("k"), l("l"), m("m"), n("n"); @@ -52,7 +69,7 @@ IndexStmt scheduleSpMVISPC(IndexStmt stmt, int CHUNK_SIZE=16) { // return stmt; return stmt.split(i, i0, i1, CHUNK_SIZE) .reorder({i0, i1, j}) - .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces); + .parallelize(i0, 
ParallelUnit::CPUSimd, OutputRaceStrategy::NoRaces); } IndexStmt scheduleSpMMCPU(IndexStmt stmt, Tensor A, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { @@ -71,16 +88,42 @@ IndexStmt scheduleSpMMISPC1(IndexStmt stmt, Tensor A, int CHUNK_SIZE=16, .pos(j, jpos, A(i,j)) .split(jpos, jpos0, jpos1, UNROLL_FACTOR) .reorder({i0, i1, jpos0, k, jpos1}) - .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces) + // .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces) .parallelize(k, ParallelUnit::CPUSimd, OutputRaceStrategy::IgnoreRaces); } +IndexStmt scheduleSpMMISPC1_2(IndexStmt stmt, Tensor A, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { + IndexVar i0("i0"), i1("i1"), kbounded("kbounded"), k0("k0"), k1("k1"), jpos("jpos"), jpos0("jpos0"), jpos1("jpos1"); + return stmt.split(i, i0, i1, CHUNK_SIZE) + .pos(j, jpos, A(i,j)) + .split(jpos, jpos0, jpos1, UNROLL_FACTOR) + .reorder({i0, i1, jpos0, k, jpos1}) + // .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces) + .parallelize(i0, ParallelUnit::CPUSimd, OutputRaceStrategy::IgnoreRaces); +} + +IndexStmt scheduleSpMMISPC1_3(IndexStmt stmt, Tensor A, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { + IndexVar i0("i0"), i1("i1"), kbounded("kbounded"), k0("k0"), k1("k1"), jpos("jpos"), jpos0("jpos0"), jpos1("jpos1"); + return stmt.split(i, i0, i1, CHUNK_SIZE) + .pos(j, jpos, A(i,j)) + .split(jpos, jpos0, jpos1, UNROLL_FACTOR) + .reorder({i0, i1, jpos0, k, jpos1}) + // .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces) + .parallelize(i1, ParallelUnit::CPUSimd, OutputRaceStrategy::IgnoreRaces); +} + IndexStmt scheduleSpMMISPC2(IndexStmt stmt, Tensor A, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { IndexVar i0("i0"), i1("i1"), kbounded("kbounded"), k0("k0"), k1("k1"), jpos("jpos"), jpos0("jpos0"), jpos1("jpos1"); return stmt .parallelize(k, ParallelUnit::CPUSimd, OutputRaceStrategy::IgnoreRaces); } +IndexStmt scheduleSpMMISPC2_2(IndexStmt stmt, Tensor A, int CHUNK_SIZE=16, int 
UNROLL_FACTOR=8) { + IndexVar i0("i0"), i1("i1"), kbounded("kbounded"), k0("k0"), k1("k1"), jpos("jpos"), jpos0("jpos0"), jpos1("jpos1"); + return stmt + .parallelize(i, ParallelUnit::CPUSimd, OutputRaceStrategy::IgnoreRaces); +} + IndexStmt scheduleSpMMISPC3(IndexStmt stmt, Tensor A, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { IndexVar i0("i0"), i1("i1"), kbounded("kbounded"), k0("k0"), k1("k1"), jpos("jpos"), jpos0("jpos0"), jpos1("jpos1"); return stmt @@ -88,10 +131,21 @@ IndexStmt scheduleSpMMISPC3(IndexStmt stmt, Tensor A, int CHUNK_SIZE=16, // .pos(j, jpos, A(i,j)) // .split(jpos, jpos0, jpos1, UNROLL_FACTOR) .reorder({j, k}) - .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces) + // .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces) .parallelize(k, ParallelUnit::CPUSimd, OutputRaceStrategy::IgnoreRaces); } +IndexStmt scheduleSpMMISPC3_2(IndexStmt stmt, Tensor A, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { + IndexVar i0("i0"), i1("i1"), kbounded("kbounded"), k0("k0"), k1("k1"), jpos("jpos"), jpos0("jpos0"), jpos1("jpos1"); + return stmt + // .split(i, i0, i1, CHUNK_SIZE) + // .pos(j, jpos, A(i,j)) + // .split(jpos, jpos0, jpos1, UNROLL_FACTOR) + .reorder({j, k}) + // .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces) + .parallelize(i, ParallelUnit::CPUSimd, OutputRaceStrategy::IgnoreRaces); +} + IndexStmt scheduleSpGEMMCPU(IndexStmt stmt, bool doPrecompute) { Assignment assign = stmt.as().getStmt().as().getStmt() .as().getStmt().as(); @@ -145,6 +199,16 @@ IndexStmt scheduleSDDMMCPU(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, .parallelize(kpos1, ParallelUnit::CPUVector, OutputRaceStrategy::ParallelReduction); } +IndexStmt scheduleSDDMMISPC(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { + IndexVar i0("i0"), i1("i1"), kpos("kpos"), kpos0("kpos0"), kpos1("kpos1"); + return stmt.split(i, i0, i1, CHUNK_SIZE) + .pos(k, kpos, B(i,k)) + .split(kpos, kpos0, kpos1, UNROLL_FACTOR) + 
.reorder({i0, i1, kpos0, j, kpos1}) + // .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces); + .parallelize(kpos1, ParallelUnit::CPUSimd, OutputRaceStrategy::ParallelReduction); +} + IndexStmt scheduleSDDMMISPC1(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { IndexVar i0("i0"), i1("i1"), kpos("kpos"), kpos0("kpos0"), kpos1("kpos1"); return stmt.split(i, i0, i1, CHUNK_SIZE) @@ -175,6 +239,16 @@ IndexStmt scheduleTTVCPU(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16) { .parallelize(chunk, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces); } +IndexStmt scheduleTTVISPC(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16) { + IndexVar f("f"), fpos("fpos"), chunk("chunk"), fpos2("fpos2"); + return stmt; + // return stmt.fuse(i, j, f) + // .pos(f, fpos, B(i,j,k)) + // .split(fpos, chunk, fpos2, CHUNK_SIZE) + // .reorder({chunk, fpos2, k}) + // .parallelize(chunk, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces); +} + IndexStmt scheduleTTVCPUCSR(IndexStmt stmt) { TensorVar result = stmt.as().getStmt().as().getStmt() .as().getStmt().as().getLhs() @@ -635,6 +709,92 @@ TEST(scheduling_eval, spmmCPU) { ASSERT_TENSOR_EQ(expected, C); } +TEST(scheduling_eval, spmmISPC) { + taco::util::TimeResults timevalue; + bool time = true; + + set_ISPC_codegen_enabled(false); + set_CUDA_codegen_enabled(false); + + int NUM_I = 1021/10; + int NUM_J = 1039/10; + int NUM_K = 128; + float SPARSITY = .1; + Tensor A("A", {NUM_I, NUM_J}, CSR); + Tensor B("B", {NUM_J, NUM_K}, {Dense, Dense}); + Tensor C("C", {NUM_I, NUM_K}, {Dense, Dense}); + + srand(75883); + for (int i = 0; i < NUM_I; i++) { + for (int j = 0; j < NUM_J; j++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + if (rand_float < SPARSITY) { + A.insert({i, j}, (double) ((int) (rand_float*3/SPARSITY))); + } + } + } + + for (int j = 0; j < NUM_J; j++) { + for (int k = 0; k < NUM_K; k++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + B.insert({j, k}, (double) ((int) 
(rand_float*3/SPARSITY))); + } + } + + A.pack(); + B.pack(); + + set_ISPC_codegen_enabled(true); + C(i, k) = A(i, j) * B(j, k); + + IndexStmt stmt = C.getAssignment().concretize(); + // stmt = scheduleSpMMISPC1(stmt, A); + // stmt = scheduleSpMMISPC1_2(stmt, A); + stmt = scheduleSpMMISPC1_3(stmt, A); + + // stmt = scheduleSpMMISPC2(stmt, A); + // stmt = scheduleSpMMISPC2_2(stmt, A); + + // stmt = scheduleSpMMISPC3(stmt, A); + // stmt = scheduleSpMMISPC3_2(stmt, A); + + //printToFile("spmm_cpu", stmt); + + C.compile(stmt); + C.assemble(); + C.compute(); + + set_ISPC_codegen_enabled(false); + Tensor expected("expected", {NUM_I, NUM_K}, {Dense, Dense}); + expected(i, k) = A(i, j) * B(j, k); + IndexStmt stmt_taco = expected.getAssignment().concretize(); + stmt_taco = scheduleSpMMCPU(stmt_taco, A); + + expected.compile(stmt_taco); + expected.assemble(); + expected.compute(); + ASSERT_TENSOR_EQ(expected, C); + + float ERROR_MARGIN = 0.01; + // ASSERT_TENSOR_VAL(expected, y); + for (int i = 0; i < NUM_I; i++) { + for (int k = 0; k < NUM_K; k++) { + if (expected(i,k) <= C(i,k) + ERROR_MARGIN && expected(i,k) >= C(i,k) - ERROR_MARGIN) { + // std::cout << "matched values: expected -> " << expected(j) << " == " << y(j) << " <- actual\n"; + } + else { + std::cout << "unmatched values: expected -> " << expected(i,k) << " != " << C(i,k) << " <- actual\n"; + ASSERT_TRUE(false); + }; + } + } + + for (int i=0; i<10; i++) { + TOOL_BENCHMARK_TIMER(C.compute(), "Compute ISPC: ", timevalue); + TOOL_BENCHMARK_TIMER(expected.compute(), "Compute TACO: ", timevalue); + } +} + struct spgemm : public TestWithParam> {}; TEST_P(spgemm, scheduling_eval) { @@ -878,6 +1038,96 @@ TEST(scheduling_eval, sddmmCPU) { ASSERT_TENSOR_EQ(expected, A); } +// bin/taco-test --gtest_filter=scheduling_eval.sddmmISPC +TEST(scheduling_eval, sddmmISPC) { + + taco::util::TimeResults timevalue; + bool time = true; + + set_CUDA_codegen_enabled(false); + set_ISPC_codegen_enabled(false); + + int NUM_I = 1021/10; + int 
NUM_J = 1039/10; + int NUM_K = 1057/10; + float SPARSITY = .3; + Tensor A("A", {NUM_I, NUM_K}, {Dense, Dense}); + Tensor B("B", {NUM_I, NUM_K}, CSR); + Tensor C("C", {NUM_I, NUM_J}, {Dense, Dense}); + Tensor D("D", {NUM_J, NUM_K}, {Dense, Dense}); + + srand(268238); + for (int i = 0; i < NUM_I; i++) { + for (int j = 0; j < NUM_J; j++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + C.insert({i, j}, (double) ((int) (rand_float*3/SPARSITY))); + } + } + + for (int i = 0; i < NUM_I; i++) { + for (int k = 0; k < NUM_K; k++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + if (rand_float < SPARSITY) { + B.insert({i, k}, (double) ((int) (rand_float*3/SPARSITY))); + } + } + } + + for (int j = 0; j < NUM_J; j++) { + for (int k = 0; k < NUM_K; k++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + D.insert({j, k}, (double) ((int) (rand_float*3/SPARSITY))); + } + } + + B.pack(); + C.pack(); + D.pack(); + + set_ISPC_codegen_enabled(true); + A(i,k) = B(i,k) * C(i,j) * D(j,k); + + IndexStmt stmt = A.getAssignment().concretize(); + stmt = scheduleSDDMMISPC(stmt, B); + + //printToFile("sddmm_cpu", stmt); + + A.compile(stmt); + A.assemble(); + // A.compute(); + + set_ISPC_codegen_enabled(false); + Tensor expected("expected", {NUM_I, NUM_K}, {Dense, Dense}); + expected(i,k) = B(i,k) * C(i,j) * D(j,k); + IndexStmt stmt_taco = A.getAssignment().concretize(); + stmt_taco = scheduleSDDMMCPU(stmt_taco, B); + expected.compile(stmt_taco); + expected.assemble(); + // expected.compute(); + + TOOL_BENCHMARK_TIMER(A.compute(), "Compute ISPC: ", timevalue); + TOOL_BENCHMARK_TIMER(expected.compute(), "Compute TACO: ", timevalue); + + ASSERT_TENSOR_EQ(expected, A); + + + float ERROR_MARGIN = 0.01; + // ASSERT_TENSOR_VAL(expected, y); + for (int i = 0; i < NUM_I; i++) { + for (int k = 0; k < NUM_K; k++) { + if (expected(i,k) <= A(i,k) + ERROR_MARGIN && expected(i,k) >= A(i,k) - ERROR_MARGIN) { + // std::cout << "matched values: expected -> " << expected(j) << " == " << 
y(j) << " <- actual\n"; + } + else { + std::cout << "unmatched values: expected -> " << expected(i,k) << " != " << A(i,k) << " <- actual\n"; + ASSERT_TRUE(false); + }; + } + } + std::cout << "test scheduling_eval.sddmmISPC passed\n"; + +} + TEST(scheduling_eval, spmvCPU) { if (should_use_CUDA_codegen()) { return; @@ -926,6 +1176,100 @@ TEST(scheduling_eval, spmvCPU) { ASSERT_TENSOR_EQ(expected, y); } + +TEST(scheduling_eval, spmvISPC) { + + taco::util::TimeResults timevalue; + bool time = true; + + set_ISPC_codegen_enabled(false); + set_CUDA_codegen_enabled(false); + + int NUM_I = 200021/10; + int NUM_J = 200039/10; + float SPARSITY = .2; + Tensor A("A", {NUM_I, NUM_J}, CSR); + Tensor x("x", {NUM_J}, Format({Dense})); + Tensor y("y", {NUM_I}, Format({Dense})); + + srand(120); + for (int i = 0; i < NUM_I; i++) { + for (int j = 0; j < NUM_J; j++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + if (rand_float < SPARSITY) { + A.insert({i, j}, (double) ((int) (rand_float * 3 / SPARSITY))); + } + } + } + + for (int j = 0; j < NUM_J; j++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + x.insert({j}, (double) ((int) (rand_float*3/SPARSITY))); + } + + x.pack(); + A.pack(); + + set_ISPC_codegen_enabled(true); + + y(i) = A(i, j) * x(j); + + IndexStmt stmt = y.getAssignment().concretize(); + stmt = scheduleSpMVISPC(stmt); + + //printToFile("spmv_cpu", stmt); + + y.compile(stmt); + y.assemble(); + // y.compile(); + + set_ISPC_codegen_enabled(false); + + // Tensor expected("expected", {NUM_I}, Format({Dense})); + // expected(i) = A(i, j) * x(j); + // expected.compile(); + // expected.assemble(); + // expected.compute(); + + + Tensor expected("expected", {NUM_I}, Format({Dense})); + expected(i) = A(i, j) * x(j); + IndexStmt stmt_taco = expected.getAssignment().concretize(); + stmt_taco = scheduleSpMVCPU(stmt_taco); + + expected.compile(stmt_taco); + expected.assemble(); + // expected.compile(); + + + TOOL_BENCHMARK_TIMER(y.compute(), "Compute ISPC: ", 
timevalue); + TOOL_BENCHMARK_TIMER(expected.compute(), "Compute TACO: ", timevalue); + + + ASSERT_TENSOR_EQ(expected, y); + + float ERROR_MARGIN = 0.01; + // ASSERT_TENSOR_VAL(expected, y); + for (int j = 0; j < NUM_J; j++) { + if (expected(j) <= y(j) + ERROR_MARGIN && expected(j) >= y(j) - ERROR_MARGIN) { + // std::cout << "matched values: expected -> " << expected(j) << " == " << y(j) << " <- actual\n"; + } + else { + std::cout << "unmatched values: expected -> " << expected(j) << " != " << y(j) << " <- actual\n"; + ASSERT_TRUE(false); + }; + } + + std::cout << "test scheduling_eval.spmvISPC passed\n"; + + for (int i=0; i<10; i++) { + TOOL_BENCHMARK_TIMER(y.compute(), "Compute ISPC: ", timevalue); + TOOL_BENCHMARK_TIMER(expected.compute(), "Compute TACO: ", timevalue); + } + + +} + TEST(scheduling_eval, ttvCPU) { if (should_use_CUDA_codegen()) { return; @@ -977,6 +1321,65 @@ TEST(scheduling_eval, ttvCPU) { ASSERT_TENSOR_EQ(expected, A); } + +TEST(scheduling_eval, ttvISPC) { + if (should_use_CUDA_codegen()) { + return; + } + set_CUDA_codegen_enabled(false); + set_ISPC_codegen_enabled(false); + int NUM_I = 1021/10; + int NUM_J = 1039/10; + int NUM_K = 1057/10; + float SPARSITY = .3; + Tensor A("A", {NUM_I, NUM_J}, {Dense, Dense}); // TODO: change to sparse outputs + Tensor B("B", {NUM_I, NUM_J, NUM_K}, {Sparse, Sparse, Sparse}); + Tensor c("c", {NUM_K}, Format({Dense})); + + srand(9536); + for (int i = 0; i < NUM_I; i++) { + for (int j = 0; j < NUM_J; j++) { + for (int k = 0; k < NUM_K; k++) { + float rand_float = (float) rand() / (float) (RAND_MAX); + if (rand_float < SPARSITY) { + B.insert({i, j, k}, (double) ((int) (rand_float * 3 / SPARSITY))); + } + } + } + } + + for (int k = 0; k < NUM_K; k++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + c.insert({k}, (double) ((int) (rand_float*3))); + } + + B.pack(); + c.pack(); + + set_ISPC_codegen_enabled(true); + A(i,j) = B(i,j,k) * c(k); + + IndexStmt stmt = A.getAssignment().concretize(); + stmt = 
scheduleTTVISPC(stmt, B); + + //printToFile("ttv_cpu", stmt); + + A.compile(stmt); + A.assemble(); + A.compute(); + + set_ISPC_codegen_enabled(false); + Tensor expected("expected", {NUM_I, NUM_J}, {Dense, Dense}); + expected(i,j) = B(i,j,k) * c(k); + IndexStmt stmt_taco = expected.getAssignment().concretize(); + stmt_taco = scheduleTTVCPU(stmt_taco, B); + expected.compile(); + expected.assemble(); + expected.compute(); + ASSERT_TENSOR_EQ(expected, A); +} + + TEST(scheduling_eval, ttvCPU_CSR) { if (should_use_CUDA_codegen()) { return; @@ -1081,6 +1484,60 @@ TEST(scheduling_eval, ttmCPU) { ASSERT_TENSOR_EQ(expected, A); } +TEST(scheduling_eval, ttmISPC) { + if (should_use_CUDA_codegen()) { + return; + } + int NUM_I = 1021/40; + int NUM_J = 1039/40; + int NUM_K = 1057/40; + int NUM_L = 1232/40; + float SPARSITY = .1; + Tensor A("A", {NUM_I, NUM_J, NUM_L}, {Dense, Dense, Dense}); // TODO: change to sparse outputs + Tensor B("B", {NUM_I, NUM_J, NUM_K}, {Sparse, Sparse, Sparse}); + Tensor C("C", {NUM_K, NUM_L}, {Dense, Dense}); + + srand(935); + for (int i = 0; i < NUM_I; i++) { + for (int j = 0; j < NUM_J; j++) { + for (int k = 0; k < NUM_K; k++) { + float rand_float = (float) rand() / (float) (RAND_MAX); + if (rand_float < SPARSITY) { + B.insert({i, j, k}, (double) ((int) (rand_float * 3 / SPARSITY))); + } + } + } + } + + for (int k = 0; k < NUM_K; k++) { + for (int l = 0; l < NUM_L; l++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + C.insert({k, l}, (double) ((int) (rand_float*3))); + } + } + + B.pack(); + C.pack(); + + A(i,j,l) = B(i,j,k) * C(k,l); + + IndexStmt stmt = A.getAssignment().concretize(); + stmt = scheduleTTMCPU(stmt, B); + + //printToFile("ttm_cpu", stmt); + + A.compile(stmt); + A.assemble(); + A.compute(); + + Tensor expected("expected", {NUM_I, NUM_J, NUM_L}, {Dense, Dense, Dense}); + expected(i,j,l) = B(i,j,k) * C(k,l); + expected.compile(); + expected.assemble(); + expected.compute(); + ASSERT_TENSOR_EQ(expected, A); +} + 
TEST(scheduling_eval, mttkrpCPU) { if (should_use_CUDA_codegen()) { return; @@ -1143,6 +1600,69 @@ TEST(scheduling_eval, mttkrpCPU) { ASSERT_TENSOR_EQ(expected, A); } + +TEST(scheduling_eval, mttkrpISPC) { + if (should_use_CUDA_codegen()) { + return; + } + int NUM_I = 1021/20; + int NUM_J = 1039/20; + int NUM_K = 1057/20; + int NUM_L = 1232/20; + float SPARSITY = .1; + Tensor A("A", {NUM_I, NUM_J}, {Dense, Dense}); + Tensor B("B", {NUM_I, NUM_K, NUM_L}, {Dense, Sparse, Sparse}); + Tensor C("C", {NUM_K, NUM_J}, {Dense, Dense}); + Tensor D("D", {NUM_L, NUM_J}, {Dense, Dense}); + + srand(549694); + for (int i = 0; i < NUM_I; i++) { + for (int k = 0; k < NUM_K; k++) { + for (int l = 0; l < NUM_L; l++) { + float rand_float = (float) rand() / (float) (RAND_MAX); + if (rand_float < SPARSITY) { + B.insert({i, k, l}, (double) ((int) (rand_float * 3 / SPARSITY))); + } + } + } + } + + for (int k = 0; k < NUM_K; k++) { + for (int j = 0; j < NUM_J; j++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + C.insert({k, j}, (double) ((int) (rand_float*3))); + } + } + + for (int l = 0; l < NUM_L; l++) { + for (int j = 0; j < NUM_J; j++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + D.insert({l, j}, (double) ((int) (rand_float*3))); + } + } + + B.pack(); + C.pack(); + D.pack(); + + A(i,j) = B(i,k,l) * C(k,j) * D(l,j); + + IndexStmt stmt = A.getAssignment().concretize(); + stmt = scheduleMTTKRPCPU(stmt, B); + //printToFile("mttkrp_cpu", stmt); + + A.compile(stmt); + A.assemble(); + A.compute(); + + Tensor expected("expected", {NUM_I, NUM_J}, {Dense, Dense}); + expected(i,j) = B(i,k,l) * C(k,j) * D(l,j); + expected.compile(); + expected.assemble(); + expected.compute(); + ASSERT_TENSOR_EQ(expected, A); +} + TEST(scheduling_eval, spmvGPU) { if (!should_use_CUDA_codegen()) { return; @@ -2079,6 +2599,55 @@ TEST(generate_evaluation_files, cpu) { } } +TEST(generate_evaluation_files_spmv, ispc) { + set_CUDA_codegen_enabled(false); + set_ISPC_codegen_enabled(true); + + 
std::cout << "executing generate_evaluation_file.ispc\n"; + + int NUM_I = 100; + int NUM_J = 100; + + vector> spmv_parameters = {}; // {NNZ_PER_THREAD, BLOCK_SIZE} + for (int i = 3; i <= 20; i++) { + spmv_parameters.push_back({i, 512}); + } + + string file_ending_c = ".c"; + string file_ending_ispc = ".ispc"; + string file_path = "eval_prepared_ispc/spmv/"; + mkdir(file_path.c_str(), 0777); + + // spmv + { + stringstream source1; + stringstream source2; + std::shared_ptr codegen = ir::CodeGen::init_default(source1, source2, ir::CodeGen::ImplementationGen); + Tensor A("A", {NUM_I, NUM_J}, CSR); + Tensor x("x", {NUM_J}, Format({Dense})); + Tensor y("y", {NUM_I}, Format({Dense})); + IndexExpr precomputed = A(i, j) * x(j); + y(i) = precomputed; + IndexStmt stmt = y.getAssignment().concretize(); + bool isFirst = true; + for (auto paramSet : spmv_parameters) { + IndexStmt scheduled = scheduleSpMVCPU(stmt); + ir::Stmt compute = lower(scheduled, string("compute_") + util::join(paramSet, "_"), false, true); + codegen->compile(compute, isFirst); + isFirst = false; + } + ofstream source_file1; + source_file1.open(file_path + "spmv_ispc" + file_ending_c); + source_file1 << source1.str(); + source_file1.close(); + + ofstream source_file2; + source_file2.open(file_path + "__spmv_ispc" + file_ending_ispc); + source_file2 << source2.str(); + source_file2.close(); + } +} + TEST(generate_evaluation_files, gpu) { // if (!should_use_CUDA_codegen()) { // return; From a5c3a8cea4c8c736d7bf0c4cf976095cbed11401 Mon Sep 17 00:00:00 2001 From: Adhhitha Dias Date: Wed, 8 Sep 2021 10:26:47 -0400 Subject: [PATCH 06/16] add class diagram --- .gitignore | 1 + out/taco-uml/._taco.svg | Bin 0 -> 4096 bytes out/taco-uml/taco.svg | 878 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 879 insertions(+) create mode 100755 out/taco-uml/._taco.svg create mode 100644 out/taco-uml/taco.svg diff --git a/.gitignore b/.gitignore index 9abc3adc7..215b56e9a 100644 --- a/.gitignore +++ b/.gitignore @@ 
-14,4 +14,5 @@ doc apps/tensor_times_vector/tensor_times_vector .cache +.vscode compile_commands.json diff --git a/out/taco-uml/._taco.svg b/out/taco-uml/._taco.svg new file mode 100755 index 0000000000000000000000000000000000000000..e88dbd51b684b39e4ea0b0f4425ef9bc02f5d445 GIT binary patch literal 4096 zcmZQz6=P>$Vqox1Ojhs@R)|o50+1L3ClDJkFz{^v(m+1nBL)UWIUt(=a103vVwlIZ z9-@O`0Z_RBnifVNA1W@DoS&IntrusivePtrT *ptrUncopyableIRNodevirtual void accept(IRVisitorStrict *v) const = 0virtual IRNodeType type_info() const = 0;BaseStmtNodeBaseExprNodeDatatype typeStmtNodevoid accept(IRVisitorStrict *v) constExprNodevoid accept(IRVisitorStrict *v) constIRHandlevoid accept(IRVisitorStrict *v) constExprStmtIRVisitorStrictvirtual void visit(const IRNode*) const = 0IRVisitorvirtual void visit(const IRNode*)IRRewriterExpr exprStmt stmtvirtual void visit(const ExprNode* op)virtual void visit(const StmtNode* op)Expr rewrite(Expr)Stmt rewrite(Stmt)IRPrinterstd::ostream &streamstd::ostream &stream2int indentbool colorbool simplifyenum PrecedencePrecedence parentPrecedence = BOTTOMNameGenerator varNameGeneratorscopedMap<Expr, std::String> varNamesvoid doIndent()void printBinOp(Expr a, Expr b, std::string op, Precedence precedence)void fewMoreMethods()virtual void visit(const ExprNode*)virtual void visit(const StmtNode*)setColor(bool color)print(Stmt)IRVerifierExpressionSimplifierRemoveRedundantStatementsRemoveRedundantLoopsRemoveDuplicateBodyCodeGenCodeGen_CCodeGen_CUDACodeGen_ISPCManageableIndexStmtNodevirtual void accept(IndexStmtVisitorStrict*) const = 0IndexExprNodevirtual void accept(IndexStmtVisitorStrict*) const = 0IndexStmtIndexExprIndexExprVisitorStrictvoid visit(const IndexStmt&)virtual void visit(const AccessNode*) = 0virtual void visit(const LiteralNode*) = 0virtual void visit(const NegNode*) = 0virtual void visit(const AddNode*) = 0virtual void visit(const SubNode*) = 0virtual void visit(const MulNode*) = 0virtual void visit(const DivNode*) = 0virtual void visit(const 
SqrtNode*) = 0virtual void visit(const CastNode*) = 0virtual void visit(const CallIntrinsicNode*) = 0virtual void visit(const ReductionNode*) = 0IndexStmtVisitorStrictvoid visit(const IndexStmt&)virtual void visit(const AssignmentNode*) = 0virtual void visit(const YieldNode*) = 0virtual void visit(const ForallNode*) = 0virtual void visit(const WhereNode*) = 0virtual void visit(const SequenceNode*) = 0virtual void visit(const AssembleNode*) = 0virtual void visit(const MultiNode*) = 0virtual void visit(const SuchThatNode*) = 0IndexNotationVisitorStrictIndexNotationPrintervoid print(const IndexExpr& expr)void print(const IndexStmt& expr)void visit(const AccessNode* node)void visit(const LiteralNode* node)void visit(const NegNode* node)void visit(const AddNode* node)void visit(const SubNode* node)void visit(const MulNode* node)void visit(const DivNode* node)void visit(const SqrtNode* node)void visit(const CastNode* node)void visit(const CallIntrinsicNode* node)void visit(const UnaryExprNode* node)void visit(const BinaryExprNode* node)void visit(const ReductionNode* node)void visit(const AssignmentNode* node)void visit(const YieldNode* node)void visit(const ForallNode* node)void visit(const WhereNode* node)void visit(const SequenceNode* node)void visit(const AssembleNode* node)void visit(const MultiNode* node)void visit(const SuchThatNode* node)IndexNotationVisitorvirtual void visit(const AccessNode* node)virtual void visit(const LiteralNode* node)virtual void visit(const NegNode* node)virtual void visit(const AddNode* node)virtual void visit(const SubNode* node)virtual void visit(const MulNode* node)virtual void visit(const DivNode* node)virtual void visit(const SqrtNode* node)virtual void visit(const CastNode* node)virtual void visit(const CallIntrinsicNode* node)virtual void visit(const UnaryExprNode* node)virtual void visit(const BinaryExprNode* node)virtual void visit(const ReductionNode* node)virtual void visit(const AssignmentNode* node)virtual void visit(const 
YieldNode* node)virtual void visit(const ForallNode* node)virtual void visit(const WhereNode* node)virtual void visit(const SequenceNode* node)virtual void visit(const AssembleNode* node)virtual void visit(const MultiNode* node)virtual void visit(const SuchThatNode* node)MatcherIndexExprRewriterStrictIndexExpr exprIndexExpr rewrite(IndexExpr)virtual void visit(const AccessNode* op) = 0virtual void visit(const LiteralNode* op) = 0virtual void visit(const NegNode* op) = 0virtual void visit(const SqrtNode* op) = 0virtual void visit(const AddNode* op) = 0virtual void visit(const SubNode* op) = 0virtual void visit(const MulNode* op) = 0virtual void visit(const DivNode* op) = 0virtual void visit(const CastNode* op) = 0virtual void visit(const CallIntrinsicNode* op) = 0virtual void visit(const ReductionNode* op) = 0IndexStmtRewriterStrictIndexStmt stmtIndexStmt rewrite(IndexStmt)virtual void visit(const AssignmentNode* op) = 0virtual void visit(const YieldNode* op) = 0virtual void visit(const ForallNode* op) = 0virtual void visit(const WhereNode* op) = 0virtual void visit(const SequenceNode* op) = 0virtual void visit(const AssembleNode* op) = 0virtual void visit(const MultiNode* op) = 0virtual void visit(const SuchThatNode* op) = 0IndexNotationRewriterStrictIndexNotationRewritervirtual void visit(const AccessNode* node)virtual void visit(const LiteralNode* node)virtual void visit(const NegNode* node)virtual void visit(const AddNode* node)virtual void visit(const SubNode* node)virtual void visit(const MulNode* node)virtual void visit(const DivNode* node)virtual void visit(const SqrtNode* node)virtual void visit(const CastNode* node)virtual void visit(const CallIntrinsicNode* node)virtual void visit(const UnaryExprNode* node)virtual void visit(const BinaryExprNode* node)virtual void visit(const ReductionNode* node)virtual void visit(const AssignmentNode* node)virtual void visit(const YieldNode* node)virtual void visit(const ForallNode* node)virtual void visit(const 
WhereNode* node)virtual void visit(const SequenceNode* node)virtual void visit(const AssembleNode* node)virtual void visit(const MultiNode* node)virtual void visit(const SuchThatNode* node)Lowererstd::shared_ptr<LowererImpl> impl;LowererImplclass Visitor;friend class Visitor;std::shared_ptr<Visitor> visitor;virtual ir::Stmt lower(IndexStmt stmt);virtual ir::Expr lower(IndexExpr expr);virtual ir::Expr lowerExpr(IndexExpr expr) = 0;virtual ir::Stmt lowerStmt(IndexStmt stmt) = 0;virtual ir::Stmt lower(IndexStmt stmt, std::string name,bool assemble, bool compute, bool pack, bool unpack) = 0;LowererImplImperativeclass Visitorfiend class Visitorstd::shared_ptr<Visitor> visitorbool assemblebool computevars a_bunch_of_other_fieldsvirtual ir::Stmt lowerExpr(IndexExpr expr);virtual ir::Stmt lowerStmt(IndexStmt stmt);ir::Stmt lower(IndexStmt stmt, std::string name,bool assemble, bool compute, bool pack, bool unpack)Stmt LowererImplImperative::lower(IndexStmt stmt) {return visitor->lower(stmt);}VisitorLowererImpl* implExpr exprStmt stmtvoid visit(const AssignmentNode* node)void visit(const YieldNode* node)void visit(const ForallNode* node)void visit(const WhereNode* node)void visit(const MultiNode* node)void visit(const SuchThatNode* node)void visit(const SequenceNode* node)void visit(const AssembleNode* node)void visit(const AccessNode* node)void visit(const LiteralNode* node)void visit(const NegNode* node)void visit(const AddNode* node)void visit(const SubNode* node)void visit(const MulNode* node)void visit(const DivNode* node)void visit(const SqrtNode* node)void visit(const CastNode* node)void visit(const CallIntrinsicNode* node)void visit(const ReductionNode* node)Visitor(LowererImplImperative* impl)Stmt lower(IndexStmt stmt)Expr lower(IndexExpr expr)Stmt lower(IndexStmt stmt) {this->stmt = Stmt();impl->accessibleIterators.scope();IndexStmtVisitorStrict::visit(stmt);impl->accessibleIterators.unscope();return 
this->stmt;}contains111111contains11contains11contains11contains11contains11 \ No newline at end of file From 4a4a569f83b7acf5656eff290fd004c62bdc38b9 Mon Sep 17 00:00:00 2001 From: Adhhitha Dias Date: Wed, 8 Sep 2021 10:35:14 -0400 Subject: [PATCH 07/16] add ispc headers for binary search and fix compile errors --- include/taco/ir/ir.h | 2 +- src/codegen/codegen_ispc.cpp | 397 +++++++++++++++++++++-------------- src/codegen/codegen_ispc.h | 8 +- src/ir/ir_printer.cpp | 40 +++- 4 files changed, 277 insertions(+), 170 deletions(-) diff --git a/include/taco/ir/ir.h b/include/taco/ir/ir.h index 651faff4e..96dc7d034 100644 --- a/include/taco/ir/ir.h +++ b/include/taco/ir/ir.h @@ -591,7 +591,7 @@ struct Switch : public StmtNode { static const IRNodeType _type_info = IRNodeType::Switch; }; -enum class LoopKind {Serial, Static, Dynamic, Runtime, Vectorized, Static_Chunked, Foreach, Mul_Thread}; +enum class LoopKind {Serial, Static, Dynamic, Runtime, Vectorized, Static_Chunked, Foreach, Mul_Thread, Init}; /** A for loop from start to end by increment. * A vectorized loop will require the increment to be 1 and the diff --git a/src/codegen/codegen_ispc.cpp b/src/codegen/codegen_ispc.cpp index 237bc822d..d35af1748 100644 --- a/src/codegen/codegen_ispc.cpp +++ b/src/codegen/codegen_ispc.cpp @@ -145,8 +145,61 @@ const string cHeaders = " free(t);\n" "}\n" "#endif\n"; + +const string ispcHeaders = + "#define __TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b))\n" + "#define __TACO_MAX(_a,_b) ((_a) > (_b) ? 
(_a) : (_b))\n" + "#define __TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a)\n" + "int __cmp(const void *a, const void *b) {\n" + " return *((const int*)a) - *((const int*)b);\n" + "}\n" + "int __taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) {\n" + " if (array[arrayStart] >= target) {\n" + " return arrayStart;\n" + " }\n" + " int lowerBound = arrayStart; // always < target\n" + " int upperBound = arrayEnd; // always >= target\n" + " while (upperBound - lowerBound > 1) {\n" + " int mid = (upperBound + lowerBound) / 2;\n" + " int midValue = array[mid];\n" + " if (midValue < target) {\n" + " lowerBound = mid;\n" + " }\n" + " else if (midValue > target) {\n" + " upperBound = mid;\n" + " }\n" + " else {\n" + " return mid;\n" + " }\n" + " }\n" + " return upperBound;\n" + "}\n" + "int __taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) {\n" + " if (array[arrayEnd] <= target) {\n" + " return arrayEnd;\n" + " }\n" + " int lowerBound = arrayStart; // always <= target\n" + " int upperBound = arrayEnd; // always > target\n" + " while (upperBound - lowerBound > 1) {\n" + " int mid = (upperBound + lowerBound) / 2;\n" + " int midValue = array[mid];\n" + " if (midValue < target) {\n" + " lowerBound = mid;\n" + " }\n" + " else if (midValue > target) {\n" + " upperBound = mid;\n" + " }\n" + " else {\n" + " return mid;\n" + " }\n" + " }\n" + " return lowerBound;\n" + "}\n\n\n"; + } // anonymous namespace + + // find variables for generating declarations // generates a single var for each GetProperty class CodeGen_ISPC::FindVars : public IRVisitor { @@ -249,11 +302,10 @@ class CodeGen_ISPC::FindVars : public IRVisitor { // Finds all for loops tagged with accelerator and adds statements to deviceFunctions // Also tracks scope of when device function is called and // tracks which variables must be passed to function. 
-class CodeGen_ISPC::DeviceFunctionCollector : public IRVisitor { +class CodeGen_ISPC::FunctionCollector : public IRVisitor { public: - vector blockFors; vector threadFors; // contents is device function - vector warpFors; + vector initFors; // for loops to initialize statements map scopeMap; // the variables to pass to each device function @@ -271,7 +323,7 @@ class CodeGen_ISPC::DeviceFunctionCollector : public IRVisitor { CodeGen_ISPC *codeGen; // copy inputs and outputs into the map - DeviceFunctionCollector(vector inputs, vector outputs, CodeGen_ISPC *codeGen) : codeGen(codeGen) { + FunctionCollector(vector inputs, vector outputs, CodeGen_ISPC *codeGen) : codeGen(codeGen) { inDeviceFunction = false; for (auto v: inputs) { auto var = v.as(); @@ -310,7 +362,11 @@ class CodeGen_ISPC::DeviceFunctionCollector : public IRVisitor { } else if (op->parallel_unit == ParallelUnit::CPUSimd) { - + std::cout << "************************************************************************** CPUSimd For node\n"; + } + else if (op->kind == LoopKind::Init) { + std::cout << "************************************************************************* Init loop kind found\n"; + initFors.push_back(op); } else{ op->var.accept(this); @@ -376,6 +432,10 @@ void CodeGen_ISPC::compile(Stmt stmt, bool isFirst) { if (isFirst) { // output the headers out << cHeaders; + + if (&out != &out2) { + out2 << ispcHeaders; + } } out << endl; // generate code for the Stmt @@ -385,13 +445,13 @@ void CodeGen_ISPC::compile(Stmt stmt, bool isFirst) { -string CodeGen_ISPC::printCallISPCFunc(const Function *func, map varMap, +string CodeGen_ISPC::printCallISPCFunc(const std::string& funcName, map varMap, vector &sortedProps) { std::stringstream ret; ret << " "; unordered_set propsAlreadyGenerated; - ret << "__" << func->name << "("; + ret << "__" << funcName << "("; for (unsigned long i=0; i < sortedProps.size(); i++) { @@ -410,118 +470,71 @@ string CodeGen_ISPC::printCallISPCFunc(const Function *func, map 
varMap, vector &sortedProps) { - DeviceFunctionCollector deviceFunctionCollector(func->inputs, func->outputs, this); - func->body.accept(&deviceFunctionCollector); + FunctionCollector functionCollector(func->inputs, func->outputs, this); + func->body.accept(&functionCollector); - std::stringstream variables; vector inputs = func->inputs; vector outputs = func->outputs; unordered_set propsAlreadyGenerated; - for (unsigned long i=0; i < sortedProps.size(); i++) { - auto prop = sortedProps[i]; - bool isOutputProp = (find(outputs.begin(), outputs.end(), - prop->tensor) != outputs.end()); - - auto var = prop->tensor.as(); - if (var->is_parameter) { - if (isOutputProp) { - variables << " " << printTensorProperty(varMap[prop], prop, false) << ";" << endl; - } else { - break; - } + for (unsigned long i=0; i < sortedProps.size(); i++) { + auto prop = sortedProps[i]; + bool isOutputProp = (find(outputs.begin(), outputs.end(), + prop->tensor) != outputs.end()); + + auto var = prop->tensor.as(); + if (var->is_parameter) { + if (isOutputProp) { + funcVariables << " " << printTensorProperty(varMap[prop], prop, false) << ";" << endl; } else { - variables << getUnpackedTensorArgument(varMap[prop], prop, isOutputProp); + break; } - propsAlreadyGenerated.insert(varMap[prop]); + } else { + funcVariables << getUnpackedTensorArgument(varMap[prop], prop, isOutputProp); + } + propsAlreadyGenerated.insert(varMap[prop]); - if (i!=sortedProps.size()-1) { - variables << ", "; - } - if (i%2==0) { - variables << "\n\t"; - } + if (i!=sortedProps.size()-1) { + funcVariables << ", "; + } + if (i%2==0) { + funcVariables << "\n\t"; } + } resetUniqueNameCounters(); - for (size_t i = 0; i < deviceFunctionCollector.threadFors.size(); i++) { - const For *threadloop = to(deviceFunctionCollector.threadFors[i]); + // threadFors code generation + for (size_t i = 0; i < functionCollector.threadFors.size(); i++) { + + const For *threadloop = to(functionCollector.threadFors[i]); 
taco_iassert(threadloop->parallel_unit == ParallelUnit::CPUSpmd); Stmt function = threadloop->contents; std::cout << "threadloop function: " << function << std::endl; - out2 << "static task void __" << func->name << "__ ("; - out2 << variables.str(); + out2 << "\nstatic task void __" << func->name << "__ ("; + out2 << funcVariables.str(); out2 << "\n) {\n\n"; indent++; - doIndent(); - // output body + // output body of the threadloop + taskCode = true; print(threadloop); indent--; - out2 << "}\n"; - - out2 << "export void __" << func->name << "("; - out2 << variables.str(); - out2 << "\n) {\n\n"; - indent++; - doIndent(); - out2 << "launch[4] " << printCallISPCFunc(func, varMap, sortedProps) << "\n"; - indent--; - out2 << "}\n"; - - } - - if (deviceFunctionCollector.threadFors.size()==0) { - out2 << "export void __" << func->name << " ("; - out2 << variables.str(); - out2 << "\n) {\n\n"; + out2 << "}\n\n"; - indent++; - doIndent(); - // output body - print(func->body); - indent--; - out2 << "}\n"; } - // out2 << "export void "; - - // out2 << "__" << func->name << "("; + taskCode = false; + out2 << "export void __" << func->name << " ("; + out2 << funcVariables.str(); + out2 << "\n) {\n\n"; - // for (unsigned long i=0; i < sortedProps.size(); i++) { - // auto prop = sortedProps[i]; - // bool isOutputProp = (find(outputs.begin(), outputs.end(), - // prop->tensor) != outputs.end()); - - // auto var = prop->tensor.as(); - // if (var->is_parameter) { - // if (isOutputProp) { - // out2 << " " << printTensorProperty(varMap[prop], prop, false) << ";" << endl; - // } else { - // break; - // } - // } else { - // out2 << getUnpackedTensorArgument(varMap[prop], prop, isOutputProp); - // } - // propsAlreadyGenerated.insert(varMap[prop]); - - // if (i!=sortedProps.size()-1) { - // out2 << ", "; - // } - // if (i%2==0) { - // out2 << "\n\t"; - // } - // } - // out2 << "\n) {\n\n"; - - // indent++; - // doIndent(); - // // output body - // print(func->body); - // indent--; - // 
out2 << "}\n"; + indent++; + // output body + print(func->body); + indent--; + out2 << "}\n"; } @@ -535,6 +548,8 @@ void CodeGen_ISPC::sendToStream(std::stringstream &stream) { } void CodeGen_ISPC::visit(const Function* func) { + set_ISPC_code_stream_enabled(false); + // if generating a header, protect the function declaration with a guard if (func->name == "assemble") { if (outputKind == HeaderGen) { @@ -646,11 +661,11 @@ void CodeGen_ISPC::visit(const Function* func) { // Print variable declarations out << printDecls(varFinder.varDecls, func->inputs, func->outputs) << endl; - vector sortedProps; + sortedProps = {}; vector inputs = func->inputs; vector outputs = func->outputs; getSortedProps(varFinder.varDecls, sortedProps, inputs, outputs); - out << printCallISPCFunc(func, varFinder.varDecls, sortedProps); + out << printCallISPCFunc(func->name, varFinder.varDecls, sortedProps); if (emittingCoroutine) { out << printContextDeclAndInit(varMap, localVars, numYields, func->name) @@ -788,51 +803,84 @@ static string getAtomicPragma() { // Docs for vectorization pragmas: // http://clang.llvm.org/docs/LanguageExtensions.html#extensions-for-loop-hint-optimizations void CodeGen_ISPC::visit(const For* op) { - switch (op->kind) { - // TODO - add ISPC based multi threaded execution handling - case LoopKind::Vectorized: - case LoopKind::Static: - case LoopKind::Dynamic: - case LoopKind::Runtime: - case LoopKind::Static_Chunked: - case LoopKind::Mul_Thread: - // op->start.accept(this); - // stream2 << std::endl; - // op->start.accept(this); - // stream2 << std::endl; - // op->start.accept(this); - // stream2 << std::endl; - // op->start.accept(this); - // stream2 << std::endl; - // op->end.accept(this); - // stream2 << std::endl; - // op->end.accept(this); - // stream2 << std::endl; - // op->end.accept(this); - // stream2 << std::endl; - default: - break; + if (!is_ISPC_code_stream_enabled()) { + CodeGen::visit(op); + return; } - doIndent(); - if (op->kind == LoopKind::Foreach) 
{ - stream2 << keywordString("foreach") << " ("; - // if (!emittingCoroutine) { - // if (op->var.type() == Int32) { - // stream << "int32 "; - // } - // else if (op->var.type() == Int64) { - // stream << "int64 "; - // } + if (op->kind == LoopKind::Mul_Thread) { + if (!taskCode) { + out2 << "launch[4] " << printCallISPCFunc(funcName+"__", varMap, sortedProps) << "\n"; + return; + } + stream2 << "uniform unsigned int chunk_size = ("; + op->end.accept(this); + stream2 << " - "; + op->start.accept(this); + stream2 << ") / taskCount;\n"; + stream2 << " uniform unsigned int modulo = ("; + op->end.accept(this); + stream2 << " - "; + op->start.accept(this); + stream2 << ") % taskCount;\n"; + + stream2 << " uniform unsigned int start = "; + op->start.accept(this); + stream2 << " + chunk_size * taskIndex;\n"; + + stream2 << " if (taskIndex != 0) {\n"; + stream2 << " start += modulo;\n"; + stream2 << " }\n"; + + stream2 << " uniform unsigned int end = start + chunk_size;\n"; + stream2 << " if (taskIndex == 0) {\n"; + stream2 << " end += modulo;\n"; + stream2 << " }\n\n"; + + stream2 << keywordString(" for") << " ("; + if (!emittingCoroutine) { + if (op->var.type() == Int32) { + stream2 << "int32 "; + } + else if (op->var.type() == Int64) { + stream2 << "int64 "; + } - // } + } + op->var.accept(this); + stream2 << " = "; + stream2 << "start"; + // op->start.accept(this); + stream2 << keywordString("; "); + op->var.accept(this); + stream2 << " < "; + parentPrecedence = BOTTOM; + stream2 << "end"; + // op->end.accept(this); + stream2 << keywordString("; "); + op->var.accept(this); + + auto lit = op->increment.as(); + if (lit != nullptr && ((lit->type.isInt() && lit->equalsScalar(1)) || + (lit->type.isUInt() && lit->equalsScalar(1)))) { + stream2 << "++"; + } + else { + stream2 << " += "; + op->increment.accept(this); + } + + } + + else if (op->kind == LoopKind::Foreach) { + stream2 << keywordString("foreach") << " ("; + op->var.accept(this); stream2 << " = "; 
op->start.accept(this); stream2 << keywordString(" ... "); op->end.accept(this); - stream2 << ") {\n"; } else { stream2 << keywordString("for") << " ("; @@ -865,9 +913,10 @@ void CodeGen_ISPC::visit(const For* op) { stream2 << " += "; op->increment.accept(this); } - stream2 << ") {\n"; + } + stream2 << ") {\n"; op->contents.accept(this); doIndent(); stream2 << "}"; @@ -934,33 +983,69 @@ void CodeGen_ISPC::visit(const Max* op) { void CodeGen_ISPC::visit(const Allocate* op) { string elementType = printCType(op->var.type(), false); - doIndent(); - op->var.accept(this); - stream << " = ("; - stream << elementType << "*"; - stream << ")"; - if (op->is_realloc) { - stream << "realloc("; + + if (is_ISPC_code_stream_enabled()) { + op->var.accept(this); - stream << ", "; - } - else { - // If the allocation was requested to clear the allocated memory, - // use calloc instead of malloc. - if (op->clear) { - stream << "calloc(1, "; - } else { - stream << "malloc("; + stream2 << " = "; + // stream2 << " = ("; + // stream2 << elementType << "*"; + // stream2 << ")"; + if (op->is_realloc) { + stream2 << "realloc("; + op->var.accept(this); + stream2 << ", "; } - } - stream << "sizeof(" << elementType << ")"; - stream << " * "; - parentPrecedence = MUL; - op->num_elements.accept(this); - parentPrecedence = TOP; - stream << ");"; + else { + // If the allocation was requested to clear the allocated memory, + // use calloc instead of malloc. 
+ if (op->clear) { + stream2 << "calloc(1, "; + } else { + stream2 << "new "; + } + } + stream2 << elementType << "["; + parentPrecedence = MUL; + op->num_elements.accept(this); + parentPrecedence = TOP; + stream2 << "];"; + stream2 << endl; + + + } else { + + op->var.accept(this); + stream << " = ("; + stream << elementType << "*"; + stream << ")"; + if (op->is_realloc) { + stream << "realloc("; + op->var.accept(this); + stream << ", "; + } + else { + // If the allocation was requested to clear the allocated memory, + // use calloc instead of malloc. + if (op->clear) { + stream << "calloc(1, "; + } else { + stream << "malloc("; + } + } + stream << "sizeof(" << elementType << ")"; + stream << " * "; + parentPrecedence = MUL; + op->num_elements.accept(this); + parentPrecedence = TOP; + stream << ");"; stream << endl; + + + } + + } void CodeGen_ISPC::visit(const Sqrt* op) { diff --git a/src/codegen/codegen_ispc.h b/src/codegen/codegen_ispc.h index 08e73b252..2e440abc0 100644 --- a/src/codegen/codegen_ispc.h +++ b/src/codegen/codegen_ispc.h @@ -2,6 +2,7 @@ #define TACO_BACKEND_ISPC_H #include #include +#include #include "taco/ir/ir.h" #include "taco/ir/ir_printer.h" @@ -44,24 +45,27 @@ class CodeGen_ISPC : public CodeGen { void visit(const Assign*); Stmt simplifyFunctionBodies(Stmt stmt); - std::string printCallISPCFunc(const Function *func, std::map varMap, + std::string printCallISPCFunc(const std::string& funcName, std::map varMap, std::vector &sortedProps); void printISPCFunc(const Function *func, std::map varMap, std::vector &sortedProps); std::map varMap; std::vector localVars; + bool taskCode = false; std::ostream &out; std::ostream &out2; OutputKind outputKind; std::string funcName; + std::stringstream funcVariables; + std::vector sortedProps; int labelCount; bool emittingCoroutine; class FindVars; - class DeviceFunctionCollector; + class FunctionCollector; private: virtual std::string restrictKeyword() const { return "restrict"; } diff --git 
a/src/ir/ir_printer.cpp b/src/ir/ir_printer.cpp index ba2bc894b..fa224bde4 100644 --- a/src/ir/ir_printer.cpp +++ b/src/ir/ir_printer.cpp @@ -333,10 +333,18 @@ void IRPrinter::visit(const Cast* op) { } void IRPrinter::visit(const Call* op) { - stream << op->func << "("; - parentPrecedence = Precedence::CALL; - acceptJoin(this, stream, op->args, ", "); - stream << ")"; + if (!is_ISPC_code_stream_enabled()) { + stream << op->func << "("; + parentPrecedence = Precedence::CALL; + acceptJoin(this, stream, op->args, ", "); + stream << ")"; + } else { + // statically added function to the ispc file has __ in the front + stream2 << "__" << op->func << "("; + parentPrecedence = Precedence::CALL; + acceptJoin(this, stream2, op->args, ", "); + stream2 << ")"; + } } void IRPrinter::visit(const IfThenElse* op) { @@ -716,7 +724,7 @@ void IRPrinter::visit(const VarDecl* op) { } taco_iassert(isa(op->var)); if (to(op->var)->is_ptr) { - stream2 << "* restrict"; + stream2 << "* "; // removed restrict keyword from here } stream2 << " "; string varName = varNameGenerator.getUniqueName(util::toString(op->var)); @@ -829,12 +837,22 @@ void IRPrinter::visit(const Allocate* op) { } void IRPrinter::visit(const Free* op) { - doIndent(); - stream << "free("; - parentPrecedence = Precedence::TOP; - op->var.accept(this); - stream << ");"; - stream << endl; + if (is_ISPC_code_stream_enabled()) { + doIndent(); + stream2 << "delete[] "; + parentPrecedence = Precedence::TOP; + op->var.accept(this); + stream2 << ";"; + stream2 << endl; + } + else { + doIndent(); + stream << "free("; + parentPrecedence = Precedence::TOP; + op->var.accept(this); + stream << ");"; + stream << endl; + } } void IRPrinter::visit(const Comment* op) { From 8a42b2f226cece4a8da21f06e548fe46bfc2e124 Mon Sep 17 00:00:00 2001 From: Adhhitha Dias Date: Wed, 8 Sep 2021 10:37:00 -0400 Subject: [PATCH 08/16] add test kernels sddmm, mttkrp, ttv, etc.. 
--- test/tests-scheduling-eval.cpp | 727 +++++++++++++++++++++++++++++++-- 1 file changed, 695 insertions(+), 32 deletions(-) diff --git a/test/tests-scheduling-eval.cpp b/test/tests-scheduling-eval.cpp index 4957418e0..59debc88e 100644 --- a/test/tests-scheduling-eval.cpp +++ b/test/tests-scheduling-eval.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include "taco/cuda.h" #include "test.h" #include "test_tensors.h" @@ -57,6 +58,31 @@ void printToFile(string filename, IndexStmt stmt) { source_file.close(); } +void printToFile(string filename, string additional_filename, IndexStmt stmt) { + stringstream source1; + stringstream source2; + + string file_path = "eval_generated/"; + mkdir(file_path.c_str(), 0777); + + std::shared_ptr codegen = ir::CodeGen::init_default(source1, source2, ir::CodeGen::ImplementationGen); + ir::Stmt compute = lower(stmt, "compute", false, true); + codegen->compile(compute, true); + + ofstream source_file; + string file_ending = should_use_CUDA_codegen() ? 
".cu" : ".c"; + source_file.open(file_path+filename+file_ending); + source_file << source1.str(); + source_file.close(); + + ofstream additional_source_file; + string additional_file_ending = ".ispc"; + additional_source_file.open(file_path+additional_filename+additional_file_ending); + additional_source_file << source2.str(); + additional_source_file.close(); + +} + IndexStmt scheduleSpMVCPU(IndexStmt stmt, int CHUNK_SIZE=16) { IndexVar i0("i0"), i1("i1"), kpos("kpos"), kpos0("kpos0"), kpos1("kpos1"); return stmt.split(i, i0, i1, CHUNK_SIZE) @@ -92,6 +118,16 @@ IndexStmt scheduleSpMMISPC1(IndexStmt stmt, Tensor A, int CHUNK_SIZE=16, .parallelize(k, ParallelUnit::CPUSimd, OutputRaceStrategy::IgnoreRaces); } +IndexStmt scheduleSpMMISPCOMP1(IndexStmt stmt, Tensor A, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { + IndexVar i0("i0"), i1("i1"), kbounded("kbounded"), k0("k0"), k1("k1"), jpos("jpos"), jpos0("jpos0"), jpos1("jpos1"); + return stmt.split(i, i0, i1, CHUNK_SIZE) + .pos(j, jpos, A(i,j)) + .split(jpos, jpos0, jpos1, UNROLL_FACTOR) + .reorder({i0, i1, jpos0, k, jpos1}) + .parallelize(i0, ParallelUnit::CPUSpmd, OutputRaceStrategy::NoRaces) + .parallelize(k, ParallelUnit::CPUSimd, OutputRaceStrategy::IgnoreRaces); +} + IndexStmt scheduleSpMMISPC1_2(IndexStmt stmt, Tensor A, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { IndexVar i0("i0"), i1("i1"), kbounded("kbounded"), k0("k0"), k1("k1"), jpos("jpos"), jpos0("jpos0"), jpos1("jpos1"); return stmt.split(i, i0, i1, CHUNK_SIZE) @@ -199,6 +235,27 @@ IndexStmt scheduleSDDMMCPU(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, .parallelize(kpos1, ParallelUnit::CPUVector, OutputRaceStrategy::ParallelReduction); } +IndexStmt scheduleSDDMMCSRCPU(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { + IndexVar i0("i0"), i1("i1"), kpos("kpos"), kpos0("kpos0"), kpos1("kpos1"); + return stmt; + // return stmt.split(i, i0, i1, CHUNK_SIZE) + // .pos(k, kpos, B(i,k)) + // .split(kpos, kpos0, kpos1, UNROLL_FACTOR) + // 
.reorder({i0, i1, kpos0, j, kpos1}); + // .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces); + // .parallelize(k, ParallelUnit::CPUVector, OutputRaceStrategy::IgnoreRaces); +} + +IndexStmt scheduleSDDMM2CPU(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { + IndexVar i0("i0"), i1("i1"), jpos("jpos"), jpos0("jpos0"), jpos1("jpos1"); + return stmt.split(i, i0, i1, CHUNK_SIZE) + .pos(j, jpos, B(i,j)) + .split(jpos, jpos0, jpos1, UNROLL_FACTOR) + .reorder({i0, i1, jpos0, k, jpos1}) + .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces) + .parallelize(jpos1, ParallelUnit::CPUVector, OutputRaceStrategy::ParallelReduction); +} + IndexStmt scheduleSDDMMISPC(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { IndexVar i0("i0"), i1("i1"), kpos("kpos"), kpos0("kpos0"), kpos1("kpos1"); return stmt.split(i, i0, i1, CHUNK_SIZE) @@ -209,6 +266,16 @@ IndexStmt scheduleSDDMMISPC(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, .parallelize(kpos1, ParallelUnit::CPUSimd, OutputRaceStrategy::ParallelReduction); } +IndexStmt scheduleSDDMM2ISPC(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { + IndexVar i0("i0"), i1("i1"), jpos("jpos"), jpos0("jpos0"), jpos1("jpos1"); + return stmt.split(i, i0, i1, CHUNK_SIZE) + .pos(j, jpos, B(i,j)) + .split(jpos, jpos0, jpos1, UNROLL_FACTOR) + .reorder({i0, i1, jpos0, k, jpos1}) + // .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces); + .parallelize(jpos1, ParallelUnit::CPUSimd, OutputRaceStrategy::ParallelReduction); +} + IndexStmt scheduleSDDMMISPC1(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { IndexVar i0("i0"), i1("i1"), kpos("kpos"), kpos0("kpos0"), kpos1("kpos1"); return stmt.split(i, i0, i1, CHUNK_SIZE) @@ -241,12 +308,12 @@ IndexStmt scheduleTTVCPU(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16) { IndexStmt scheduleTTVISPC(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16) { IndexVar f("f"), fpos("fpos"), chunk("chunk"), 
fpos2("fpos2"); - return stmt; - // return stmt.fuse(i, j, f) - // .pos(f, fpos, B(i,j,k)) - // .split(fpos, chunk, fpos2, CHUNK_SIZE) - // .reorder({chunk, fpos2, k}) - // .parallelize(chunk, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces); + // return stmt; + return stmt.fuse(i, j, f) + .pos(f, fpos, B(i,j,k)) + .split(fpos, chunk, fpos2, CHUNK_SIZE) + .reorder({chunk, fpos2, k}) + .parallelize(chunk, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces); } IndexStmt scheduleTTVCPUCSR(IndexStmt stmt) { @@ -258,6 +325,25 @@ IndexStmt scheduleTTVCPUCSR(IndexStmt stmt) { OutputRaceStrategy::NoRaces); } +IndexStmt scheduleTTVCPUCSR_ST(IndexStmt stmt) { + TensorVar result = stmt.as().getStmt().as().getStmt() + .as().getStmt().as().getLhs() + .getTensorVar(); + return stmt.assemble(result, AssembleStrategy::Insert); +} + +IndexStmt scheduleTTVISPCCSR(IndexStmt stmt) { + TensorVar result = stmt.as().getStmt().as().getStmt() + .as().getStmt().as().getLhs() + .getTensorVar(); + return stmt.assemble(result, AssembleStrategy::Insert) + .parallelize(i, ParallelUnit::CPUSimd, OutputRaceStrategy::NoRaces); +} + +IndexStmt scheduleTTVISPCCSR2(IndexStmt stmt) { + return stmt; +} + IndexStmt scheduleTTMCPU(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { IndexVar f("f"), fpos("fpos"), chunk("chunk"), fpos2("fpos2"), kpos("kpos"), kpos1("kpos1"), kpos2("kpos2"); return stmt.fuse(i, j, f) @@ -282,12 +368,47 @@ IndexStmt scheduleMTTKRPCPU(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, .parallelize(i1, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces); } +IndexStmt scheduleMTTKRPCPU_ST(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { + IndexVar i1("i1"), i2("i2"); + IndexExpr precomputeExpr = stmt.as().getStmt().as().getStmt() + .as().getStmt().as().getStmt() + .as().getRhs().as().getA(); + TensorVar w("w", Type(Float64, {Dimension(j)}), taco::dense); + return stmt.split(i, i1, i2, CHUNK_SIZE) + .reorder({i1, i2, k, l, j}) + 
.precompute(precomputeExpr, j, j, w); + // .parallelize(j, ParallelUnit::CPUVector, OutputRaceStrategy::Atomics); // gives error when lowering for IgnoreRaces, NoRaces and Atomics + // .parallelize(i1, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces); +} + +IndexStmt scheduleMTTKRPISPC(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { + IndexVar i1("i1"), i2("i2"); + IndexExpr precomputeExpr = stmt.as().getStmt().as().getStmt() + .as().getStmt().as().getStmt() + .as().getRhs().as().getA(); + TensorVar w("w", Type(Float64, {Dimension(j)}), taco::dense); + return stmt.split(i, i1, i2, CHUNK_SIZE) + .reorder({i1, i2, k, l, j}) + .precompute(precomputeExpr, j, j, w) + .parallelize(j, ParallelUnit::CPUSimd, OutputRaceStrategy::NoRaces); +} + IndexStmt scheduleMTTKRPPrecomputedCPU(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { IndexVar i1("i1"), i2("i2"), j_pre("j_pre"); return stmt.split(i, i1, i2, CHUNK_SIZE) .parallelize(i1, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces); } +IndexStmt scheduleMTTKRPPrecomputedCPU_ST(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { + IndexVar i1("i1"), i2("i2"), j_pre("j_pre"); + return stmt.split(i, i1, i2, CHUNK_SIZE); +} + +IndexStmt scheduleMTTKRPPrecomputedISPC_ST(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { + IndexVar i1("i1"), i2("i2"), j_pre("j_pre"); + return stmt.parallelize(j, ParallelUnit::CPUSimd, OutputRaceStrategy::NoRaces); +} + IndexStmt scheduleMTTKRP4CPU(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { IndexVar i1("i1"), i2("i2"); return stmt.split(i, i1, i2, CHUNK_SIZE) @@ -295,6 +416,19 @@ IndexStmt scheduleMTTKRP4CPU(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16 .parallelize(i1, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces); } +IndexStmt scheduleMTTKRP4CPU_ST(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { + IndexVar i1("i1"), i2("i2"); + return stmt.split(i, i1, i2, CHUNK_SIZE) 
+ .reorder({i1, i2, k, l, m, j}); +} + +IndexStmt scheduleMTTKRP4ISPC_ST(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { + IndexVar i1("i1"), i2("i2"); + return stmt.split(i, i1, i2, CHUNK_SIZE) + .reorder({i1, i2, k, l, m, j}) + .parallelize(j, ParallelUnit::CPUSimd, OutputRaceStrategy::NoRaces); +} + IndexStmt scheduleMTTKRP5CPU(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { IndexVar i1("i1"), i2("i2"); return stmt.split(i, i1, i2, CHUNK_SIZE) @@ -1024,7 +1158,7 @@ TEST(scheduling_eval, sddmmCPU) { IndexStmt stmt = A.getAssignment().concretize(); stmt = scheduleSDDMMCPU(stmt, B); - //printToFile("sddmm_cpu", stmt); + printToFile("sddmm_cpu_ryan2", stmt); A.compile(stmt); A.assemble(); @@ -1038,6 +1172,126 @@ TEST(scheduling_eval, sddmmCPU) { ASSERT_TENSOR_EQ(expected, A); } + +TEST(scheduling_eval, sddmmcsrCPU) { + if (should_use_CUDA_codegen()) { + return; + } + int NUM_I = 1021/10; + int NUM_J = 1039/10; + int NUM_K = 1057/10; + float SPARSITY = .3; + Tensor A("A", {NUM_I, NUM_K}, CSR); + Tensor B("B", {NUM_I, NUM_K}, CSR); + Tensor C("C", {NUM_I, NUM_J}, {Dense, Dense}); + Tensor D("D", {NUM_J, NUM_K}, {Dense, Dense}); + + srand(268238); + for (int i = 0; i < NUM_I; i++) { + for (int j = 0; j < NUM_J; j++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + C.insert({i, j}, (double) ((int) (rand_float*3/SPARSITY))); + } + } + + for (int i = 0; i < NUM_I; i++) { + for (int k = 0; k < NUM_K; k++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + if (rand_float < SPARSITY) { + B.insert({i, k}, (double) ((int) (rand_float*3/SPARSITY))); + } + } + } + + for (int j = 0; j < NUM_J; j++) { + for (int k = 0; k < NUM_K; k++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + D.insert({j, k}, (double) ((int) (rand_float*3/SPARSITY))); + } + } + + B.pack(); + C.pack(); + D.pack(); + + A(i,k) = B(i,k) * C(i,j) * D(j,k); + + IndexStmt stmt = A.getAssignment().concretize(); + stmt = scheduleSDDMMCSRCPU(stmt, B); + 
+ printToFile("sddmm_cpu", stmt); + + A.compile(stmt); + A.assemble(); + A.compute(); + + Tensor expected("expected", {NUM_I, NUM_K}, CSR); + expected(i,k) = B(i,k) * C(i,j) * D(j,k); + + IndexStmt stmt_ref = expected.getAssignment().concretize(); + printToFile("sddmm_cpu_ref", stmt_ref); + + expected.compile(stmt_ref); + expected.assemble(); + expected.compute(); + ASSERT_TENSOR_EQ(expected, A); +} + + +TEST(scheduling_eval, sddmm2CPU) { + if (should_use_CUDA_codegen()) { + return; + } + int NUM_I = 1021/10; + int NUM_J = 1021/10; + int NUM_K = 18; + float SPARSITY = .3; + Tensor Y("Y", {NUM_I, NUM_J}, CSR); + Tensor A("A", {NUM_I, NUM_J}, CSR); + Tensor X("X", {NUM_I, NUM_K}, {Dense, Dense}); + + srand(268238); + + for (int i = 0; i < NUM_I; i++) { + for (int j = 0; j < NUM_J; j++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + if (rand_float < SPARSITY) { + A.insert({i, j}, (double) ((int) (rand_float*3/SPARSITY))); + } + } + } + + for (int i = 0; i < NUM_J; i++) { + for (int k = 0; k < NUM_K; k++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + X.insert({i, k}, (double) ((int) (rand_float*3/SPARSITY))); + } + } + + A.pack(); + X.pack(); + + Y(i,j) = A(i,j) * X(i,k) * X(j,k); + + IndexStmt stmt = A.getAssignment().concretize(); + // stmt = scheduleSDDMMCPU(stmt, B); + + //printToFile("sddmm_cpu", stmt); + + A.compile(stmt); + A.assemble(); + A.compute(); + + Tensor expected("expected", {NUM_I, NUM_J}, {Dense, Dense}); + expected(i,j) = A(i,j) * X(i,k) * X(j,k); + expected.compile(); + expected.assemble(); + expected.compute(); + ASSERT_TENSOR_EQ(expected, A); +} + + + // bin/taco-test --gtest_filter=scheduling_eval.sddmmISPC TEST(scheduling_eval, sddmmISPC) { @@ -1128,6 +1382,89 @@ TEST(scheduling_eval, sddmmISPC) { } + +// bin/taco-test --gtest_filter=scheduling_eval.sddmmISPC +TEST(scheduling_eval, sddmm2ISPC) { + + taco::util::TimeResults timevalue; + bool time = true; + + set_CUDA_codegen_enabled(false); + 
set_ISPC_codegen_enabled(false); + + int NUM_I = 1021/10; + int NUM_K = 1039/10; + int NUM_J = 1021/10; + float SPARSITY = .3; + Tensor A("A", {NUM_I, NUM_J}, {Dense, Dense}); + Tensor B("B", {NUM_I, NUM_J}, CSR); + Tensor C("C", {NUM_I, NUM_K}, {Dense, Dense}); + + srand(268238); + for (int i = 0; i < NUM_I; i++) { + for (int k = 0; k < NUM_K; k++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + C.insert({i, k}, (double) ((int) (rand_float*3/SPARSITY))); + } + } + + for (int i = 0; i < NUM_I; i++) { + for (int j = 0; j < NUM_J; j++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + if (rand_float < SPARSITY) { + B.insert({i, j}, (double) ((int) (rand_float*3/SPARSITY))); + } + } + } + + B.pack(); + C.pack(); + + set_ISPC_codegen_enabled(true); + A(i,j) = B(i,j) * C(i,k) * C(j,k); + + IndexStmt stmt = A.getAssignment().concretize(); + stmt = scheduleSDDMM2ISPC(stmt, B); + + //printToFile("sddmm_cpu", stmt); + + A.compile(stmt); + A.assemble(); + // A.compute(); + + set_ISPC_codegen_enabled(false); + Tensor expected("expected", {NUM_I, NUM_J}, {Dense, Dense}); + expected(i,j) = B(i,j) * C(i,k) * C(j,k); + IndexStmt stmt_taco = A.getAssignment().concretize(); + stmt_taco = scheduleSDDMM2CPU(stmt_taco, B); + expected.compile(stmt_taco); + expected.assemble(); + // expected.compute(); + + TOOL_BENCHMARK_TIMER(A.compute(), "Compute ISPC: ", timevalue); + TOOL_BENCHMARK_TIMER(expected.compute(), "Compute TACO: ", timevalue); + + ASSERT_TENSOR_EQ(expected, A); + + + float ERROR_MARGIN = 0.01; + // ASSERT_TENSOR_VAL(expected, y); + for (int i = 0; i < NUM_I; i++) { + for (int j = 0; j < NUM_J; j++) { + if (expected(i,j) <= A(i,j) + ERROR_MARGIN && expected(i,j) >= A(i,j) - ERROR_MARGIN) { + // std::cout << "matched values: expected -> " << expected(j) << " == " << y(j) << " <- actual\n"; + } + else { + std::cout << "unmatched values: expected -> " << expected(i,j) << " != " << A(i,j) << " <- actual\n"; + ASSERT_TRUE(false); + }; + } + } + std::cout << 
"test scheduling_eval.sddmmISPC passed\n"; + +} + + TEST(scheduling_eval, spmvCPU) { if (should_use_CUDA_codegen()) { return; @@ -1215,9 +1552,9 @@ TEST(scheduling_eval, spmvISPC) { y(i) = A(i, j) * x(j); IndexStmt stmt = y.getAssignment().concretize(); - stmt = scheduleSpMVISPC(stmt); + // stmt = scheduleSpMVISPC(stmt); - //printToFile("spmv_cpu", stmt); + printToFile("spmv_cpu", stmt); y.compile(stmt); y.assemble(); @@ -1307,7 +1644,7 @@ TEST(scheduling_eval, ttvCPU) { IndexStmt stmt = A.getAssignment().concretize(); stmt = scheduleTTVCPU(stmt, B); - //printToFile("ttv_cpu", stmt); + printToFile("ttv_cpu", stmt); A.compile(stmt); A.assemble(); @@ -1362,7 +1699,7 @@ TEST(scheduling_eval, ttvISPC) { IndexStmt stmt = A.getAssignment().concretize(); stmt = scheduleTTVISPC(stmt, B); - //printToFile("ttv_cpu", stmt); + printToFile("ttv_ispc", "__ttv_ispc", stmt); A.compile(stmt); A.assemble(); @@ -1390,7 +1727,7 @@ TEST(scheduling_eval, ttvCPU_CSR) { int NUM_K = 1057/10; float SPARSITY = .3; Tensor A("A", {NUM_I, NUM_J}, {Dense, Sparse}); - Tensor B("B", {NUM_I, NUM_J, NUM_K}, {Sparse, Sparse, Sparse}); + Tensor B("B", {NUM_I, NUM_J, NUM_K}, {Dense, Sparse, Sparse}); Tensor c("c", {NUM_K}, Format({Dense})); srand(9536); @@ -1418,11 +1755,13 @@ TEST(scheduling_eval, ttvCPU_CSR) { IndexStmt stmt = A.getAssignment().concretize(); stmt = scheduleTTVCPUCSR(stmt); + printToFile("ttv_cpu_csr", stmt); + A.compile(stmt); A.assemble(); A.compute(); - Tensor expected("expected", {NUM_I, NUM_J}, {Dense, Dense}); + Tensor expected("expected", {NUM_I, NUM_J}, {Dense, Sparse}); expected(i,j) = B(i,j,k) * c(k); expected.compile(); expected.assemble(); @@ -1430,6 +1769,82 @@ TEST(scheduling_eval, ttvCPU_CSR) { ASSERT_TENSOR_EQ(expected, A); } +TEST(scheduling_eval, ttvISPC_CSR) { + if (should_use_CUDA_codegen()) { + return; + } + + int NUM_I = 10000; + int NUM_J = 1039/10; + int NUM_K = 128; + float SPARSITY = .3; + Tensor A("A", {NUM_I, NUM_J}, {Dense, Sparse}); + Tensor B("B", 
{NUM_I, NUM_J, NUM_K}, {Dense, Sparse, Sparse}); + Tensor c("c", {NUM_K}, Format({Dense})); + + srand(9536); + for (int i = 0; i < NUM_I; i++) { + for (int j = 0; j < NUM_J; j++) { + for (int k = 0; k < NUM_K; k++) { + float rand_float = (float) rand() / (float) (RAND_MAX); + if (rand_float < SPARSITY) { + B.insert({i, j, k}, (double) ((int) (rand_float * 3 / SPARSITY))); + } + } + } + } + + for (int k = 0; k < NUM_K; k++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + c.insert({k}, (double) ((int) (rand_float*3))); + } + + B.pack(); + c.pack(); + + set_ISPC_codegen_enabled(true); + A(i,j) = B(i,j,k) * c(k); + + IndexStmt stmt = A.getAssignment().concretize(); + stmt = scheduleTTVISPCCSR(stmt); + printToFile("ttv_ispc_csr", "__ttv_ispc_csr", stmt); + + A.compile(stmt); + A.assemble(); + A.compute(); + + set_ISPC_codegen_enabled(false); + Tensor expected("expected", {NUM_I, NUM_J}, {Dense, Sparse}); + expected(i,j) = B(i,j,k) * c(k); + IndexStmt taco_stmt = expected.getAssignment().concretize(); + taco_stmt = scheduleTTVCPUCSR_ST(taco_stmt); + expected.compile(taco_stmt); + expected.assemble(); + expected.compute(); + ASSERT_TENSOR_EQ(expected, A); + + Tensor A2("A2", {NUM_I, NUM_J}, {Dense, Sparse}); + set_ISPC_codegen_enabled(true); + A2(i,j) = B(i,j,k) * c(k); + + IndexStmt stmt2 = A2.getAssignment().concretize(); + + A2.compile(stmt2); + A2.assemble(); + A2.compute(); + + taco::util::TimeResults timevalue; + bool time = true; + + for (int i=0; i<3; i++) { + TOOL_BENCHMARK_TIMER(expected.compute(), "Compute TACO1: ", timevalue); + TOOL_BENCHMARK_TIMER(A.compute(), "Compute ISPC1: ", timevalue); + TOOL_BENCHMARK_TIMER(A2.compute(), "Compute ISPC2: ", timevalue); + } + + +} + TEST(scheduling_eval, ttmCPU) { if (should_use_CUDA_codegen()) { return; @@ -1605,12 +2020,13 @@ TEST(scheduling_eval, mttkrpISPC) { if (should_use_CUDA_codegen()) { return; } - int NUM_I = 1021/20; - int NUM_J = 1039/20; + set_ISPC_codegen_enabled(false); + 
set_CUDA_codegen_enabled(false); + int NUM_I = 10000; // 1021/20; + int NUM_J = 256; int NUM_K = 1057/20; int NUM_L = 1232/20; float SPARSITY = .1; - Tensor A("A", {NUM_I, NUM_J}, {Dense, Dense}); Tensor B("B", {NUM_I, NUM_K, NUM_L}, {Dense, Sparse, Sparse}); Tensor C("C", {NUM_K, NUM_J}, {Dense, Dense}); Tensor D("D", {NUM_L, NUM_J}, {Dense, Dense}); @@ -1645,24 +2061,183 @@ TEST(scheduling_eval, mttkrpISPC) { C.pack(); D.pack(); - A(i,j) = B(i,k,l) * C(k,j) * D(l,j); + set_ISPC_codegen_enabled(true); - IndexStmt stmt = A.getAssignment().concretize(); - stmt = scheduleMTTKRPCPU(stmt, B); - //printToFile("mttkrp_cpu", stmt); + Tensor A1("A1", {NUM_I, NUM_J}, {Dense, Dense}); + A1(i,j) = B(i,k,l) * C(k,j) * D(l,j); + IndexStmt stmt1 = A1.getAssignment().concretize(); + stmt1 = scheduleMTTKRPISPC(stmt1, B); + // printToFile("mttkrp1_cpu_ispc", stmt1); + A1.compile(stmt1); + A1.assemble(); + A1.compute(); - A.compile(stmt); - A.assemble(); - A.compute(); + set_ISPC_codegen_enabled(false); + Tensor expected1("expected1", {NUM_I, NUM_J}, {Dense, Dense}); + expected1(i,j) = B(i,k,l) * C(k,j) * D(l,j); + IndexStmt taco_stmt1 = expected1.getAssignment().concretize(); + taco_stmt1 = scheduleMTTKRPCPU(taco_stmt1, B); + expected1.compile(taco_stmt1); + expected1.assemble(); + expected1.compute(); + ASSERT_TENSOR_EQ(expected1, A1); - Tensor expected("expected", {NUM_I, NUM_J}, {Dense, Dense}); - expected(i,j) = B(i,k,l) * C(k,j) * D(l,j); - expected.compile(); - expected.assemble(); - expected.compute(); - ASSERT_TENSOR_EQ(expected, A); + set_ISPC_codegen_enabled(true); + Tensor A2("A2", {NUM_I, NUM_J}, {Dense, Dense}); + A2(i,j) = B(i,k,l) * C(k,j) * D(l,j); + IndexStmt stmt2 = A1.getAssignment().concretize(); + stmt2 = scheduleMTTKRPPrecomputedISPC_ST(stmt2, B); + // printToFile("mttkrp_cpu_ispc", stmt); + A2.compile(stmt2); + A2.assemble(); + A2.compute(); + ASSERT_TENSOR_EQ(expected1, A2); + + set_ISPC_codegen_enabled(false); + Tensor expected2("expected2", {NUM_I, NUM_J}, 
{Dense, Dense}); + expected2(i,j) = B(i,k,l) * C(k,j) * D(l,j); + IndexStmt taco_stmt2 = expected2.getAssignment().concretize(); + taco_stmt2 = scheduleMTTKRPPrecomputedCPU_ST(taco_stmt2, B); + expected2.compile(taco_stmt2); + expected2.assemble(); + expected2.compute(); + ASSERT_TENSOR_EQ(expected1, expected2); + + taco::util::TimeResults timevalue; + bool time = true; + + for (int i=0; i<3; i++) { + TOOL_BENCHMARK_TIMER(expected1.compute(), "Compute TACO1: ", timevalue); + TOOL_BENCHMARK_TIMER(A1.compute(), "Compute ISPC1: ", timevalue); + TOOL_BENCHMARK_TIMER(expected2.compute(), "Compute TACO2: ", timevalue); + TOOL_BENCHMARK_TIMER(A2.compute(), "Compute ISPC2: ", timevalue); + } } + +TEST(scheduling_eval, mttkrp4ISPC) { + if (should_use_CUDA_codegen()) { + return; + } + set_ISPC_codegen_enabled(false); + set_CUDA_codegen_enabled(false); + int NUM_I = 1000; // 1021/20; + int NUM_J = 16; + int NUM_K = 1057/20; + int NUM_L = 1232/20; + int NUM_M = 1124/20; + float SPARSITY = .1; + Tensor B("B", {NUM_I, NUM_K, NUM_L, NUM_M}, {Dense, Sparse, Sparse, Sparse}); + Tensor C("C", {NUM_K, NUM_J}, {Dense, Dense}); + Tensor D("D", {NUM_L, NUM_J}, {Dense, Dense}); + Tensor E("E", {NUM_M, NUM_J}, {Dense, Dense}); + + srand(549694); + for (int i = 0; i < NUM_I; i++) { + for (int k = 0; k < NUM_K; k++) { + for (int l = 0; l < NUM_L; l++) { + for (int m = 0; m < NUM_M; m++) { + float rand_float = (float) rand() / (float) (RAND_MAX); + if (rand_float < SPARSITY) { + B.insert({i, k, l, m}, (double) ((int) (rand_float * 3 / SPARSITY))); + } + } + } + } + } + + for (int k = 0; k < NUM_K; k++) { + for (int j = 0; j < NUM_J; j++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + C.insert({k, j}, (double) ((int) (rand_float*3))); + } + } + + for (int l = 0; l < NUM_L; l++) { + for (int j = 0; j < NUM_J; j++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + D.insert({l, j}, (double) ((int) (rand_float*3))); + } + } + + for (int m = 0; m < NUM_M; m++) { + for (int j = 
0; j < NUM_J; j++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + E.insert({m, j}, (double) ((int) (rand_float*3))); + } + } + + B.pack(); + C.pack(); + D.pack(); + E.pack(); + + set_ISPC_codegen_enabled(true); + Tensor A1("A1", {NUM_I, NUM_J}, {Dense, Dense}); + A1(i,j) = B(i,k,l,m) * C(k,j) * D(l,j) * E(m,j); + IndexStmt stmt1 = A1.getAssignment().concretize(); + stmt1 = scheduleMTTKRP4ISPC_ST(stmt1, B); + // printToFile("mttkrp1_cpu_ispc", stmt1); + A1.compile(stmt1); + A1.assemble(); + A1.compute(); + + set_ISPC_codegen_enabled(false); + Tensor expected1("expected1", {NUM_I, NUM_J}, {Dense, Dense}); + expected1(i,j) = B(i,k,l,m) * C(k,j) * D(l,j) * E(m,j); + IndexStmt taco_stmt1 = expected1.getAssignment().concretize(); + taco_stmt1 = scheduleMTTKRP4CPU_ST(taco_stmt1, B); + expected1.compile(taco_stmt1); + expected1.assemble(); + expected1.compute(); + ASSERT_TENSOR_EQ(expected1, A1); + + // set_ISPC_codegen_enabled(true); + // Tensor A2("A2", {NUM_I, NUM_J}, {Dense, Dense}); + // A2(i,j) = B(i,k,l) * C(k,j) * D(l,j); + // IndexStmt stmt2 = A1.getAssignment().concretize(); + // stmt2 = scheduleMTTKRPPrecomputedISPC_ST(stmt2, B); + // // printToFile("mttkrp_cpu_ispc", stmt); + // A2.compile(stmt2); + // A2.assemble(); + // A2.compute(); + // ASSERT_TENSOR_EQ(expected1, A2); + + set_ISPC_codegen_enabled(false); + Tensor expected2("expected2", {NUM_I, NUM_J}, {Dense, Dense}); + expected2(i,j) = B(i,k,l,m) * C(k,j) * D(l,j) * E(m,j); + + IndexExpr BE = B(i,k,l,m) * E(m,j); + IndexExpr BDE = BE * D(l, j); + expected2(i,j) = BDE * C(k,j); + IndexStmt taco_stmt2 = expected2.getAssignment().concretize(); + TensorVar BE_workspace("BE_workspace", Type(Float64, {Dimension(j)}), taco::dense); + TensorVar BDE_workspace("BDE_workspace", Type(Float64, {Dimension(j)}), taco::dense); + + IndexStmt precomputed_stmt = forall(i, forall(k, + where(forall(j, expected2(i,j) += BDE_workspace(j) * C(k,j)), + forall(l, where(forall(j, BDE_workspace(j) += BE_workspace(j) * 
D(l,j)), + forall(m, forall(j, BE_workspace(j) += B(i,k,l,m) * E(m,j)))))))); + + // IndexStmt scheduled2 = scheduleMTTKRPPrecomputedCPU(precomputed_stmt, B, 64); + // expected2.compile(scheduled2); + // expected2.assemble(); + // expected2.compute(); + // ASSERT_TENSOR_EQ(expected1, expected2); + + taco::util::TimeResults timevalue; + bool time = true; + + for (int i=0; i<3; i++) { + TOOL_BENCHMARK_TIMER(expected1.compute(), "Compute TACO1: ", timevalue); + TOOL_BENCHMARK_TIMER(A1.compute(), "Compute ISPC1: ", timevalue); + // TOOL_BENCHMARK_TIMER(expected2.compute(), "Compute TACO2: ", timevalue); + // TOOL_BENCHMARK_TIMER(A2.compute(), "Compute ISPC2: ", timevalue); + } +} + + + TEST(scheduling_eval, spmvGPU) { if (!should_use_CUDA_codegen()) { return; @@ -2042,7 +2617,7 @@ TEST(scheduling_eval, mttkrpGPU) { ASSERT_TENSOR_EQ(expected, A); } -TEST(generate_ispc_evaluation_files, ispc) { +TEST(generate_evaluation_files, ispc) { std::cout << "Hi Adhitha!\n" << std::endl ; set_CUDA_codegen_enabled(false); set_ISPC_codegen_enabled(true); @@ -2063,6 +2638,7 @@ TEST(generate_ispc_evaluation_files, ispc) { int NUM_I = 100; int NUM_J = 100; int NUM_K = 100; + int NUM_L = 100; string c_file_ending = ".h"; string file_ending = ".ispc"; @@ -2130,7 +2706,35 @@ TEST(generate_ispc_evaluation_files, ispc) { ispc_source_file.close(); } - // spmm + // spmm omp + { + stringstream source1; + stringstream source2; + std::shared_ptr codegen = ir::CodeGen::init_default(source1, source2, ir::CodeGen::ImplementationGen); + Tensor A("A", {NUM_I, NUM_J}, CSR); + Tensor X("X", {NUM_J, NUM_K}, {Dense, Dense}); + Tensor Y("Y", {NUM_I, NUM_K}, {Dense, Dense}); + Y(i, k) = A(i, j) * X(j, k); + IndexStmt stmt = Y.getAssignment().concretize(); + bool isFirst = true; + for (auto paramSet : spmm_parameters) { + IndexStmt scheduled = scheduleSpMMISPCOMP1(stmt, A, paramSet[0], paramSet[1]); + ir::Stmt compute = lower(scheduled, string("compute1_") + util::join(paramSet, "_"), false, true); + 
codegen->compile(compute, isFirst); + isFirst = false; + } + ofstream source_file; + source_file.open(file_path + "spmm_omp_ispc_taco1" + c_file_ending); + source_file << source1.str(); + source_file.close(); + + ofstream ispc_source_file; + ispc_source_file.open(file_path + "__spmm_omp_ispc_taco1" + file_ending); + ispc_source_file << source2.str(); + ispc_source_file.close(); + } + + // spmm2 { stringstream source1; stringstream source2; @@ -2186,6 +2790,64 @@ TEST(generate_ispc_evaluation_files, ispc) { ispc_source_file.close(); } + // ttv + { + stringstream source; + stringstream source2; + std::shared_ptr codegen = ir::CodeGen::init_default(source, source2, ir::CodeGen::ImplementationGen); + Tensor A("A", {NUM_I, NUM_J}, {Dense, Dense}); // TODO: change to sparse outputs + Tensor B("B", {NUM_I, NUM_J, NUM_K}, {Sparse, Sparse, Sparse}); + Tensor c("c", {NUM_K}, Format({Dense})); + A(i,j) = B(i,j,k) * c(k); + IndexStmt stmt = A.getAssignment().concretize(); + bool isFirst = true; + for (auto paramSet : ttv_parameters) { + IndexStmt scheduled = scheduleTTVCPU(stmt, B, paramSet[0]); + ir::Stmt compute = lower(scheduled, string("compute_") + util::join(paramSet, "_"), false, true); + codegen->compile(compute, isFirst); + isFirst = false; + } + ofstream source_file; + source_file.open(file_path + "ttv_cpu" + c_file_ending); + source_file << source.str(); + source_file.close(); + + ofstream ispc_source_file; + ispc_source_file.open(file_path + "__ttv_cpu" + file_ending); + ispc_source_file << source2.str(); + ispc_source_file.close(); + } + + + // mttkrp3 + { + stringstream source; + stringstream source2; + std::shared_ptr codegen = ir::CodeGen::init_default(source, source2, ir::CodeGen::ImplementationGen); + Tensor A("A", {NUM_I, NUM_J}, {Dense, Dense}); + Tensor B("B", {NUM_I, NUM_K, NUM_L}, {Dense, Sparse, Sparse}); + Tensor C("C", {NUM_K, NUM_J}, {Dense, Dense}); + Tensor D("D", {NUM_L, NUM_J}, {Dense, Dense}); + A(i,j) = B(i,k,l) * C(k,j) * D(l,j); + IndexStmt 
stmt = A.getAssignment().concretize(); + bool isFirst = true; + for (auto paramSet : mttkrp_parameters) { + IndexStmt scheduled = scheduleMTTKRPCPU(stmt, B, paramSet[0], paramSet[1]); + ir::Stmt compute = lower(scheduled, string("compute_") + util::join(paramSet, "_"), false, true); + codegen->compile(compute, isFirst); + isFirst = false; + } + ofstream source_file; + source_file.open(file_path + "mttkrp3_cpu" + c_file_ending); + source_file << source.str(); + source_file.close(); + + ofstream ispc_source_file; + ispc_source_file.open(file_path + "__mttkrp3_cpu" + file_ending); + ispc_source_file << source2.str(); + ispc_source_file.close(); + } + return; } @@ -2283,6 +2945,7 @@ TEST(generate_ispc_sddmm_evaluation_files, ispc) { + TEST(generate_evaluation_files, cpu) { if (should_use_CUDA_codegen()) { return; @@ -2599,7 +3262,7 @@ TEST(generate_evaluation_files, cpu) { } } -TEST(generate_evaluation_files_spmv, ispc) { +TEST(generate_evaluation_files, spmv_ispc) { set_CUDA_codegen_enabled(false); set_ISPC_codegen_enabled(true); From 09864add784e06ca0b6eee6728ea0e11923f2540 Mon Sep 17 00:00:00 2001 From: Adhhitha Dias Date: Thu, 3 Mar 2022 14:08:23 -0500 Subject: [PATCH 09/16] fuse kernel implementation --- CMakeLists.txt | 41 +- include/taco/codegen/module.h | 12 +- include/taco/index_notation/transformations.h | 3 +- include/taco/taco_tensor_t.h | 1 + include/taco/tensor.h | 2 + src/codegen/codegen_c.cpp | 22 +- src/codegen/codegen_c.h | 29 +- src/codegen/codegen_ispc.cpp | 112 +- src/codegen/codegen_ispc.h | 13 +- src/codegen/module.cpp | 60 +- src/index_notation/index_notation.cpp | 19 +- src/index_notation/transformations.cpp | 684 ++- src/ir/ir_printer.cpp | 2 +- src/ir/ir_rewriter.cpp | 2 +- src/lower/iteration_graph.cpp | 10 + src/lower/iterator.cpp | 3 + src/lower/lowerer_impl_imperative.cpp | 176 +- src/lower/tensor_path.h | 4 +- src/tensor.cpp | 62 +- test/CMakeLists.txt | 1 + test/kernels/mttkrp_gemm/mttkrp_ryan.c | 177 + 
test/kernels/mttkrp_gemm/mttkrp_ryan.h | 125 + test/kernels/mttkrp_gemm/taco_default.c | 183 + test/kernels/mttkrp_gemm/taco_default.h | 125 + .../sddmm_spmm/csr_dense_dense_sddmm.c | 199 + .../sddmm_spmm/csr_dense_dense_sddmm.h | 125 + .../sddmm_spmm/csr_dense_dense_sddmm.so | Bin 0 -> 14360 bytes test/kernels/sddmm_spmm/csr_dense_spmm.c | 190 + test/kernels/sddmm_spmm/csr_dense_spmm.h | 125 + test/kernels/sddmm_spmm/csr_dense_spmm.so | Bin 0 -> 14520 bytes test/kernels/sddmm_spmm/fused_kernel.c | 183 + test/kernels/sddmm_spmm/fused_kernel.h | 125 + test/kernels/sddmm_spmm/fused_kernel.so | Bin 0 -> 14512 bytes test/kernels/sddmm_spmm/sddmm_ryan.c | 210 + test/kernels/sddmm_spmm/sddmm_ryan.h | 125 + test/kernels/sddmm_spmm/sddmm_ryan.so | Bin 0 -> 14352 bytes test/kernels/sddmm_spmm/taco_original.c | 166 + test/kernels/sddmm_spmm/taco_original.h | 125 + test/kernels/sddmm_spmm/taco_original.so | Bin 0 -> 14304 bytes test/kernels/spmm_gemm/gemm_default.c | 160 + test/kernels/spmm_gemm/gemm_default.h | 125 + test/kernels/spmm_gemm/gemm_default.so | Bin 0 -> 14296 bytes test/kernels/spmm_gemm/gemm_template.c | 183 + test/kernels/spmm_gemm/gemm_template.h | 125 + test/kernels/spmm_gemm/gemm_template.so | Bin 0 -> 14512 bytes test/kernels/spmv_spmv/spmv_fused.c | 178 + test/kernels/spmv_spmv/spmv_fused.h | 125 + test/kernels/spmv_spmv/spmv_fused.so | Bin 0 -> 14152 bytes test/kernels/spmv_spmv/spmv_spmv_default.c | 157 + test/kernels/spmv_spmv/spmv_spmv_default.h | 125 + test/kernels/ttm_ttm/fused copy.c | 248 + test/kernels/ttm_ttm/fused.c | 242 + test/kernels/ttm_ttm/fused.h | 125 + test/kernels/ttm_ttm/fused.so | Bin 0 -> 14560 bytes test/kernels/ttm_ttm/gemm.c | 181 + test/kernels/ttm_ttm/gemm.h | 125 + test/kernels/ttm_ttm/ttm1_1.c | 219 + test/kernels/ttm_ttm/ttm1_1.h | 125 + test/kernels/ttm_ttm/ttm1_1.so | Bin 0 -> 14400 bytes test/kernels/ttm_ttm/ttm1_2.c | 219 + test/kernels/ttm_ttm/ttm1_2.h | 125 + test/kernels/ttm_ttm/ttm1_2.so | Bin 0 -> 14400 bytes 
test/kernels/ttm_ttm/ttm2.c | 218 + test/kernels/ttm_ttm/ttm2.h | 125 + test/kernels/ttm_ttm/ttm2.so | Bin 0 -> 14400 bytes test/kernels/ttm_ttm/ttm_original copy 2.c | 242 + test/kernels/ttm_ttm/ttm_original copy.c | 225 + test/kernels/ttm_ttm/ttm_original.c | 226 + test/kernels/ttm_ttm/ttm_original.h | 125 + test/kernels/ttm_ttm/ttm_original.so | Bin 0 -> 14408 bytes test/kernels/ttm_ttm/ttm_original2.c | 229 + test/kernels/ttm_ttm/ttm_original2.h | 125 + test/kernels/ttm_ttm/ttm_original2.so | Bin 0 -> 14568 bytes test/stats/hadamard-gemm.txt | 749 +++ test/stats/mttkrp-spmm.txt | 1090 ++++ test/stats/sddmm-spmm-gemm.txt | 1153 ++++ test/stats/sddmm-spmm.txt | 5174 +++++++++++++++++ test/stats/spmm-spmm.txt | 3432 +++++++++++ test/stats/spmv-spmv.txt | 81 + test/stats/ttm-ttm.txt | 2924 ++++++++++ test/tests-indexstmt.cpp | 194 +- test/tests-scheduling-eval.cpp | 241 +- test/tests-scheduling-fuse.cpp | 2872 +++++++++ test/tests-scheduling-ispc-eval.cpp | 2 + test/tests-transformation.cpp | 2 + test/util.h | 113 + tools/CMakeLists.txt | 1 + tools/taco.cpp | 44 +- 88 files changed, 25490 insertions(+), 325 deletions(-) create mode 100644 test/kernels/mttkrp_gemm/mttkrp_ryan.c create mode 100644 test/kernels/mttkrp_gemm/mttkrp_ryan.h create mode 100644 test/kernels/mttkrp_gemm/taco_default.c create mode 100644 test/kernels/mttkrp_gemm/taco_default.h create mode 100644 test/kernels/sddmm_spmm/csr_dense_dense_sddmm.c create mode 100644 test/kernels/sddmm_spmm/csr_dense_dense_sddmm.h create mode 100755 test/kernels/sddmm_spmm/csr_dense_dense_sddmm.so create mode 100644 test/kernels/sddmm_spmm/csr_dense_spmm.c create mode 100644 test/kernels/sddmm_spmm/csr_dense_spmm.h create mode 100755 test/kernels/sddmm_spmm/csr_dense_spmm.so create mode 100644 test/kernels/sddmm_spmm/fused_kernel.c create mode 100644 test/kernels/sddmm_spmm/fused_kernel.h create mode 100755 test/kernels/sddmm_spmm/fused_kernel.so create mode 100644 test/kernels/sddmm_spmm/sddmm_ryan.c create mode 
100644 test/kernels/sddmm_spmm/sddmm_ryan.h create mode 100755 test/kernels/sddmm_spmm/sddmm_ryan.so create mode 100644 test/kernels/sddmm_spmm/taco_original.c create mode 100644 test/kernels/sddmm_spmm/taco_original.h create mode 100755 test/kernels/sddmm_spmm/taco_original.so create mode 100644 test/kernels/spmm_gemm/gemm_default.c create mode 100644 test/kernels/spmm_gemm/gemm_default.h create mode 100755 test/kernels/spmm_gemm/gemm_default.so create mode 100644 test/kernels/spmm_gemm/gemm_template.c create mode 100644 test/kernels/spmm_gemm/gemm_template.h create mode 100755 test/kernels/spmm_gemm/gemm_template.so create mode 100644 test/kernels/spmv_spmv/spmv_fused.c create mode 100644 test/kernels/spmv_spmv/spmv_fused.h create mode 100755 test/kernels/spmv_spmv/spmv_fused.so create mode 100644 test/kernels/spmv_spmv/spmv_spmv_default.c create mode 100644 test/kernels/spmv_spmv/spmv_spmv_default.h create mode 100644 test/kernels/ttm_ttm/fused copy.c create mode 100644 test/kernels/ttm_ttm/fused.c create mode 100644 test/kernels/ttm_ttm/fused.h create mode 100755 test/kernels/ttm_ttm/fused.so create mode 100644 test/kernels/ttm_ttm/gemm.c create mode 100644 test/kernels/ttm_ttm/gemm.h create mode 100644 test/kernels/ttm_ttm/ttm1_1.c create mode 100644 test/kernels/ttm_ttm/ttm1_1.h create mode 100755 test/kernels/ttm_ttm/ttm1_1.so create mode 100644 test/kernels/ttm_ttm/ttm1_2.c create mode 100644 test/kernels/ttm_ttm/ttm1_2.h create mode 100755 test/kernels/ttm_ttm/ttm1_2.so create mode 100644 test/kernels/ttm_ttm/ttm2.c create mode 100644 test/kernels/ttm_ttm/ttm2.h create mode 100755 test/kernels/ttm_ttm/ttm2.so create mode 100644 test/kernels/ttm_ttm/ttm_original copy 2.c create mode 100644 test/kernels/ttm_ttm/ttm_original copy.c create mode 100644 test/kernels/ttm_ttm/ttm_original.c create mode 100644 test/kernels/ttm_ttm/ttm_original.h create mode 100755 test/kernels/ttm_ttm/ttm_original.so create mode 100644 test/kernels/ttm_ttm/ttm_original2.c create 
mode 100644 test/kernels/ttm_ttm/ttm_original2.h create mode 100755 test/kernels/ttm_ttm/ttm_original2.so create mode 100644 test/stats/hadamard-gemm.txt create mode 100644 test/stats/mttkrp-spmm.txt create mode 100644 test/stats/sddmm-spmm-gemm.txt create mode 100644 test/stats/sddmm-spmm.txt create mode 100644 test/stats/spmm-spmm.txt create mode 100644 test/stats/spmv-spmv.txt create mode 100644 test/stats/ttm-ttm.txt create mode 100644 test/tests-scheduling-fuse.cpp create mode 100644 test/tests-scheduling-ispc-eval.cpp create mode 100644 test/util.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 7e9359e01..aff905db5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -12,7 +12,7 @@ project(taco option(CUDA "Build for NVIDIA GPU (CUDA must be preinstalled)" OFF) option(ISPC "Build for Intel ISPC Compiler (ISPC Compiler must be preinstalled)" OFF) option(PYTHON "Build TACO for python environment" OFF) -option(OPENMP "Build with OpenMP execution support" OFF) +option(OPENMP "Build with OpenMP execution support" ON) option(COVERAGE "Build with code coverage analysis" OFF) set(TACO_FEATURE_CUDA 0) set(TACO_FEATURE_ISPC 0) @@ -95,6 +95,39 @@ if(OPENMP) set(C_CXX_FLAGS "-fopenmp ${C_CXX_FLAGS}") endif(OPENMP) +set(PAPI_DIR "/home/min/a/kadhitha/workspace/my_taco/papi/src/install/") + +find_path(PAPI_DIR + NAMES include/papi.h +) + +find_library(PAPI_LIBRARIES + # Pick the static library first for easier run-time linking. 
+ NAMES libpapi.a papi + HINTS ${PAPI_DIR}/lib ${HILTIDEPS}/lib +) + +find_path(PAPI_INCLUDE_DIRS + NAMES papi.h + HINTS ${PAPI_DIR}/include ${HILTIDEPS}/include +) + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(PAPI DEFAULT_MSG + PAPI_LIBRARIES + PAPI_INCLUDE_DIRS +) + +mark_as_advanced( + PAPI_PREFIX_DIRS + PAPI_LIBRARIES + PAPI_INCLUDE_DIRS +) + +include_directories(${PAPI_INCLUDE_DIRS}) + +# project (ValgrindExample) + if(COVERAGE) find_program(PATH_TO_GCOVR gcovr REQUIRED) # add coverage tooling to build flags @@ -104,7 +137,8 @@ if(COVERAGE) message("-- Code coverage analysis (gcovr) enabled") endif(COVERAGE) -set(C_CXX_FLAGS "${C_CXX_FLAGS}") +set(C_CXX_FLAGS "${C_CXX_FLAGS} -I/${PAPI_DIR}/include -L/${PAPI_DIR}/lib") +# set(C_CXX_FLAGS "${C_CXX_FLAGS}") set(CMAKE_C_FLAGS "${C_CXX_FLAGS}") set(CMAKE_CXX_FLAGS "${C_CXX_FLAGS} -std=c++14") @@ -117,6 +151,9 @@ set(TACO_INCLUDE_DIR ${TACO_PROJECT_DIR}/include) enable_testing() include_directories(${TACO_INCLUDE_DIR}) +# include_directories("/home/min/a/kadhitha/workspace/my_taco/valgrind") +# project (ValgrindExample) +# include (CTest) set(TACO_LIBRARY_DIR ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}) diff --git a/include/taco/codegen/module.h b/include/taco/codegen/module.h index 3df7c8e0f..4db5fcdaf 100644 --- a/include/taco/codegen/module.h +++ b/include/taco/codegen/module.h @@ -17,7 +17,7 @@ class Module { public: /// Create a module for some target Module(Target target=getTargetFromEnvironment()) - : lib_handle(nullptr), moduleFromUserSource(false), target(target) { + : lib_handle(nullptr), so_lib_handle(nullptr), moduleFromUserSource(false), target(target) { setJITLibname(); setJITTmpdir(); } @@ -44,11 +44,16 @@ class Module { /// before calling. If there's no function of this name then a nullptr is /// returned. 
void* getFuncPtr(std::string name); + void* getFuncPtr(std::string& sofile, std::string name); /// Call a raw function in this module and return the result + int callFuncPackedRaw(std::string name, std::string& sofile, void** args); int callFuncPackedRaw(std::string name, void** args); /// Call a raw function in this module and return the result + int callFuncPackedRaw(std::string name, std::string& sofile, std::vector args) { + return callFuncPackedRaw(name, sofile, args.data()); + } int callFuncPackedRaw(std::string name, std::vector args) { return callFuncPackedRaw(name, args.data()); } @@ -57,6 +62,10 @@ class Module { int callFuncPacked(std::string name, void** args) { return callFuncPackedRaw("_shim_"+name, args); } + + int callFuncPacked(std::string name, std::string& sofile, void** args) { + return callFuncPackedRaw("_shim_"+name, sofile,args); + } /// Call a function using the taco_tensor_t interface and return the result int callFuncPacked(std::string name, std::vector args) { @@ -73,6 +82,7 @@ class Module { std::string libname; std::string tmpdir; void* lib_handle; + void* so_lib_handle; std::vector funcs; // true iff the module was created from user-provided source diff --git a/include/taco/index_notation/transformations.h b/include/taco/index_notation/transformations.h index 6bf277d5c..4d6ec6830 100644 --- a/include/taco/index_notation/transformations.h +++ b/include/taco/index_notation/transformations.h @@ -223,7 +223,8 @@ IndexStmt parallelizeOuterLoop(IndexStmt stmt); */ IndexStmt reorderLoopsTopologically(IndexStmt stmt); -IndexStmt justTraverseThroughTheIndexStmt(IndexStmt stmt); +IndexStmt loopFusionOverFission(IndexStmt stmt, Assignment assignment, + std::string side, int iters); /** * Performs scalar promotion so that reductions are done by accumulating into diff --git a/include/taco/taco_tensor_t.h b/include/taco/taco_tensor_t.h index 20d78bb51..f27acd9c7 100644 --- a/include/taco/taco_tensor_t.h +++ b/include/taco/taco_tensor_t.h @@ -6,6 
+6,7 @@ #ifndef TACO_TENSOR_T_DEFINED #define TACO_TENSOR_T_DEFINED +#include #include typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; diff --git a/include/taco/tensor.h b/include/taco/tensor.h index b91782256..883718fb6 100644 --- a/include/taco/tensor.h +++ b/include/taco/tensor.h @@ -413,6 +413,8 @@ class TensorBase { /// Compile the tensor expression. void compile(); + void compute(std::ofstream& statfile); + void compute(std::ofstream& statfile, std::string& sofile); void compile(IndexStmt stmt, bool assembleWhileCompute=false); diff --git a/src/codegen/codegen_c.cpp b/src/codegen/codegen_c.cpp index 2ade9d7f6..83da7aaab 100644 --- a/src/codegen/codegen_c.cpp +++ b/src/codegen/codegen_c.cpp @@ -34,6 +34,7 @@ const string cHeaders = "#include \n" "#include \n" "#include \n" + "#include \n" "#if _OPENMP\n" "#include \n" "#endif\n" @@ -240,7 +241,10 @@ class CodeGen_C::FindVars : public IRVisitor { }; CodeGen_C::CodeGen_C(std::ostream &dest, OutputKind outputKind, bool simplify) - : CodeGen(dest, false, simplify, C), out(dest), outputKind(outputKind) {} + : CodeGen(dest, false, simplify, C), out(dest), out2(dest), outputKind(outputKind) {} + +CodeGen_C::CodeGen_C(std::ostream &dest, std::ostream &dest2, OutputKind outputKind, bool simplify) + : CodeGen(dest, dest2, false, simplify, C), out(dest), out2(dest2), outputKind(outputKind) {} CodeGen_C::~CodeGen_C() {} @@ -299,14 +303,18 @@ void CodeGen_C::visit(const Function* func) { // Print variable declarations out << printDecls(varFinder.varDecls, func->inputs, func->outputs) << endl; + // out << "printf(\"declarations added\\n\");" << std::endl; if (emittingCoroutine) { out << printContextDeclAndInit(varMap, localVars, numYields, func->name) << endl; } + // out << "printf(\"declarations added2\\n\");" << std::endl; // output body print(func->body); + // out << "printf(\"function body added " << count++ << "\\n\"); // " << std::endl; + // output repack only if we allocated memory if 
(checkForAlloc(func)) @@ -403,6 +411,9 @@ static string getAtomicPragma() { // Docs for vectorization pragmas: // http://clang.llvm.org/docs/LanguageExtensions.html#extensions-for-loop-hint-optimizations void CodeGen_C::visit(const For* op) { + + // out << " printf(\"adding for loop " << count++ << "\\n\"); //" << std::endl; + switch (op->kind) { case LoopKind::Vectorized: doIndent(); @@ -452,6 +463,14 @@ void CodeGen_C::visit(const For* op) { } stream << ") {\n"; + // out << " printf(\"loop " << count++ << " : %d , dim: %d, %d\\n\","; + // op->var.accept(this); + // out << ", "; + // op->start.accept(this); + // out << ", "; + // op->end.accept(this); + // out << "); // " << count++ << std::endl; + op->contents.accept(this); doIndent(); stream << "}"; @@ -472,6 +491,7 @@ void CodeGen_C::visit(const While* op) { } void CodeGen_C::visit(const GetProperty* op) { + // std::cout << "GetProperty* " << op << std::endl; taco_iassert(varMap.count(op) > 0) << "Property " << Expr(op) << " of " << op->tensor << " not found in varMap"; out << varMap[op]; diff --git a/src/codegen/codegen_c.h b/src/codegen/codegen_c.h index 55c9d01a8..c8505a3bb 100644 --- a/src/codegen/codegen_c.h +++ b/src/codegen/codegen_c.h @@ -16,6 +16,7 @@ class CodeGen_C : public CodeGen { /// Initialize a code generator that generates code to an /// output stream. 
CodeGen_C(std::ostream &dest, OutputKind outputKind, bool simplify=true); + CodeGen_C(std::ostream &dest, std::ostream &dest2, OutputKind outputKind, bool simplify=true); ~CodeGen_C(); /// Compile a lowered function @@ -28,23 +29,25 @@ class CodeGen_C : public CodeGen { protected: using IRPrinter::visit; - void visit(const Function*); - void visit(const VarDecl*); - void visit(const Yield*); - void visit(const Var*); - void visit(const For*); - void visit(const While*); - void visit(const GetProperty*); - void visit(const Min*); - void visit(const Max*); - void visit(const Allocate*); - void visit(const Sqrt*); - void visit(const Store*); - void visit(const Assign*); + virtual void visit(const Function*); + virtual void visit(const VarDecl*); + virtual void visit(const Yield*); + virtual void visit(const Var*); + virtual void visit(const For*); + virtual void visit(const While*); + virtual void visit(const GetProperty*); + virtual void visit(const Min*); + virtual void visit(const Max*); + virtual void visit(const Allocate*); + virtual void visit(const Sqrt*); + virtual void visit(const Store*); + virtual void visit(const Assign*); std::map varMap; std::vector localVars; std::ostream &out; + std::ostream &out2; + int count = 0; OutputKind outputKind; diff --git a/src/codegen/codegen_ispc.cpp b/src/codegen/codegen_ispc.cpp index d35af1748..d4f428ccf 100644 --- a/src/codegen/codegen_ispc.cpp +++ b/src/codegen/codegen_ispc.cpp @@ -418,10 +418,10 @@ class CodeGen_ISPC::FunctionCollector : public IRVisitor { CodeGen_ISPC::CodeGen_ISPC(std::ostream &dest, OutputKind outputKind, bool simplify) - : CodeGen(dest, false, simplify, C), out(dest), out2(dest), outputKind(outputKind) {} + : CodeGen_C(dest, dest, outputKind, simplify) {} CodeGen_ISPC::CodeGen_ISPC(std::ostream &dest, std::ostream &dest2, OutputKind outputKind, bool simplify) - : CodeGen(dest, dest2, false, simplify, C), out(dest), out2(dest2), outputKind(outputKind) {} + : CodeGen_C(dest, dest2, outputKind, 
simplify) {} CodeGen_ISPC::~CodeGen_ISPC() {} @@ -543,7 +543,7 @@ void CodeGen_ISPC::sendToStream(std::stringstream &stream) { this->out2 << stream.str(); } else { - this->out << stream.str(); + CodeGen_C::sendToStream(stream); } } @@ -709,17 +709,7 @@ void CodeGen_ISPC::visit(const VarDecl* op) { } } else { - if (emittingCoroutine) { - doIndent(); - op->var.accept(this); - parentPrecedence = Precedence::TOP; - stream << " = "; - op->rhs.accept(this); - stream << ";"; - stream << endl; - } else { - IRPrinter::visit(op); - } + CodeGen_C::visit(op); } // sendToStream(stream); @@ -744,15 +734,7 @@ void CodeGen_ISPC::visit(const Var* op) { } } else { - taco_iassert(varMap.count(op) > 0) << - "Var " << op->name << " not found in varMap"; - if (emittingCoroutine) { - // out << "TACO_DEREF("; - } - out << varMap[op]; - if (emittingCoroutine) { - // out << ")"; - } + CodeGen_C::visit(op); } } @@ -804,7 +786,7 @@ static string getAtomicPragma() { // http://clang.llvm.org/docs/LanguageExtensions.html#extensions-for-loop-hint-optimizations void CodeGen_ISPC::visit(const For* op) { if (!is_ISPC_code_stream_enabled()) { - CodeGen::visit(op); + CodeGen_C::visit(op); return; } doIndent(); @@ -934,7 +916,7 @@ void CodeGen_ISPC::visit(const While* op) { out << "\n"; } - IRPrinter::visit(op); + CodeGen_C::visit(op); } void CodeGen_ISPC::visit(const GetProperty* op) { @@ -982,10 +964,11 @@ void CodeGen_ISPC::visit(const Max* op) { } void CodeGen_ISPC::visit(const Allocate* op) { - string elementType = printCType(op->var.type(), false); - doIndent(); + if (is_ISPC_code_stream_enabled()) { + string elementType = printCType(op->var.type(), false); + doIndent(); op->var.accept(this); stream2 << " = "; @@ -1015,33 +998,7 @@ void CodeGen_ISPC::visit(const Allocate* op) { } else { - - op->var.accept(this); - stream << " = ("; - stream << elementType << "*"; - stream << ")"; - if (op->is_realloc) { - stream << "realloc("; - op->var.accept(this); - stream << ", "; - } - else { - // If the 
allocation was requested to clear the allocated memory, - // use calloc instead of malloc. - if (op->clear) { - stream << "calloc(1, "; - } else { - stream << "malloc("; - } - } - stream << "sizeof(" << elementType << ")"; - stream << " * "; - parentPrecedence = MUL; - op->num_elements.accept(this); - parentPrecedence = TOP; - stream << ");"; - stream << endl; - + CodeGen_C::visit(op); } @@ -1110,15 +1067,14 @@ void CodeGen_ISPC::visit(const Assign* op) { stream2 << ";"; stream2 << endl; + IRPrinter::visit(op); } else { - if (op->use_atomics) { - doIndent(); - stream << getAtomicPragma() << endl; - } + CodeGen_C::visit(op); + } - IRPrinter::visit(op); + } void CodeGen_ISPC::visit(const Store* op) { @@ -1137,43 +1093,5 @@ void CodeGen_ISPC::visit(const Store* op) { IRPrinter::visit(op); } -void CodeGen_ISPC::generateShim(const Stmt& func, stringstream &ret) { - const Function *funcPtr = func.as(); - - ret << "int _shim_" << funcPtr->name << "(void** parameterPack) {\n"; - ret << " return " << funcPtr->name << "("; - - size_t i=0; - string delimiter = ""; - - const auto returnType = funcPtr->getReturnType(); - if (returnType.second != Datatype()) { - ret << "(void**)(parameterPack[0]), "; - ret << "(char*)(parameterPack[1]), "; - ret << "(" << returnType.second << "*)(parameterPack[2]), "; - ret << "(int32_t*)(parameterPack[3])"; - - i = 4; - delimiter = ", "; - } - - for (auto output : funcPtr->outputs) { - auto var = output.as(); - auto cast_type = var->is_tensor ? "taco_tensor_t*" - : printCType(var->type, var->is_ptr); - - ret << delimiter << "(" << cast_type << ")(parameterPack[" << i++ << "])"; - delimiter = ", "; - } - for (auto input : funcPtr->inputs) { - auto var = input.as(); - auto cast_type = var->is_tensor ? 
"taco_tensor_t*" - : printCType(var->type, var->is_ptr); - ret << delimiter << "(" << cast_type << ")(parameterPack[" << i++ << "])"; - delimiter = ", "; - } - ret << ");\n"; - ret << "}\n"; -} } } diff --git a/src/codegen/codegen_ispc.h b/src/codegen/codegen_ispc.h index 2e440abc0..62d2897ca 100644 --- a/src/codegen/codegen_ispc.h +++ b/src/codegen/codegen_ispc.h @@ -12,7 +12,7 @@ namespace taco { namespace ir { -class CodeGen_ISPC : public CodeGen { +class CodeGen_ISPC : public CodeGen_C { public: /// Initialize a code generator that generates code to an /// output stream. @@ -28,7 +28,7 @@ class CodeGen_ISPC : public CodeGen { static void generateShim(const Stmt& func, std::stringstream &stream); protected: - using IRPrinter::visit; + using CodeGen_C::visit; void visit(const Function*); void visit(const VarDecl*); @@ -50,19 +50,10 @@ class CodeGen_ISPC : public CodeGen { void printISPCFunc(const Function *func, std::map varMap, std::vector &sortedProps); - std::map varMap; - std::vector localVars; bool taskCode = false; - std::ostream &out; - std::ostream &out2; - - OutputKind outputKind; - std::string funcName; std::stringstream funcVariables; std::vector sortedProps; - int labelCount; - bool emittingCoroutine; class FindVars; class FunctionCollector; diff --git a/src/codegen/module.cpp b/src/codegen/module.cpp index 82b736a13..6f631d40e 100644 --- a/src/codegen/module.cpp +++ b/src/codegen/module.cpp @@ -4,6 +4,7 @@ #include #include #include +// #include #if USE_OPENMP #include #endif @@ -178,7 +179,7 @@ string Module::compile() { writeShims(funcs, tmpdir, libname); for (auto &statement : funcs) { std::cout << "----- statement --------" << std::endl; - std::cout << statement; + // std::cout << statement; std::cout << std::endl; } std::cout << tmpdir << std::endl << libname << std::endl; @@ -233,10 +234,61 @@ string Module::getSource() { return source.str(); } +void* Module::getFuncPtr(std::string& sofile, std::string name) { + std::cout << "opening shared 
object 1\n"; + if (so_lib_handle) { + dlclose(so_lib_handle); + } + std::cout << "opening shared object 2\n"; + so_lib_handle = dlopen(sofile.data(), RTLD_NOW | RTLD_LOCAL); + std::cout << "opening shared object : " << sofile << std::endl; + return dlsym(so_lib_handle, name.data()); +} + void* Module::getFuncPtr(std::string name) { return dlsym(lib_handle, name.data()); } +int Module::callFuncPackedRaw(std::string name, std::string& sofile, void** args) { + typedef int (*fnptr_t)(void**); + static_assert(sizeof(void*) == sizeof(fnptr_t), + "Unable to cast dlsym() returned void pointer to function pointer"); + void* v_func_ptr = getFuncPtr(sofile, name); + fnptr_t func_ptr; + *reinterpret_cast(&func_ptr) = v_func_ptr; + +#if USE_OPENMP + omp_sched_t existingSched; + ParallelSchedule tacoSched; + int existingChunkSize, tacoChunkSize; + int existingNumThreads = omp_get_max_threads(); + omp_get_schedule(&existingSched, &existingChunkSize); + taco_get_parallel_schedule(&tacoSched, &tacoChunkSize); + switch (tacoSched) { + case ParallelSchedule::Static: + omp_set_schedule(omp_sched_static, tacoChunkSize); + break; + case ParallelSchedule::Dynamic: + omp_set_schedule(omp_sched_dynamic, tacoChunkSize); + break; + default: + break; + } + omp_set_num_threads(taco_get_num_threads()); +#endif + + std::cout << "calling the function\n"; + int ret = func_ptr(args); + std::cout << "function call completed\n"; + +#if USE_OPENMP + omp_set_schedule(existingSched, existingChunkSize); + omp_set_num_threads(existingNumThreads); +#endif + + return ret; +} + int Module::callFuncPackedRaw(std::string name, void** args) { typedef int (*fnptr_t)(void**); static_assert(sizeof(void*) == sizeof(fnptr_t), @@ -265,7 +317,13 @@ int Module::callFuncPackedRaw(std::string name, void** args) { omp_set_num_threads(taco_get_num_threads()); #endif + std::cout << "calling the function\n"; + // CALLGRIND_START_INSTRUMENTATION; + // CALLGRIND_TOGGLE_COLLECT; int ret = func_ptr(args); + // 
CALLGRIND_TOGGLE_COLLECT; + // CALLGRIND_STOP_INSTRUMENTATION; + std::cout << "function call completed\n"; #if USE_OPENMP omp_set_schedule(existingSched, existingChunkSize); diff --git a/src/index_notation/index_notation.cpp b/src/index_notation/index_notation.cpp index 51fb8770c..2e26460c7 100644 --- a/src/index_notation/index_notation.cpp +++ b/src/index_notation/index_notation.cpp @@ -2438,6 +2438,7 @@ bool isConcreteNotation(IndexStmt stmt, std::string* reason) { return isConcrete; } +// make reduction notation Assignment makeReductionNotation(Assignment assignment) { IndexExpr expr = assignment.getRhs(); std::vector free = assignment.getLhs().getIndexVars(); @@ -2513,7 +2514,10 @@ IndexStmt makeReductionNotation(IndexStmt stmt) { return makeReductionNotation(to(stmt)); } +// make concrete notation IndexStmt makeConcreteNotation(IndexStmt stmt) { + // std::cout << "concrete notation original assignment: " << stmt << std::endl; + std::string reason; taco_iassert(isReductionNotation(stmt, &reason)) << "Not reduction notation: " << stmt << std::endl << reason; @@ -2521,6 +2525,7 @@ IndexStmt makeConcreteNotation(IndexStmt stmt) { // Free variables and reductions covering the whole rhs become top level loops vector freeVars = to(stmt).getFreeVars(); + std::cout << "free vars: " << freeVars << std::endl; struct RemoveTopLevelReductions : IndexNotationRewriter { using IndexNotationRewriter::visit; @@ -2535,12 +2540,17 @@ IndexStmt makeConcreteNotation(IndexStmt stmt) { topLevelReductions.push_back(reduction.getVar()); rhs = reduction.getExpr(); } + // std::cout << "top level reductions: " << topLevelReductions << std::endl; if (rhs != node->rhs) { - stmt = Assignment(node->lhs, rhs, Add()); + stmt = Assignment(node->lhs, rhs, Add()); // write with add + int idx = 0; for (auto& i : util::reverse(topLevelReductions)) { + std::cout << idx << ": " << stmt << std::endl; + idx++; stmt = forall(i, stmt); } + std::cout << idx << ": " << stmt << std::endl; } else { stmt = 
node; @@ -2548,11 +2558,18 @@ IndexStmt makeConcreteNotation(IndexStmt stmt) { } }; stmt = RemoveTopLevelReductions().rewrite(stmt); + // std::cout << "after remove top level reductions: " << stmt << std::endl; + // now we form the stmt in reverse order of freeVars + int idx = 0; for (auto& i : util::reverse(freeVars)) { + std::cout << idx << ": " << stmt << std::endl; stmt = forall(i, stmt); + idx++; } + std::cout << idx << ": " << stmt << std::endl; + std::cout << "replacing reductions with whereas statements\n"; // Replace other reductions with where and forall statements struct ReplaceReductionsWithWheres : IndexNotationRewriter { using IndexNotationRewriter::visit; diff --git a/src/index_notation/transformations.cpp b/src/index_notation/transformations.cpp index 011779caf..c1d82a9fd 100644 --- a/src/index_notation/transformations.cpp +++ b/src/index_notation/transformations.cpp @@ -1,11 +1,16 @@ #include "taco/index_notation/transformations.h" +#include "lower/iteration_graph.h" +#include "lower/tensor_path.h" #include "taco/cuda.h" #include "taco/index_notation/index_notation.h" +#include "taco/index_notation/index_notation_nodes_abstract.h" #include "taco/index_notation/index_notation_rewriter.h" #include "taco/index_notation/index_notation_nodes.h" #include "taco/index_notation/index_notation_printer.h" #include "taco/error/error_messages.h" +#include "taco/index_notation/intrinsic.h" +#include "taco/type.h" #include "taco/util/collections.h" #include "taco/lower/iterator.h" #include "taco/lower/merge_lattice.h" @@ -307,6 +312,7 @@ IndexStmt Precompute::apply(IndexStmt stmt, std::string* reason) const { IndexExpr e = precompute.getExpr(); IndexVar iw = precompute.getiw(); + // these lines of code looks interesting when creating the producer consumer relationship IndexStmt consumer = forall(i, replace(s, {{e, ws(i)}})); IndexStmt producer = forall(iw, Assignment(ws(iw), replace(e, {{i,iw}}), assign.getOperator())); @@ -595,7 +601,7 @@ IndexStmt 
Parallelize::apply(IndexStmt stmt, std::string* reason) const { IndexStmt rewriteParallel(IndexStmt stmt) { std::cout << "1 rewriting IndexStmt to support parallelize schedule directive\n--------------------------------------------\n"; - std::cout << stmt << std::endl; + // std::cout << stmt << std::endl; provGraph = ProvenanceGraph(stmt); std::cout << "2 rewriting IndexStmt to support parallelize schedule directive\n--------------------------------------------\n"; @@ -618,7 +624,7 @@ IndexStmt Parallelize::apply(IndexStmt stmt, std::string* reason) const { } std::cout << "4 rewriting IndexStmt to support parallelize schedule directive\n--------------------------------------------\n"; - std::cout << stmt << std::endl; + // std::cout << stmt << std::endl; return rewrite(stmt); } @@ -1306,6 +1312,7 @@ static vector topologicallySort(map> hardDeps, map> softDeps, vector originalOrder) { + std::cout << "originalOrder: " << std::endl; vector sortedVars; unsigned long countVars = originalOrder.size(); while (sortedVars.size() < countVars) { @@ -1327,6 +1334,9 @@ topologicallySort(map> hardDeps, } // No free var found there is a cycle + std::cout << "this is where the assert fails\n"; + std::cout << "freeVarPos: " << freeVarPos << std::endl; + std::cout << "limit: " << std::numeric_limits::max() << std::endl; taco_iassert(freeVarPos != std::numeric_limits::max()) << "Cycles in iteration graphs must be resolved, through transpose, " << "before the expression is passed to the topological sorting " @@ -1352,19 +1362,668 @@ topologicallySort(map> hardDeps, return sortedVars; } -IndexStmt justTraverseThroughTheIndexStmt(IndexStmt stmt) { - struct IndexStatementTraverse : public IndexNotationPrinter { - IndexStatementTraverse(std::ostream& os) : IndexNotationPrinter(os) {}; - using IndexNotationPrinter::visit; +bool checkFromBack(const TensorPath& resultTensorPath, + const vector& tensorPaths, + string& removedAccessNode, + vector& producerVars, + vector& consumerVars, + 
vector& modifiedResultIndexesAccessed, + vector& sortedAllIndexes) { + + std::cout << "check from back function execution\n"; + + const std::vector& resultIndexesVisited = resultTensorPath.getVariables(); + IndexVar lastVisitedIndexVar = resultIndexesVisited.back(); + + std::cout << "last visited index variable: " << lastVisitedIndexVar << std::endl; + + bool onlyLastTensorContainLastIndexOfOutput = true; + bool fissionFromBack = false; + + // check from the back + for (unsigned long i=0; i& indexesVisited = otherIndexPaths.getVariables(); + cout << "index paths: " << otherIndexPaths << endl; + + // if (i < tensorPaths.size()-1) { + // check if other tensors also contain last index of output tensor + for (auto index : indexesVisited) { + cout << "checking " << index << " " << lastVisitedIndexVar << endl; + if (index == lastVisitedIndexVar) { + onlyLastTensorContainLastIndexOfOutput = false; + } + } + // } + } + + if (onlyLastTensorContainLastIndexOfOutput) { // last accessed tensorVariable + const TensorPath& otherIndexPaths = tensorPaths.back(); + const vector& indexesVisited = otherIndexPaths.getVariables(); + cout << "index paths: " << otherIndexPaths << endl; + + cout << "index variable maybe removed from the back\n"; + auto lastTensorLastVisited = indexesVisited.back(); + cout << "last index last visited " << lastTensorLastVisited << endl; + + if (lastTensorLastVisited == lastVisitedIndexVar) { + cout << "we can diffuse from the back\n"; + fissionFromBack = true; + removedAccessNode = otherIndexPaths.getAccess().getTensorVar().getName(); + cout << "removed access node " << removedAccessNode << endl; + + // mark producer accessed index variables + for (auto indexVar : sortedAllIndexes) { + if (indexVar != lastVisitedIndexVar) { // add everything except the last accessed index + std::cout << "producer vars: " << indexVar << std::endl; + producerVars.push_back(indexVar); + } + } + + for (auto indexVar : sortedAllIndexes) { + if (indexVar != lastVisitedIndexVar) { 
+ if ( + find(resultIndexesVisited.begin(), resultIndexesVisited.end(), indexVar) + != resultIndexesVisited.end() || + find(indexesVisited.begin(), indexesVisited.end(), indexVar) + != indexesVisited.end() + ) { + modifiedResultIndexesAccessed.push_back(indexVar); + } + } + } + + // // get modified index for the intermediate calculated tensor expression + // for (unsigned long j=0; j& tensorPaths, + string& removedAccessNode, + vector& producerVars, + vector& consumerVars, + vector& modifiedResultIndexesAccessed, + vector& sortedAllIndexes) { + + std::cout << "check from front function execution\n"; + + const std::vector& resultIndexesVisited = resultTensorPath.getVariables(); + IndexVar firstVisitedIndexVar = resultIndexesVisited.front(); + + std::cout << "first fisited index variable: " << firstVisitedIndexVar << std::endl; + std::cout << "tensor path size: " << tensorPaths.size() << std::endl; + + bool onlyFirstTensorContainFirstIndexOfOutput = true; + bool fissionFromFront = false; + + // check from the front + for (long i=tensorPaths.size()-1; i>0; i--) { // change tensor paths to recursively use the functionality + std::cout << "i: " << i << std::endl; + const TensorPath& otherIndexPaths = tensorPaths.at(i); + const vector& indexesVisited = otherIndexPaths.getVariables(); + cout << "index paths: " << otherIndexPaths << endl; + + if (i != 0) { // check if other tensors also contain last index of output tensor + for (auto index : indexesVisited) { + cout << "checking " << index << " " << firstVisitedIndexVar << endl; + if (index == firstVisitedIndexVar) { + onlyFirstTensorContainFirstIndexOfOutput = false; + } + } + } + } + + + if (onlyFirstTensorContainFirstIndexOfOutput) { // last accessed tensorVariable + const TensorPath& otherIndexPaths = tensorPaths.front(); + const vector& indexesVisited = otherIndexPaths.getVariables(); + cout << "index paths: " << otherIndexPaths << endl; + + cout << "index variable maybe removed from the front\n"; + auto 
firstTensorFirstVisited = indexesVisited.front(); + cout << "first index first visited " << firstTensorFirstVisited << endl; + + if (firstTensorFirstVisited == firstVisitedIndexVar) { + cout << "we can diffuse from the front\n"; + fissionFromFront = true; + removedAccessNode = otherIndexPaths.getAccess().getTensorVar().getName(); + cout << "removed access node " << removedAccessNode << endl; + + // mark producer accessed index variables + for (auto indexVar : sortedAllIndexes) { + if (indexVar != firstVisitedIndexVar) { // add everything except the first accessed index + producerVars.emplace_back(indexVar); + } + } + + for (auto indexVar : sortedAllIndexes) { + if (indexVar != firstVisitedIndexVar) { + if ( + find(resultIndexesVisited.begin(), resultIndexesVisited.end(), indexVar) + != resultIndexesVisited.end() || + find(indexesVisited.begin(), indexesVisited.end(), indexVar) + != indexesVisited.end() + ) { + modifiedResultIndexesAccessed.push_back(indexVar); + } + } + } + + std::cout << "printing modifiedResultIndexesAccessed\n"; + for (auto& idx : modifiedResultIndexesAccessed) { + std::cout << "modifiedResultIndexesAccessed: " << idx << std::endl; + } + std::cout << "printed modifiedResultIndexesAccessed\n"; + + // get modified index for the intermediate calculated tensor expression + // for (unsigned long j=0; j forallParallelUnit; map forallOutputRaceStrategy; + vector sortedIndexes; + Assignment innerBody; + + SortedIndexVars() {}; + + void visit(const ForallNode* node) { + Forall forallNode(node); + IndexVar i = forallNode.getIndexVar(); + std::cout << forallNode << std::endl; + + sortedIndexes.push_back(i); + forallParallelUnit[i] = forallNode.getParallelUnit(); + forallOutputRaceStrategy[i] = forallNode.getOutputRaceStrategy(); + + if (isa(forallNode.getStmt())) { + cout << "assignment node found: " << forallNode.getStmt() << endl;; + innerBody = to(forallNode.getStmt()); + return; // Only reorder first contiguous section of ForAlls + } + + 
IndexNotationVisitor::visit(node); + } }; std::cout << "traversing through the index statement\n"; - IndexNotationPrinter printer(std::cout); + SortedIndexVars sortedIndexVars; + stmt.accept(&sortedIndexVars); std::cout << std::endl; - stmt.accept(&printer); - return stmt; + + struct IndexExprBuilder : public IndexNotationVisitor { + + using IndexNotationVisitor::visit; + vector accessLeftToRight; + map>> indexDimensionsMap; + + void visit(const AccessNode* node) { + Access accessNode(node); + std::cout << "access node: " << accessNode << std::endl; + accessLeftToRight.push_back(accessNode); + + TensorVar tensorVar = accessNode.getTensorVar(); + + for (unsigned long i=0; i < accessNode.getIndexVars().size(); i++) { + auto var = accessNode.getIndexVars()[i]; + + if (indexDimensionsMap.find(var) != indexDimensionsMap.end()) { + indexDimensionsMap[var].emplace_back( + pair(tensorVar.getType().getShape().getDimension(i), + tensorVar.getType())); + } + else { + indexDimensionsMap[var] = { + pair( + tensorVar.getType().getShape().getDimension(i), + tensorVar.getType()) + }; + } + } + + } + + }; + + IndexExpr rhsExpr = assignment.getRhs(); + Access lhsAccess = to(assignment.getLhs()); + std::cout << "right hand side expression: " << rhsExpr << std::endl; + IndexExprBuilder indexExprBuilder; + rhsExpr.accept(&indexExprBuilder); + TensorVar resultVar = lhsAccess.getTensorVar(); + + for (auto item : indexExprBuilder.indexDimensionsMap) { + auto indexVar = item.first; + cout << "var: " << indexVar << " "; + for (auto elem : item.second) { + cout << elem.first << " " << elem.second << " " ; + } + cout << endl; + } + + + // now I have the iteration graph + IterationGraph iterationGraph = IterationGraph::make(assignment); + std::cout << "/*******************************************/\n"; + std::cout << "/********** ITERATION GRAPH ****************/\n"; + std::cout << "/*******************************************/\n"; + std::cout << iterationGraph << std::endl; + + const 
TensorPath& resultTensorPath = iterationGraph.getResultTensorPath(); + const std::vector& tensorPaths = iterationGraph.getTensorPaths(); + + + string removedAccessNode; + vector producerVars; // producer accessed index variables + vector consumerVars; // consumer accessed index variables + vector fusedVars; + vector modifiedResultIndexesAccessed; + bool fissionFromBack = false; + if (side == "b") { + fissionFromBack = true; + } + + if (fissionFromBack) { + fissionFromBack = checkFromBack(resultTensorPath, tensorPaths, + removedAccessNode, producerVars, consumerVars, + modifiedResultIndexesAccessed, sortedIndexVars.sortedIndexes + ); + } + + bool fissionFromFront = false; + if (side == "f") { + fissionFromFront = true; + } + if (fissionFromBack == false && fissionFromFront) { + fissionFromFront = checkFromFront(resultTensorPath, tensorPaths, + removedAccessNode, producerVars, consumerVars, + modifiedResultIndexesAccessed, sortedIndexVars.sortedIndexes + ); + } + + if (!fissionFromBack && !fissionFromFront) { + cout << "fission operation cannot be performed from the back\n"; + return stmt; + } + + vector newAccessDims{}; + for (auto var : modifiedResultIndexesAccessed) { + auto item = indexExprBuilder.indexDimensionsMap[var]; + cout << "shared vars: " << var << endl; + newAccessDims.emplace_back(item[0].first); + } + TensorVar newAccessVar(resultVar.getName() + "_inner", + Type(resultVar.getType().getDataType(), newAccessDims)); + cout << "new inner assignment statement: " << modifiedResultIndexesAccessed << std::endl; + Access newResultAccess(newAccessVar, modifiedResultIndexesAccessed); + cout << "new access variable for iterative apply: " << newResultAccess << std::endl; + + if (fissionFromBack) { + std::cout << "fission from the back is possible\n"; + } + if (fissionFromFront) { + std::cout << "fission from the front is possible\n"; + } + + // // check from the front + // struct IndexExprSeparator : public IndexNotationVisitor { + + // using 
IndexNotationVisitor::visit; + // vector accessLeftToRight; + + // void visit(const MulNode* node) { + // Mul mulNode(node); + // IndexExpr lhs = mulNode.getA(); + // IndexExpr rhs = mulNode.getB(); + // std::cout << "access node: " << accessNode << std::endl; + // accessLeftToRight.push_back(accessNode); + // } + + // }; + + + cout << "\n\nProducer accessed index variables\n"; + auto it = producerVars.begin(); + for (; it != producerVars.end(); it++) { + cout << *it << endl; + } + cout << "\n\nConsumer accessed index variables\n"; + it = consumerVars.begin(); + for (; it != consumerVars.end(); it++) { + cout << *it << endl; + } + cout << endl << endl; + + // check common vars that can be fused + for (auto var : sortedIndexVars.sortedIndexes) { + if (find(producerVars.begin(), producerVars.end(), var) != producerVars.end() && + find(consumerVars.begin(), consumerVars.end(), var) != consumerVars.end()) { + fusedVars.emplace_back(var); + } + else { + break; + } + } + + for (auto& fv : fusedVars) { + std::cout << "fusable vars: " << fv << std::endl; + } + + vector sharedVars; + for (auto var : sortedIndexVars.sortedIndexes) { + if (find(fusedVars.begin(), fusedVars.end(), var) == fusedVars.end() && + find(producerVars.begin(), producerVars.end(), var) != producerVars.end() && + find(consumerVars.begin(), consumerVars.end(), var) != consumerVars.end() + ) { + sharedVars.emplace_back(var); + } + } + + for (auto& sv : sharedVars) { + std::cout << "shared vars: " << sv << std::endl; + } + + vector sharedDims{}; + for (auto var : sharedVars) { + auto item = indexExprBuilder.indexDimensionsMap[var]; + cout << "shared vars: " << var << endl; + sharedDims.emplace_back(item[0].first); + } + + + // get removing tensorvars and workspace dimension + const Type& type = resultTensorPath.getAccess().getTensorVar().getType(); + const Format& format = resultTensorPath.getAccess().getTensorVar().getFormat(); + TensorVar intermediateTensor("ws", type, format); + cout << 
intermediateTensor << endl; + + // TensorVar A("A", Type(), taco::dense); + TensorVar tempVar("t" + resultVar.getName(), + Type(resultVar.getType().getDataType(), sharedDims)); + cout << "tensor order: " << tempVar.getOrder() << endl; + cout << "tensor format: " << tempVar.getFormat() << endl; + cout << "format order: " << tempVar.getFormat().getOrder() << endl; + + // TensorVar* a = new TensorVar("A", Type()); + // TensorVar ws("ws", Type(type(), {jdim}) ); + + // get removing indexExpr and the rest of the indexExpr + Access workspace(tempVar, sharedVars); + std::cout << "workspace access tensor: " << workspace << std::endl; + + + + // construct producer expression right hand side + cout << "generating consumer expression\n"; + IndexExpr producerExpr; + int num_muls = 0; + for (Access accessNode : indexExprBuilder.accessLeftToRight) { + std::cout << "accessNodes: " << accessNode << endl; + if (removedAccessNode != accessNode.getTensorVar().getName()) { + if (producerExpr == NULL) { + std::cout << "index expression is null"; + producerExpr = accessNode; + std::cout << "producerExpr: " << producerExpr << std::endl; + } else { + num_muls++; + producerExpr = producerExpr * accessNode; + std::cout << "producerExpr: " << producerExpr << std::endl; + } + } + } + std::cout << producerExpr << std::endl; + Assignment producerAssignment(newResultAccess, + producerExpr); + std::cout << "new inner assignment statement: " << producerAssignment << std::endl; + Assignment producerInnerBody(workspace, + producerExpr, + sortedIndexVars.innerBody.getOperator() + ); + std::cout << "producerInnerBody: " << producerInnerBody << std::endl; + + // construct consumer expression right hand side + IndexExpr consumerExpr; + if (fissionFromBack) { + consumerExpr = workspace; + } + cout << "generating consumer expression: " << consumerExpr << std::endl; + for (Access accessNode : indexExprBuilder.accessLeftToRight) { + TensorVar tv = accessNode.getTensorVar(); + std::cout << "accessNodes: " << 
accessNode << endl; + if (removedAccessNode == accessNode.getTensorVar().getName()) { + if (consumerExpr == NULL) { + std::cout << "index expression is null"; + consumerExpr = accessNode; + std::cout << "consumerExpr: " << consumerExpr << std::endl; + } else { + consumerExpr = consumerExpr * accessNode; + std::cout << "consumerExpr: " << consumerExpr << std::endl; + } + } + } + if (fissionFromFront) { + consumerExpr = consumerExpr * workspace; + } + Assignment consumerInnerBody(lhsAccess, + consumerExpr, + sortedIndexVars.innerBody.getOperator() + ); + + cout << "Producer inner body: " << producerInnerBody << endl; + cout << "Consumer inner body: " << consumerInnerBody << endl; + + // rewrite indexstmt + // Reorder Foralls use a rewriter in case new nodes introduced outside of Forall + struct ProducerConsumerRewriter : public IndexNotationRewriter { + using IndexNotationRewriter::visit; + + const vector& producerConsumerVars; + const vector& fusedVars; + IndexStmt innerBody; + const map forallParallelUnit; + const map forallOutputRaceStrategy; + + ProducerConsumerRewriter(const vector& producerConsumerVars, + const vector& fusedVars, IndexStmt innerBody, + const map forallParallelUnit, + const map forallOutputRaceStrategy) + : producerConsumerVars(producerConsumerVars), fusedVars(fusedVars), innerBody(innerBody), + forallParallelUnit(forallParallelUnit), forallOutputRaceStrategy(forallOutputRaceStrategy) { + } + + void visit(const ForallNode* node) { + Forall foralli(node); + IndexVar i = foralli.getIndexVar(); + cout << "going through var: " << i << endl; + + // first forall must be in collected variables + // taco_iassert(util::contains(producerVars, i)); + // std::cout << "\ninner body of the statement\n" << innerBody; + // // done in reverse order? 
+ // for (auto it = sortedVars.rbegin(); it != sortedVars.rend(); ++it) { + // stmt = forall(*it, stmt, forallParallelUnit.at(*it), forallOutputRaceStrategy.at(*it), foralli.getUnrollFactor()); + // } + stmt = rewrite(foralli.getStmt()); + cout << "after rewrite statement: " << stmt << endl; + + // omit the index variables in the fusedVar list + if (find(fusedVars.begin(), fusedVars.end(), i) == fusedVars.end() && + find(producerConsumerVars.begin(), producerConsumerVars.end(), i) != producerConsumerVars.end()) { + stmt = forall(i, stmt, forallParallelUnit.at(i), forallOutputRaceStrategy.at(i), foralli.getUnrollFactor()); + } + } + + void visit (const AssignmentNode* node) { + cout << "assignment node: " << node << endl; + stmt = innerBody; + cout << "producerStmt: " << innerBody << endl; + cout << "stmt: " << stmt << endl; + } + + }; + ProducerConsumerRewriter producerRewriter(producerVars, fusedVars, + producerInnerBody, + sortedIndexVars.forallParallelUnit, + sortedIndexVars.forallOutputRaceStrategy); + IndexStmt producerStmt = producerRewriter.rewrite(stmt); + std::cout << "\nAfter Producer rewriter\n"; + std::cout << producerStmt << std::endl; + if (num_muls > 1) { + producerStmt = loopFusionOverFission(producerStmt, producerInnerBody, + side, iters-1); + } + + + ProducerConsumerRewriter consumerRewriter(consumerVars, fusedVars, + consumerInnerBody, + sortedIndexVars.forallParallelUnit, + sortedIndexVars.forallOutputRaceStrategy); + IndexStmt consumerStmt = consumerRewriter.rewrite(stmt); + std::cout << "\nAfter Consumer rewriter\n"; + std::cout << consumerStmt << std::endl; + + + struct CombineProducerConsumerRewriter : public IndexNotationRewriter { + + const vector& fusedVars; + IndexStmt consumerStmt; + IndexStmt producerStmt; + const map forallParallelUnit; + const map forallOutputRaceStrategy; + + CombineProducerConsumerRewriter(const vector& fusedVars, + IndexStmt producerStmt, IndexStmt consumerStmt, + const map forallParallelUnit, + const map 
forallOutputRaceStrategy) + : fusedVars(fusedVars), consumerStmt(consumerStmt), producerStmt(producerStmt), + forallParallelUnit(forallParallelUnit), + forallOutputRaceStrategy(forallOutputRaceStrategy) {} + + using IndexNotationRewriter::visit; + + void visit(const ForallNode* node) { + Forall foralli(node); + IndexVar i = foralli.getIndexVar(); + cout << "going through var: " << i << endl; + + // omit the index variables in the fusedVar list + if (find(fusedVars.begin(), fusedVars.end(), i) != fusedVars.end()) { + cout << "fused var in stmt\n"; + stmt = rewrite(foralli.getStmt()); + cout << "rewritten stmt: " << stmt << endl; + stmt = forall(i, stmt, forallParallelUnit.at(i), forallOutputRaceStrategy.at(i), foralli.getUnrollFactor()); + } + else { + cout << "fused var not in stmt\n"; + cout << "producerStmt: " << producerStmt << endl; + cout << "consumerStmt: " << consumerStmt << endl; + stmt = where(consumerStmt, producerStmt); + cout << "where stmt: " << stmt << endl; + } + + cout << "after rewrite statement: " << stmt << endl; + } + + }; + + CombineProducerConsumerRewriter combineRewriter(fusedVars, + producerStmt, consumerStmt, + sortedIndexVars.forallParallelUnit, + sortedIndexVars.forallOutputRaceStrategy); + IndexStmt combinedStmt = combineRewriter.rewrite(stmt); + std::cout << "\nAfter Combine rewriter\n"; + std::cout << combinedStmt << std::endl; + + + return combinedStmt; } @@ -1431,6 +2090,7 @@ IndexStmt reorderLoopsTopologically(IndexStmt stmt) { }; Iterators iterators(stmt); + std::cout << "DAG builder with iterators" << std::endl; DAGBuilder dagBuilder(iterators); stmt.accept(&dagBuilder); std::cout << "After DAGBuilder\n"; @@ -1442,6 +2102,7 @@ IndexStmt reorderLoopsTopologically(IndexStmt stmt) { tensorVarOrders[tensorLevelVar.first] = varOrderFromTensorLevels(tensorLevelVar.second); } + // hard dependencies const auto hardDeps = depsFromVarOrders(tensorVarOrders); struct CollectSoftDependencies : public IndexNotationVisitor { @@ -1463,14 +2124,17 
@@ IndexStmt reorderLoopsTopologically(IndexStmt stmt) { } } }; + // soft dependencies CollectSoftDependencies collectSoftDeps; stmt.accept(&collectSoftDeps); std::cout << "After CollectSoftDependencies\n"; std::cout << stmt << std::endl; + // topological sort const auto sortedVars = topologicallySort(hardDeps, collectSoftDeps.softDeps, dagBuilder.indexVarOriginalOrder); + // rewrite indexstmt // Reorder Foralls use a rewriter in case new nodes introduced outside of Forall struct TopoReorderRewriter : public IndexNotationRewriter { using IndexNotationRewriter::visit; @@ -1493,7 +2157,9 @@ IndexStmt reorderLoopsTopologically(IndexStmt stmt) { // first forall must be in collected variables taco_iassert(util::contains(sortedVars, i)); + std::cout << "\ninner body of the statement\n" << innerBody; stmt = innerBody; + // done in reverse order? for (auto it = sortedVars.rbegin(); it != sortedVars.rend(); ++it) { stmt = forall(*it, stmt, forallParallelUnit.at(*it), forallOutputRaceStrategy.at(*it), foralli.getUnrollFactor()); } diff --git a/src/ir/ir_printer.cpp b/src/ir/ir_printer.cpp index fa224bde4..eddca3f29 100644 --- a/src/ir/ir_printer.cpp +++ b/src/ir/ir_printer.cpp @@ -550,7 +550,7 @@ void IRPrinter::visit(const Store* op) { } void IRPrinter::visit(const For* op) { - std::cout << "This is IRPrinter::visit For op method\n"; + // std::cout << "This is IRPrinter::visit For op method\n"; if (is_ISPC_code_stream_enabled()) { doIndent(); stream2 << keywordString("for") << " (" diff --git a/src/ir/ir_rewriter.cpp b/src/ir/ir_rewriter.cpp index fdadf530e..2e4827497 100644 --- a/src/ir/ir_rewriter.cpp +++ b/src/ir/ir_rewriter.cpp @@ -292,7 +292,7 @@ void IRRewriter::visit(const Store* op) { } void IRRewriter::visit(const For* op) { - std::cout << "This is IRRewriter::visit(const For* op) method: For: " << op << std::endl; + // std::cout << "This is IRRewriter::visit(const For* op) method: For: " << op << std::endl; Expr var = rewrite(op->var); Expr start = 
rewrite(op->start); Expr end = rewrite(op->end); diff --git a/src/lower/iteration_graph.cpp b/src/lower/iteration_graph.cpp index 77735a8d2..482d84aae 100644 --- a/src/lower/iteration_graph.cpp +++ b/src/lower/iteration_graph.cpp @@ -48,6 +48,8 @@ struct IterationGraph::Content { IterationGraph::IterationGraph() { } +// remember that iteration graph does not have an ordering +// I got the ordering from topologically reorder index Ryan wrote IterationGraph IterationGraph::make(Assignment assignment) { TensorVar tensor = assignment.getLhs().getTensorVar(); IndexExpr expr = assignment.getRhs(); @@ -64,8 +66,16 @@ IterationGraph IterationGraph::make(Assignment assignment) { oldToSplitVar.insert({indexVar, indexVar}); } + // access nodes of right hand side match(expr, function([&](const AccessNode* op) { + std::cout << "access node: " << op->tensorVar << " <- " << IndexExpr(op) << std::endl; + std::cout << "index var: "; + for (auto indexVar : op->indexVars) { + std::cout << indexVar << " "; + } + std::cout << std::endl; + auto type = op->tensorVar.getType(); taco_iassert((size_t)type.getShape().getOrder() == op->indexVars.size()) << "Tensor access " << IndexExpr(op) << " but tensor format only has " diff --git a/src/lower/iterator.cpp b/src/lower/iterator.cpp index 0f0c024c5..eb3d8ac3b 100644 --- a/src/lower/iterator.cpp +++ b/src/lower/iterator.cpp @@ -569,6 +569,9 @@ void Iterators::createAccessIterators(Access access, Format format, Expr tensorI ProvenanceGraph provGraph, const map &tensorVars) { TensorVar tensorConcrete = access.getTensorVar(); + cout << "tensor: " << tensorConcrete << " " ; + cout << "tensorConcrete order: " << tensorConcrete.getOrder(); + cout << ", format order: " << format.getOrder() << endl; taco_iassert(tensorConcrete.getOrder() == format.getOrder()) << tensorConcrete << ", Format" << format; Shape shape = tensorConcrete.getType().getShape(); diff --git a/src/lower/lowerer_impl_imperative.cpp b/src/lower/lowerer_impl_imperative.cpp index 
28bd6c7c2..1355c80a1 100644 --- a/src/lower/lowerer_impl_imperative.cpp +++ b/src/lower/lowerer_impl_imperative.cpp @@ -28,7 +28,7 @@ class LowererImplImperative::Visitor : public IndexNotationVisitorStrict { public: Visitor(LowererImplImperative* impl) : impl(impl) {} Stmt lower(IndexStmt stmt) { - std::cout << "lowering IndexStmt to ir:Stmt - IndexStmt: " << stmt << std::endl; + // std::cout << "lowering IndexStmt to ir:Stmt - IndexStmt: " << stmt << std::endl; this->stmt = Stmt(); impl->accessibleIterators.scope(); IndexStmtVisitorStrict::visit(stmt); @@ -138,6 +138,7 @@ static bool returnsTrue(IndexExpr expr) { } void visit(const CastNode* op) { + std::cout << "visiting cast node\n"; expr = rewrite(op->a); } @@ -418,7 +419,7 @@ LowererImplImperative::lower(IndexStmt stmt, string name, Stmt LowererImplImperative::lowerAssignment(Assignment assignment) { - std::cout << "\n\n converting assignment IndexStmt============================================ Assignment\n"; + // std::cout << "\n\n converting assignment IndexStmt============================================ Assignment\n"; taco_iassert(generateAssembleCode() || generateComputeCode()); Stmt computeStmt; @@ -426,7 +427,7 @@ Stmt LowererImplImperative::lowerAssignment(Assignment assignment) Expr var = getTensorVar(result); const bool needComputeAssign = util::contains(needCompute, result); - std::cout << "does assignment need compute assign: " << needComputeAssign << std::endl; + // std::cout << "does assignment need compute assign: " << needComputeAssign << std::endl; Expr rhs; if (needComputeAssign) { rhs = lower(assignment.getRhs()); @@ -434,26 +435,26 @@ Stmt LowererImplImperative::lowerAssignment(Assignment assignment) // Assignment to scalar variables. 
if (isScalar(result.getType())) { - std::cout << "assignment to scalar variables\n"; + // std::cout << "assignment to scalar variables\n"; if (needComputeAssign) { - std::cout << "compute assign\n"; + // std::cout << "compute assign\n"; if (!assignment.getOperator().defined()) { - std::cout << "assignment operator is not defined\n"; - std::cout << "var: " << var << ", rhs, : " << rhs << std::endl; + // std::cout << "assignment operator is not defined\n"; + // std::cout << "var: " << var << ", rhs, : " << rhs << std::endl; computeStmt = Assign::make(var, rhs); } else { taco_iassert(isa(assignment.getOperator())); - std::cout << "assignment depth -- loopDepth: " << loopDepth << std::endl; - std::cout << "is markAssignsAtomicDepth > 0: " << (markAssignsAtomicDepth > 0) << std::endl; - for (auto &tensors_ : whereTemps) { - std::cout << tensors_ << ", "; - } - std::cout << std::endl; - std::cout << result << std::endl; + // std::cout << "assignment depth -- loopDepth: " << loopDepth << std::endl; + // std::cout << "is markAssignsAtomicDepth > 0: " << (markAssignsAtomicDepth > 0) << std::endl; + // for (auto &tensors_ : whereTemps) { + // // std::cout << tensors_ << ", "; + // } + // std::cout << std::endl; + // std::cout << result << std::endl; int tempVarInitLoopDepth = whereTempsWithLoopDepth.find(result)->second; - std::cout << "tempInitLoopDepth: " << tempVarInitLoopDepth << std::endl; + // std::cout << "tempInitLoopDepth: " << tempVarInitLoopDepth << std::endl; bool reduction = false; std::map::iterator itr; @@ -461,24 +462,24 @@ Stmt LowererImplImperative::lowerAssignment(Assignment assignment) if (itr->first<=loopDepth && itr->first>tempVarInitLoopDepth && itr->second == ParallelUnit::CPUSimd) { reduction = true; } - std::cout << itr->first << "\t" << ParallelUnit_NAMES[(int) itr->second] << std::endl; + // std::cout << itr->first << "\t" << ParallelUnit_NAMES[(int) itr->second] << std::endl; } // less than or equal to loopDepth but greater than temp variable 
initialized loop depth bool useAtomics = markAssignsAtomicDepth > 0 && (!util::contains(whereTemps, result) || reduction); - std::cout << "whereTemps and result: " << !util::contains(whereTemps, result) << std::endl; - std::cout << "assignment to scalar variables useAtomics: " << useAtomics << std::endl; + // std::cout << "whereTemps and result: " << !util::contains(whereTemps, result) << std::endl; + // std::cout << "assignment to scalar variables useAtomics: " << useAtomics << std::endl; computeStmt = compoundAssign(var, rhs, useAtomics, atomicParallelUnit); - std::cout << "computeStatment: " << computeStmt << std::endl; + // std::cout << "computeStatment: " << computeStmt << std::endl; } } else { - std::cout << "not compute assign\n"; + // std::cout << "not compute assign\n"; } } // Assignments to tensor variables (non-scalar). else { - std::cout << "assignment to tensor variables\n"; + // std::cout << "assignment to tensor variables\n"; Expr values = getValuesArray(result); Expr loc = generateValueLocExpr(assignment.getLhs()); @@ -512,7 +513,7 @@ Stmt LowererImplImperative::lowerAssignment(Assignment assignment) } if (needComputeAssign && values.defined()) { - std::cout << "assign compute statement\n"; + // std::cout << "assign compute statement\n"; if (!assignment.getOperator().defined()) { computeStmt = Store::make(values, loc, rhs); } @@ -627,34 +628,35 @@ LowererImplImperative::splitAppenderAndInserters(const vector& results /* * This is the for loop lowering part */ + Stmt LowererImplImperative::lowerForall(Forall forall) { loopDepth++; forUnits.insert(std::pair(loopDepth,forall.getParallelUnit())); - std::cout << "doing lowerForall: " << forall << std::endl; + // std::cout << "doing lowerForall: " << forall << std::endl; bool hasExactBound = provGraph.hasExactBound(forall.getIndexVar()); bool forallNeedsUnderivedGuards = !hasExactBound && emitUnderivedGuards; - std::cout << "printing temporary variables with their atomic depths\n"; + // std::cout << 
"printing temporary variables with their atomic depths\n"; map::iterator itr; for (itr = whereTempsWithLoopDepth.begin(); itr != whereTempsWithLoopDepth.end(); ++itr) { - std::cout << itr->first << "\t" << itr->second << "\n"; + // std::cout << itr->first << "\t" << itr->second << "\n"; } if (!ignoreVectorize && forallNeedsUnderivedGuards && (forall.getParallelUnit() == ParallelUnit::CPUVector || forall.getUnrollFactor() > 0)) { - std::cout << "calling lowerForallCloned(forall)\n"; + // std::cout << "calling lowerForallCloned(forall)\n"; return lowerForallCloned(forall); } - std::cout << "inParallelLoopDepth: " << inParallelLoopDepth << "========================\n"; + // std::cout << "inParallelLoopDepth: " << inParallelLoopDepth << "========================\n"; if (forall.getParallelUnit() != ParallelUnit::NotParallel) { inParallelLoopDepth++; } - std::cout << "inParallelLoopDepth: " << inParallelLoopDepth << "========================\n"; + // std::cout << "inParallelLoopDepth: " << inParallelLoopDepth << "========================\n"; // Recover any available parents that were not recoverable previously vector recoverySteps; @@ -842,23 +844,23 @@ Stmt LowererImplImperative::lowerForall(Forall forall) } if (!isWhereProducer && hasPosDescendant && underivedAncestors.size() > 1 && provGraph.isPosVariable(iterator.getIndexVar()) && posDescendant == forall.getIndexVar()) { - std::cout << "calling lowerForallFusedPosition(forall\n"; + // std::cout << "calling lowerForallFusedPosition(forall\n"; loops = lowerForallFusedPosition(forall, iterator, locators, inserters, appenders, reducedAccesses, recoveryStmt); } else if (canAccelWithSparseIteration) { - std::cout << "calling lowerForallDenseAcceleration(forall\n"; + // std::cout << "calling lowerForallDenseAcceleration(forall\n"; loops = lowerForallDenseAcceleration(forall, locators, inserters, appenders, reducedAccesses, recoveryStmt); } // Emit dimension coordinate iteration loop else if (iterator.isDimensionIterator()) 
{ - std::cout << "calling lowerForallDimension(forall\n"; + // std::cout << "calling lowerForallDimension(forall\n"; loops = lowerForallDimension(forall, point.locators(), inserters, appenders, reducedAccesses, recoveryStmt); } // Emit position iteration loop else if (iterator.hasPosIter()) { - std::cout << "calling lowerForallPosition(forall\n"; + // std::cout << "calling lowerForallPosition(forall\n"; loops = lowerForallPosition(forall, iterator, locators, inserters, appenders, reducedAccesses, recoveryStmt); } @@ -877,9 +879,9 @@ Stmt LowererImplImperative::lowerForall(Forall forall) forall.getStmt(), reducedAccesses); } - std::cout << "printing loops ----------------------------------------------------------------------------------------------\n"; - std::cout << loops << std::endl; - std::cout << "loops printed -----------------------------------------------------------------------------------------------\n"; + // std::cout << "printing loops ----------------------------------------------------------------------------------------------\n"; + // std::cout << loops << std::endl; + // std::cout << "loops printed -----------------------------------------------------------------------------------------------\n"; // taco_iassert(loops.defined()); if (!generateComputeCode() && !hasStores(loops)) { @@ -1203,22 +1205,22 @@ Stmt LowererImplImperative::lowerForallDimension(Forall forall, set reducedAccesses, ir::Stmt recoveryStmt) { - std::cout << "1 Stmt LowererImplImperative::lowerForallDimension\n"; - std::cout << "1 Stmt LowererImplImperative::lowerForallDimension markAssignsAtomicDepth: " << markAssignsAtomicDepth << std::endl; + // std::cout << "1 Stmt LowererImplImperative::lowerForallDimension\n"; + // std::cout << "1 Stmt LowererImplImperative::lowerForallDimension markAssignsAtomicDepth: " << markAssignsAtomicDepth << std::endl; Expr coordinate = getCoordinateVar(forall.getIndexVar()); if (forall.getParallelUnit() != ParallelUnit::NotParallel && 
forall.getOutputRaceStrategy() == OutputRaceStrategy::Atomics) { markAssignsAtomicDepth++; - std::cout << "1 Stmt LowererImplImperative::lowerForallDimension getParallelUnit() is Not NotParallel and outputRaceStrategy is Atomics\n"; - std::cout << "markAssignsAtomicDepth: " << markAssignsAtomicDepth << std::endl; + // std::cout << "1 Stmt LowererImplImperative::lowerForallDimension getParallelUnit() is Not NotParallel and outputRaceStrategy is Atomics\n"; + // std::cout << "markAssignsAtomicDepth: " << markAssignsAtomicDepth << std::endl; atomicParallelUnit = forall.getParallelUnit(); } else { - std::cout << "1 Stmt LowererImplImperative::lowerForallDimension getParallelUnit() is NotParallel or outputRaceStrategy is not Atomics\n"; + // std::cout << "1 Stmt LowererImplImperative::lowerForallDimension getParallelUnit() is NotParallel or outputRaceStrategy is not Atomics\n"; } - std::cout << "original forall : " << forall << std::endl; - std::cout << "inside IndexStmt: " << forall.getStmt() << std::endl; + // std::cout << "original forall : " << forall << std::endl; + // std::cout << "inside IndexStmt: " << forall.getStmt() << std::endl; Stmt body = lowerForallBody(coordinate, forall.getStmt(), locators, inserters, appenders, reducedAccesses); @@ -1235,7 +1237,7 @@ Stmt LowererImplImperative::lowerForallDimension(Forall forall, LoopKind kind = LoopKind::Serial; if (should_use_ISPC_codegen()) { - std::cout << "Foreach compatible loop\n"; + // std::cout << "Foreach compatible loop\n"; if (forall.getParallelUnit() == ParallelUnit::CPUSimd) { kind = LoopKind::Foreach; } @@ -1253,7 +1255,7 @@ Stmt LowererImplImperative::lowerForallDimension(Forall forall, kind = LoopKind::Runtime; } - std::cout << "2 Stmt LowererImplImperative::lowerForallDimension\n"; + // std::cout << "2 Stmt LowererImplImperative::lowerForallDimension\n"; return Block::blanks(For::make(coordinate, bounds[0], bounds[1], 1, body, kind, ignoreVectorize ? 
ParallelUnit::NotParallel : forall.getParallelUnit(), ignoreVectorize ? 0 : forall.getUnrollFactor()), @@ -1267,7 +1269,7 @@ Stmt LowererImplImperative::lowerForallDimension(Forall forall, set reducedAccesses, ir::Stmt recoveryStmt) { - std::cout << "1 Stmt LowererImplImperative::lowerForallDenseAcceleration\n"; + // std::cout << "1 Stmt LowererImplImperative::lowerForallDenseAcceleration\n"; taco_iassert(locators.size() == 1) << "Optimizing a dense workspace is only supported when the consumer is the only RHS tensor"; taco_iassert(provGraph.isFullyDerived(forall.getIndexVar())) << "Sparsely accelerating a dense workspace only works with fully derived index vars"; taco_iassert(forall.getParallelUnit() == ParallelUnit::NotParallel) << "Sparsely accelerating a dense workspace only works within serial loops"; @@ -1293,8 +1295,8 @@ Stmt LowererImplImperative::lowerForallDimension(Forall forall, } Stmt declareVar = VarDecl::make(coordinate, Load::make(indexList, loopVar)); - std::cout << "original forall : " << forall << std::endl; - std::cout << "inside IndexStmt: " << forall.getStmt() << std::endl; + // std::cout << "original forall : " << forall << std::endl; + // std::cout << "inside IndexStmt: " << forall.getStmt() << std::endl; Stmt body = lowerForallBody(coordinate, forall.getStmt(), locators, inserters, appenders, reducedAccesses); Stmt resetGuard = ir::Store::make(bitGuard, coordinate, ir::Literal::make(false), markAssignsAtomicDepth > 0, atomicParallelUnit); @@ -1320,7 +1322,7 @@ Stmt LowererImplImperative::lowerForallDimension(Forall forall, kind = LoopKind::Runtime; } - std::cout << "2 Stmt LowererImplImperative::lowerForallDenseAcceleration\n"; + // std::cout << "2 Stmt LowererImplImperative::lowerForallDenseAcceleration\n"; return Block::blanks(For::make(loopVar, 0, indexListSize, 1, body, kind, ignoreVectorize ? ParallelUnit::NotParallel : forall.getParallelUnit(), ignoreVectorize ? 
0 : forall.getUnrollFactor()), @@ -1344,7 +1346,7 @@ Stmt LowererImplImperative::lowerForallPosition(Forall forall, Iterator iterator set reducedAccesses, ir::Stmt recoveryStmt) { - std::cout << "1 Stmt LowererImplImperative::lowerForallPosition\n" << std::endl; + // std::cout << "1 Stmt LowererImplImperative::lowerForallPosition\n" << std::endl; Expr coordinate = getCoordinateVar(forall.getIndexVar()); Stmt declareCoordinate = Stmt(); @@ -1380,8 +1382,8 @@ Stmt LowererImplImperative::lowerForallPosition(Forall forall, Iterator iterator // see we are inside a forall. ex: forall(i, forall(j, y(i) += A(i,j) * x(j))) // when you call forall.getStmt it returns forall(j, y(i) += A(i,j) * x(j)) which is the // IndexStmt inside the forall IndexStmt - std::cout << "original forall : " << forall << std::endl; - std::cout << "inside IndexStmt: " << forall.getStmt() << std::endl; + // std::cout << "original forall : " << forall << std::endl; + // std::cout << "inside IndexStmt: " << forall.getStmt() << std::endl; Stmt body = lowerForallBody(coordinate, forall.getStmt(), locators, inserters, appenders, reducedAccesses); @@ -1443,7 +1445,7 @@ Stmt LowererImplImperative::lowerForallPosition(Forall forall, Iterator iterator kind = LoopKind::Runtime; } - std::cout << "2 Stmt LowererImplImperative::lowerForallPosition\n" << std::endl; + // std::cout << "2 Stmt LowererImplImperative::lowerForallPosition\n" << std::endl; // Loop with preamble and postamble return Block::blanks( boundsCompute, @@ -1462,7 +1464,7 @@ Stmt LowererImplImperative::lowerForallFusedPosition(Forall forall, Iterator ite set reducedAccesses, ir::Stmt recoveryStmt) { - std::cout << "1 Stmt LowererImplImperative::lowerForallFusedPosition\n" << std::endl; + // std::cout << "1 Stmt LowererImplImperative::lowerForallFusedPosition\n" << std::endl; Expr coordinate = getCoordinateVar(forall.getIndexVar()); Stmt declareCoordinate = Stmt(); if (provGraph.isCoordVariable(forall.getIndexVar())) { @@ -1553,8 +1555,8 @@ Stmt 
LowererImplImperative::lowerForallFusedPosition(Forall forall, Iterator ite markAssignsAtomicDepth++; } - std::cout << "original forall : " << forall << std::endl; - std::cout << "inside IndexStmt: " << forall.getStmt() << std::endl; + // std::cout << "original forall : " << forall << std::endl; + // std::cout << "inside IndexStmt: " << forall.getStmt() << std::endl; Stmt body = lowerForallBody(coordinate, forall.getStmt(), locators, inserters, appenders, reducedAccesses); @@ -1612,7 +1614,7 @@ Stmt LowererImplImperative::lowerForallFusedPosition(Forall forall, Iterator ite kind = LoopKind::Runtime; } - std::cout << "2 Stmt LowererImplImperative::lowerForallFusedPosition\n" << std::endl; + // std::cout << "2 Stmt LowererImplImperative::lowerForallFusedPosition\n" << std::endl; // Loop with preamble and postamble return Block::blanks(boundsCompute, Block::make(Block::make(searchForUnderivedStart), @@ -1713,6 +1715,7 @@ Stmt LowererImplImperative::lowerMergePoint(MergeLattice pointLattice, ir::Assign::make(indexSetIter.getCoordVar(), indexSetIter.getPosVar()) ); // Code to increment both iterator variables. 
+ std::cout << "some casting stuff happening\n"; auto incr = ir::Block::make( compoundAssign(iter.getIteratorVar(), ir::Cast::make(Eq::make(iter.getCoordVar(), setMatch), iter.getIteratorVar().type())), compoundAssign(indexSetIter.getIteratorVar(), ir::Cast::make(Eq::make(indexSetIter.getCoordVar(), setMatch), indexSetIter.getIteratorVar().type())), @@ -1876,7 +1879,7 @@ Stmt LowererImplImperative::lowerForallBody(Expr coordinate, IndexStmt stmt, vector appenders, const set& reducedAccesses) { - std::cout << "lowering a forall body----------------------------------------------------\n"; + // std::cout << "lowering a forall body----------------------------------------------------\n"; Stmt initVals = resizeAndInitValues(appenders, reducedAccesses); @@ -1893,7 +1896,7 @@ Stmt LowererImplImperative::lowerForallBody(Expr coordinate, IndexStmt stmt, // Code of loop body statement Stmt body = lower(stmt); - std::cout << "\nBefore: [" << stmt << "]\nAfter : [" << body << "]\n"; + // std::cout << "\nBefore: [" << stmt << "]\nAfter : [" << body << "]\n"; // Code to append coordinates Stmt appendCoords = appendCoordinate(appenders, coordinate); @@ -1911,10 +1914,12 @@ Expr LowererImplImperative::getTemporarySize(Where where) { TensorVar temporary = where.getTemporary(); Dimension temporarySize = temporary.getType().getShape().getDimension(0); Access temporaryAccess = getResultAccesses(where.getProducer()).first[0]; + std::cout << "temporaryAccess: " << temporaryAccess; std::vector indexVars = temporaryAccess.getIndexVars(); if(util::all(indexVars, [&](const IndexVar& var) { return provGraph.isUnderived(var);})) { // All index vars underived then use tensor properties to get tensor size + std::cout << "All index vars underived then use tensor properties to get tensor size\n"; taco_iassert(util::contains(dimensions, indexVars[0])) << "Missing " << indexVars[0]; ir::Expr size = dimensions.at(indexVars[0]); for(size_t i = 1; i < indexVars.size(); ++i) { @@ -1925,16 +1930,19 @@ 
Expr LowererImplImperative::getTemporarySize(Where where) { } if (temporarySize.isFixed()) { + std::cout << "temporary is fixed\n" ; return ir::Literal::make(temporarySize.getSize()); } if (temporarySize.isIndexVarSized()) { + std::cout << "temporary is index var sized\n"; IndexVar var = temporarySize.getIndexVarSize(); vector bounds = provGraph.deriveIterBounds(var, definedIndexVarsOrdered, underivedBounds, indexVarToExprMap, iterators); return ir::Sub::make(bounds[1], bounds[0]); } + std::cout << "should this be an error\n"; taco_ierror; // TODO return Expr(); } @@ -2003,7 +2011,7 @@ vector LowererImplImperative::codeToInitializeDenseAcceleratorArrays(Where Expr p = Var::make("p" + temporary.getName(), Int()); Stmt guardZeroInit = Store::make(alreadySetArr, p, ir::Literal::zero(bitGuardType)); - std::cout << "vector LowererImplImperative::codeToInitializeDenseAcceleratorArrays\n" << std::endl; + // std::cout << "vector LowererImplImperative::codeToInitializeDenseAcceleratorArrays\n" << std::endl; Stmt zeroInitLoop = For::make(p, 0, bitGuardSize, 1, guardZeroInit, LoopKind::Serial); Stmt inits = Block::make(alreadySetDecl, indexListDecl, allocateAlreadySet, allocateIndexList, zeroInitLoop); return {inits, freeTemps}; @@ -2205,8 +2213,10 @@ vector LowererImplImperative::codeToInitializeTemporaryParallel(Where wher vector LowererImplImperative::codeToInitializeTemporary(Where where) { TensorVar temporary = where.getTemporary(); + cout << "temporary found: " << temporary << std::endl; const bool accelerateDense = canAccelerateDenseTemp(where).first; + cout << "accelerateDense: " << accelerateDense << std::endl; Stmt freeTemporary = Stmt(); Stmt initializeTemporary = Stmt(); @@ -2217,6 +2227,7 @@ vector LowererImplImperative::codeToInitializeTemporary(Where where) { initializeTemporary = Block::make(initializeTemporary, initTempSet); tempToBitGuard[temporary] = tempSet; } else { + cout << "higher order temporary found: " << temporary << std::endl; // TODO: Need to 
support keeping track of initialized elements for // temporaries that don't have sparse accelerator taco_iassert(!util::contains(guardedTemps, temporary) || accelerateDense); @@ -2234,19 +2245,32 @@ vector LowererImplImperative::codeToInitializeTemporary(Where where) { needComputeValues(where, temporary)) { values = ir::Var::make(temporary.getName(), temporary.getType().getDataType(), true, false); - taco_iassert(temporary.getType().getOrder() == 1) - << " Temporary order was " << temporary.getType().getOrder(); // TODO + std::cout << "values: " << values << std::endl; + std::cout << "dataType: " << values.type() << std::endl; + + // taco_iassert(temporary.getType().getOrder() == 1) + // << " Temporary order was " << temporary.getType().getOrder(); // TODO + Expr size = getTemporarySize(where); + std::cout << "temporarySize: " << size << std::endl; + // no decl needed for shared memory Stmt decl = Stmt(); if ((isa(where.getProducer()) && inParallelLoopDepth == 0) || !should_use_CUDA_codegen()) { decl = VarDecl::make(values, ir::Literal::make(0)); + std::cout << "decl statement: " << decl << std::endl; } Stmt allocate = Allocate::make(values, size); + std::cout << "allocate stmt: " << allocate << std::endl; freeTemporary = Block::make(freeTemporary, Free::make(values)); + std::cout << "free temp: " << freeTemporary << std::endl; initializeTemporary = Block::make(decl, initializeTemporary, allocate); + std::cout << "initializeTemporary: " << initializeTemporary << std::endl; + + // taco_iassert(temporary.getType().getOrder() == 1) + // << " Temporary order was " << temporary.getType().getOrder(); // TODO } /// Make a struct object that lowerAssignment and lowerAccess can read @@ -2259,7 +2283,7 @@ vector LowererImplImperative::codeToInitializeTemporary(Where where) { } Stmt LowererImplImperative::lowerWhere(Where where) { - std::cout << "\n--------------------------------------- lowering where statement: " << where << "\n\n\n"; + // std::cout << 
"\n--------------------------------------- lowering where statement: " << where << "\n\n\n"; TensorVar temporary = where.getTemporary(); bool accelerateDenseWorkSpace, sortAccelerator; std::tie(accelerateDenseWorkSpace, sortAccelerator) = @@ -2296,7 +2320,7 @@ Stmt LowererImplImperative::lowerWhere(Where where) { }) ); - std::cout << "\ninitiating lowering of where consumer: " << where.getConsumer() << std::endl; + // std::cout << "\ninitiating lowering of where consumer: " << where.getConsumer() << std::endl; Stmt consumer = lower(where.getConsumer()); if (accelerateDenseWorkSpace && sortAccelerator) { // We need to sort the indices array @@ -2320,13 +2344,13 @@ Stmt LowererImplImperative::lowerWhere(Where where) { true, false); Expr size = getTemporarySize(where); Stmt zeroInit = Store::make(values, p, ir::Literal::zero(temporary.getType().getDataType())); - std::cout << "Stmt LowererImplImperative::lowerWhere\n"; + // std::cout << "Stmt LowererImplImperative::lowerWhere\n"; Stmt loopInit = For::make(p, 0, size, 1, zeroInit, LoopKind::Serial); initializeTemporary = Block::make(initializeTemporary, loopInit); } whereConsumers.push_back(consumer); - std::cout << "\nwhere temporaries: " << where.getTemporary() << std::endl; + // std::cout << "\nwhere temporaries: " << where.getTemporary() << std::endl; whereTemps.push_back(where.getTemporary()); captureNextLocatePos = true; @@ -2339,7 +2363,7 @@ Stmt LowererImplImperative::lowerWhere(Where where) { whereTempsWithLoopDepth.insert(std::pair(where.getTemporary(), loopDepth)); - std::cout << "\ninitiating lowering of where producer: " << where.getConsumer() << std::endl; + // std::cout << "\ninitiating lowering of where producer: " << where.getConsumer() << std::endl; Stmt producer = lower(where.getProducer()); if (accelerateDenseWorkSpace) { const Expr indexListSizeExpr = tempToIndexListSize.at(temporary); @@ -2458,7 +2482,7 @@ Stmt LowererImplImperative::lowerAssemble(Assemble assemble) { 
resultModeOrdering[iter.getMode().getLevel() - 1]); Expr pos = iter.getPosVar(); Stmt initPos = VarDecl::make(pos, iter.locate(locateCoords)[0]); - std::cout << "Stmt LowererImplImperative::lowerAssemble\n"; + // std::cout << "Stmt LowererImplImperative::lowerAssemble\n"; insertEdgeLoop = For::make(coords.back(), 0, dim, 1, Block::make(initPos, insertEdgeLoop)); } else { @@ -2496,7 +2520,7 @@ Stmt LowererImplImperative::lowerAssemble(Assemble assemble) { initAssembleStmts.push_back(initValues); } } else if (zeroInit) { - initAssembleStmts.push_back(zeroInitValues(resultTensorVar, 0, prevSize)); + initAssembleStmts.push_back(zeroInitValues(resultTensorVar, 0, prevSize)); // init values } } Stmt initAssemble = Block::make(initAssembleStmts); @@ -2540,7 +2564,7 @@ Stmt LowererImplImperative::lowerMulti(Multi multi) { } Stmt LowererImplImperative::lowerSuchThat(SuchThat suchThat) { - std::cout << "lowering such that statement\n"; + // std::cout << "lowering such that statement\n"; Stmt stmt = lower(suchThat.getStmt()); return Block::make(stmt); } @@ -2654,6 +2678,7 @@ Expr LowererImplImperative::lowerSqrt(Sqrt sqrt) { Expr LowererImplImperative::lowerCast(Cast cast) { + std::cout << "casting: " << cast.getA() << ", dataType: " << cast.getDataType() << std::endl; return ir::Cast::make(lower(cast.getA()), cast.getDataType()); } @@ -2870,7 +2895,7 @@ Stmt LowererImplImperative::initResultArrays(vector writes, // iteration of all the iterators is not full. We can check this by seeing if we can recover a // full iterator from our set of iterators. Expr size = generateAssembleCode() ? getCapacityVar(tensor) : parentSize; - result.push_back(zeroInitValues(tensor, 0, size)); + result.push_back(zeroInitValues(tensor, 0, size)); // init values } } return result.empty() ? 
Stmt() : Block::blanks(result); @@ -3021,7 +3046,7 @@ Stmt LowererImplImperative::initResultArrays(IndexVar var, vector writes util::contains(reducedAccesses, write)) { // Zero-initialize values array if might not assign to every element // in values array during compute - result.push_back(zeroInitValues(tensor, resultParentPos, stride)); + result.push_back(zeroInitValues(tensor, resultParentPos, stride)); // init values } } } @@ -3068,7 +3093,7 @@ Stmt LowererImplImperative::resizeAndInitValues(const std::vector& app Stmt LowererImplImperative::zeroInitValues(Expr tensor, Expr begin, Expr size) { - std::cout << "1 Stmt LowererImplImperative::zeroInitValues\n"; + // std::cout << "1 Stmt LowererImplImperative::zeroInitValues\n"; Expr lower = simplify(ir::Mul::make(begin, size)); Expr upper = simplify(ir::Mul::make(ir::Add::make(begin, 1), size)); Expr p = Var::make("p" + util::toString(tensor), Int()); @@ -3081,9 +3106,10 @@ Stmt LowererImplImperative::zeroInitValues(Expr tensor, Expr begin, Expr size) { return ir::VarDecl::make(ir::Var::make("status", Int()), ir::Call::make("cudaMemset", {values, ir::Literal::make(0, Int()), ir::Mul::make(ir::Sub::make(upper, lower), ir::Literal::make(values.type().getNumBytes()))}, Int())); } - std::cout << "2 Stmt LowererImplImperative::zeroInitValues\n"; + // std::cout << "2 Stmt LowererImplImperative::zeroInitValues\n"; + // if generating ispc code, we will keep the LoopKind as Init so that we can initializa it if tasks are used if (should_use_ISPC_codegen()) { - return For::make(p, lower, upper, 1, zeroInit, LoopKind::Foreach); + return For::make(p, lower, upper, 1, zeroInit, LoopKind::Init); } return For::make(p, lower, upper, 1, zeroInit, parallel); } @@ -3366,6 +3392,7 @@ Stmt LowererImplImperative::codeToIncIteratorVars(Expr coordinate, IndexVar coor for (auto& iterator : levelIterators) { Expr ivar = iterator.getIteratorVar(); if (iterator.isUnique()) { + std::cout << "casting \n"; Expr increment = iterator.isFull() ? 
1 : ir::Cast::make(Eq::make(iterator.getCoordVar(), @@ -3636,6 +3663,7 @@ Expr LowererImplImperative::generateAssembleGuard(IndexExpr expr) { } void visit(const CastNode* node) { + std::cout << "lowering to cast node\n"; expr = lower(node->a); } diff --git a/src/lower/tensor_path.h b/src/lower/tensor_path.h index 4f5dc49af..da52fb782 100644 --- a/src/lower/tensor_path.h +++ b/src/lower/tensor_path.h @@ -2,6 +2,7 @@ #define TACO_TENSOR_PATH_H #include +#include #include #include "taco/util/comparable.h" @@ -47,14 +48,13 @@ class TensorPath : public util::Comparable { friend bool operator==(const TensorPath&, const TensorPath&); friend bool operator<(const TensorPath&, const TensorPath&); + friend std::ostream& operator<<(std::ostream&, const TensorPath&); private: struct Content; std::shared_ptr content; }; -std::ostream& operator<<(std::ostream&, const TensorPath&); - /// A step along a tensor path. class TensorPathStep : public util::Comparable { diff --git a/src/tensor.cpp b/src/tensor.cpp index 5e02d2660..176856196 100644 --- a/src/tensor.cpp +++ b/src/tensor.cpp @@ -10,6 +10,7 @@ #include #include +#include "../test/util.h" #include "taco/cuda.h" #include "taco/format.h" #include "taco/taco_tensor_t.h" @@ -806,7 +807,36 @@ void TensorBase::assemble() { } } -void TensorBase::compute() { +void TensorBase::compute(std::ofstream& statfile, std::string& sofile) { + taco_uassert(!needsCompile()) << error::compute_without_compile; + // if (!needsCompute()) { + // return; + // } + setNeedsCompute(false); + // Sync operand tensors if needed. 
+ auto operands = getTensors(getAssignment().getRhs()); + for (auto& operand : operands) { + // std::cout << "operand: " << operand.second << std::endl; + operand.second.syncValues(); + operand.second.removeDependentTensor(*this); + } + + auto arguments = packArguments(*this); + + taco::util::TimeResults timevalue; + bool time = true; + TOOL_BENCHMARK_TIMER2(this->content->module->callFuncPacked("compute", sofile, arguments.data()), + "\nkernel execution time: ", timevalue); + // this->content->module->callFuncPacked("compute", arguments.data()); + + if (content->assembleWhileCompute) { + setNeedsAssemble(false); + taco_tensor_t* tensorData = ((taco_tensor_t*)arguments[0]); + content->valuesSize = unpackTensorData(*tensorData, *this); + } +} + +void TensorBase::compute(std::ofstream& statfile) { taco_uassert(!needsCompile()) << error::compute_without_compile; // if (!needsCompute()) { // return; @@ -820,7 +850,37 @@ void TensorBase::compute() { } auto arguments = packArguments(*this); + + taco::util::TimeResults timevalue; + bool time = true; + TOOL_BENCHMARK_TIMER2(this->content->module->callFuncPacked("compute", arguments.data()), + "\nkernel execution time: ", timevalue); + // this->content->module->callFuncPacked("compute", arguments.data()); + + if (content->assembleWhileCompute) { + setNeedsAssemble(false); + taco_tensor_t* tensorData = ((taco_tensor_t*)arguments[0]); + content->valuesSize = unpackTensorData(*tensorData, *this); + } +} + +void TensorBase::compute() { + taco_uassert(!needsCompile()) << error::compute_without_compile; + if (!needsCompute()) { + return; + } + setNeedsCompute(false); + // Sync operand tensors if needed. 
+ auto operands = getTensors(getAssignment().getRhs()); + for (auto& operand : operands) { + operand.second.syncValues(); + operand.second.removeDependentTensor(*this); + } + + auto arguments = packArguments(*this); + std::cout << "running the compute function from the shared library\n"; this->content->module->callFuncPacked("compute", arguments.data()); + std::cout << "compute function executed\n"; if (content->assembleWhileCompute) { setNeedsAssemble(false); diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 02464ce26..f4d848de0 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -11,6 +11,7 @@ add_executable(taco-test ${TEST_SOURCES} ${TEST_HEADERS}) target_link_libraries(taco-test taco-gtest) target_link_libraries(taco-test pthread) target_link_libraries(taco-test taco) +target_link_libraries(taco-test papi) if(${CMAKE_VERSION} VERSION_LESS "3.9.0") add_test(NAME taco-test COMMAND taco-test) diff --git a/test/kernels/mttkrp_gemm/mttkrp_ryan.c b/test/kernels/mttkrp_gemm/mttkrp_ryan.c new file mode 100644 index 000000000..9d0536b8c --- /dev/null +++ b/test/kernels/mttkrp_gemm/mttkrp_ryan.c @@ -0,0 +1,177 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? 
(_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = 
(taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +int assemble(taco_tensor_t *A1845, taco_tensor_t *matmul_5_5_5, taco_tensor_t *A1475, taco_tensor_t *A1416) { + int A18451_dimension = (int)(A1845->dimensions[0]); + int A18452_dimension = (int)(A1845->dimensions[1]); + double* restrict A1845_vals = (double*)(A1845->vals); + + A1845_vals = (double*)malloc(sizeof(double) * (A18451_dimension * A18452_dimension)); + + A1845->vals = (uint8_t*)A1845_vals; + return 0; +} + +int compute(taco_tensor_t *A1845, taco_tensor_t *matmul_5_5_5, taco_tensor_t *A1475, taco_tensor_t *A1416) { + int A18451_dimension = (int)(A1845->dimensions[0]); + int A18452_dimension = (int)(A1845->dimensions[1]); + double* restrict A1845_vals = (double*)(A1845->vals); + int* restrict matmul_5_5_51_pos = (int*)(matmul_5_5_5->indices[0][0]); + int* restrict matmul_5_5_51_crd = (int*)(matmul_5_5_5->indices[0][1]); + int* restrict matmul_5_5_52_pos = (int*)(matmul_5_5_5->indices[1][0]); + int* restrict matmul_5_5_52_crd = (int*)(matmul_5_5_5->indices[1][1]); + int* restrict 
matmul_5_5_53_pos = (int*)(matmul_5_5_5->indices[2][0]); + int* restrict matmul_5_5_53_crd = (int*)(matmul_5_5_5->indices[2][1]); + double* restrict matmul_5_5_5_vals = (double*)(matmul_5_5_5->vals); + int A14751_dimension = (int)(A1475->dimensions[0]); + int A14752_dimension = (int)(A1475->dimensions[1]); + double* restrict A1475_vals = (double*)(A1475->vals); + int A14161_dimension = (int)(A1416->dimensions[0]); + int A14162_dimension = (int)(A1416->dimensions[1]); + double* restrict A1416_vals = (double*)(A1416->vals); + + #pragma omp parallel for schedule(static) + for (int32_t pA1845 = 0; pA1845 < (A18451_dimension * A18452_dimension); pA1845++) { + A1845_vals[pA1845] = 0.0; + } + + #pragma omp parallel for schedule(runtime) + for (int32_t i1542matmul_5_5_5 = matmul_5_5_51_pos[0]; i1542matmul_5_5_5 < matmul_5_5_51_pos[1]; i1542matmul_5_5_5++) { + int32_t i1542 = matmul_5_5_51_crd[i1542matmul_5_5_5]; + for (int32_t i1545 = 0; i1545 < A14162_dimension; i1545++) { + int32_t i1545A1845 = i1542 * A18452_dimension + i1545; + double ti1543A1845_val = 0.0; + for (int32_t i1543matmul_5_5_5 = matmul_5_5_52_pos[i1542matmul_5_5_5]; i1543matmul_5_5_5 < matmul_5_5_52_pos[(i1542matmul_5_5_5 + 1)]; i1543matmul_5_5_5++) { + int32_t i1543 = matmul_5_5_52_crd[i1543matmul_5_5_5]; + int32_t i1545A1416 = i1543 * A14162_dimension + i1545; + for (int32_t i1544matmul_5_5_5 = matmul_5_5_53_pos[i1543matmul_5_5_5]; i1544matmul_5_5_5 < matmul_5_5_53_pos[(i1543matmul_5_5_5 + 1)]; i1544matmul_5_5_5++) { + int32_t i1544 = matmul_5_5_53_crd[i1544matmul_5_5_5]; + int32_t i1545A1475 = i1544 * A14752_dimension + i1545; + ti1543A1845_val += (matmul_5_5_5_vals[i1544matmul_5_5_5] * A1475_vals[i1545A1475]) * A1416_vals[i1545A1416]; + } + } + A1845_vals[i1545A1845] = ti1543A1845_val; + } + } + return 0; +} +#include "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/mttkrp_gemm/taco_default.h" +int _shim_assemble(void** parameterPack) { + return assemble((taco_tensor_t*)(parameterPack[0]), 
(taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3])); +} +int _shim_compute(void** parameterPack) { + return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3])); +} diff --git a/test/kernels/mttkrp_gemm/mttkrp_ryan.h b/test/kernels/mttkrp_gemm/mttkrp_ryan.h new file mode 100644 index 000000000..3d0c06f50 --- /dev/null +++ b/test/kernels/mttkrp_gemm/mttkrp_ryan.h @@ -0,0 +1,125 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + 
else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +#ifndef TACO_GENERATED_assemble +#define TACO_GENERATED_assemble +int assemble(taco_tensor_t *A1845, taco_tensor_t *matmul_5_5_5, taco_tensor_t *A1475, taco_tensor_t *A1416); +#endif + +#ifndef TACO_GENERATED_compute +#define 
TACO_GENERATED_compute +int compute(taco_tensor_t *A1845, taco_tensor_t *matmul_5_5_5, taco_tensor_t *A1475, taco_tensor_t *A1416); +#endif diff --git a/test/kernels/mttkrp_gemm/taco_default.c b/test/kernels/mttkrp_gemm/taco_default.c new file mode 100644 index 000000000..edf8cdb16 --- /dev/null +++ b/test/kernels/mttkrp_gemm/taco_default.c @@ -0,0 +1,183 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, 
int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +int assemble(taco_tensor_t *A1538, taco_tensor_t *matmul_5_5_5, taco_tensor_t *A1475, taco_tensor_t *A1416, taco_tensor_t *A1479) { + int A15381_dimension = (int)(A1538->dimensions[0]); + int A15382_dimension = (int)(A1538->dimensions[1]); + double* restrict A1538_vals = (double*)(A1538->vals); + + A1538_vals = (double*)malloc(sizeof(double) * (A15381_dimension * A15382_dimension)); + + A1538->vals = 
(uint8_t*)A1538_vals; + return 0; +} + +int compute(taco_tensor_t *A1538, taco_tensor_t *matmul_5_5_5, taco_tensor_t *A1475, taco_tensor_t *A1416, taco_tensor_t *A1479) { + int A15381_dimension = (int)(A1538->dimensions[0]); + int A15382_dimension = (int)(A1538->dimensions[1]); + double* restrict A1538_vals = (double*)(A1538->vals); + int* restrict matmul_5_5_51_pos = (int*)(matmul_5_5_5->indices[0][0]); + int* restrict matmul_5_5_51_crd = (int*)(matmul_5_5_5->indices[0][1]); + int* restrict matmul_5_5_52_pos = (int*)(matmul_5_5_5->indices[1][0]); + int* restrict matmul_5_5_52_crd = (int*)(matmul_5_5_5->indices[1][1]); + int* restrict matmul_5_5_53_pos = (int*)(matmul_5_5_5->indices[2][0]); + int* restrict matmul_5_5_53_crd = (int*)(matmul_5_5_5->indices[2][1]); + double* restrict matmul_5_5_5_vals = (double*)(matmul_5_5_5->vals); + int A14751_dimension = (int)(A1475->dimensions[0]); + int A14752_dimension = (int)(A1475->dimensions[1]); + double* restrict A1475_vals = (double*)(A1475->vals); + int A14161_dimension = (int)(A1416->dimensions[0]); + int A14162_dimension = (int)(A1416->dimensions[1]); + double* restrict A1416_vals = (double*)(A1416->vals); + int A14791_dimension = (int)(A1479->dimensions[0]); + int A14792_dimension = (int)(A1479->dimensions[1]); + double* restrict A1479_vals = (double*)(A1479->vals); + + #pragma omp parallel for schedule(static) + for (int32_t pA1538 = 0; pA1538 < (A15381_dimension * A15382_dimension); pA1538++) { + A1538_vals[pA1538] = 0.0; + } + + #pragma omp parallel for schedule(runtime) + for (int32_t i1542matmul_5_5_5 = matmul_5_5_51_pos[0]; i1542matmul_5_5_5 < matmul_5_5_51_pos[1]; i1542matmul_5_5_5++) { + int32_t i1542 = matmul_5_5_51_crd[i1542matmul_5_5_5]; + for (int32_t i1546 = 0; i1546 < A14792_dimension; i1546++) { + int32_t i1546A1538 = i1542 * A15382_dimension + i1546; + double ti1543A1538_val = 0.0; + for (int32_t i1543matmul_5_5_5 = matmul_5_5_52_pos[i1542matmul_5_5_5]; i1543matmul_5_5_5 < 
matmul_5_5_52_pos[(i1542matmul_5_5_5 + 1)]; i1543matmul_5_5_5++) { + int32_t i1543 = matmul_5_5_52_crd[i1543matmul_5_5_5]; + for (int32_t i1544matmul_5_5_5 = matmul_5_5_53_pos[i1543matmul_5_5_5]; i1544matmul_5_5_5 < matmul_5_5_53_pos[(i1543matmul_5_5_5 + 1)]; i1544matmul_5_5_5++) { + int32_t i1544 = matmul_5_5_53_crd[i1544matmul_5_5_5]; + for (int32_t i1545 = 0; i1545 < A14791_dimension; i1545++) { + int32_t i1545A1475 = i1544 * A14752_dimension + i1545; + int32_t i1545A1416 = i1543 * A14162_dimension + i1545; + int32_t i1546A1479 = i1545 * A14792_dimension + i1546; + ti1543A1538_val += ((matmul_5_5_5_vals[i1544matmul_5_5_5] * A1475_vals[i1545A1475]) * A1416_vals[i1545A1416]) * A1479_vals[i1546A1479]; + } + } + } + A1538_vals[i1546A1538] = ti1543A1538_val; + } + } + return 0; +} +#include "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/mttkrp_gemm/taco_default.h" +int _shim_assemble(void** parameterPack) { + return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3]), (taco_tensor_t*)(parameterPack[4])); +} +int _shim_compute(void** parameterPack) { + return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3]), (taco_tensor_t*)(parameterPack[4])); +} diff --git a/test/kernels/mttkrp_gemm/taco_default.h b/test/kernels/mttkrp_gemm/taco_default.h new file mode 100644 index 000000000..54274569e --- /dev/null +++ b/test/kernels/mttkrp_gemm/taco_default.h @@ -0,0 +1,125 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? 
(_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = 
(taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +#ifndef TACO_GENERATED_assemble +#define TACO_GENERATED_assemble +int assemble(taco_tensor_t *A1538, taco_tensor_t *matmul_5_5_5, taco_tensor_t *A1475, taco_tensor_t *A1416, taco_tensor_t *A1479); +#endif + +#ifndef TACO_GENERATED_compute +#define TACO_GENERATED_compute +int compute(taco_tensor_t *A1538, taco_tensor_t *matmul_5_5_5, taco_tensor_t *A1475, taco_tensor_t *A1416, taco_tensor_t *A1479); +#endif diff --git a/test/kernels/sddmm_spmm/csr_dense_dense_sddmm.c b/test/kernels/sddmm_spmm/csr_dense_dense_sddmm.c new file mode 100644 index 000000000..a5e031e7a --- /dev/null +++ b/test/kernels/sddmm_spmm/csr_dense_dense_sddmm.c @@ -0,0 +1,199 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? 
(_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = 
(taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +int assemble(taco_tensor_t *A2531, taco_tensor_t *cage3, taco_tensor_t *A1392, taco_tensor_t *A1451) { + int* restrict A25312_pos = (int*)(A2531->indices[1][0]); + int* restrict A25312_crd = (int*)(A2531->indices[1][1]); + double* restrict A2531_vals = (double*)(A2531->vals); + int* restrict cage32_pos = (int*)(cage3->indices[1][0]); + int* restrict cage32_crd = (int*)(cage3->indices[1][1]); + int A13921_dimension = (int)(A1392->dimensions[0]); + + A25312_pos = (int32_t*)malloc(sizeof(int32_t) * 6); + A25312_pos[0] = 0; + for (int32_t pA25312 = 1; pA25312 < 6; pA25312++) { + A25312_pos[pA25312] = 0; + } + int32_t A25312_crd_size = 1048576; + A25312_crd = (int32_t*)malloc(sizeof(int32_t) * A25312_crd_size); + int32_t i1468A2531 = 0; + + for (int32_t i1467 = 0; i1467 < A13921_dimension; i1467++) { + int32_t pA25312_begin = i1468A2531; + + for (int32_t i1468cage3 = cage32_pos[i1467]; i1468cage3 < cage32_pos[(i1467 + 1)]; i1468cage3++) { + int32_t i1468 = cage32_crd[i1468cage3]; + if (A25312_crd_size <= 
i1468A2531) { + A25312_crd = (int32_t*)realloc(A25312_crd, sizeof(int32_t) * (A25312_crd_size * 2)); + A25312_crd_size *= 2; + } + A25312_crd[i1468A2531] = i1468; + i1468A2531++; + } + + A25312_pos[i1467 + 1] = i1468A2531 - pA25312_begin; + } + + int32_t csA25312 = 0; + for (int32_t pA253120 = 1; pA253120 < 6; pA253120++) { + csA25312 += A25312_pos[pA253120]; + A25312_pos[pA253120] = csA25312; + } + + A2531_vals = (double*)malloc(sizeof(double) * i1468A2531); + + A2531->indices[1][0] = (uint8_t*)(A25312_pos); + A2531->indices[1][1] = (uint8_t*)(A25312_crd); + A2531->vals = (uint8_t*)A2531_vals; + return 0; +} + +int compute(taco_tensor_t *A2531, taco_tensor_t *cage3, taco_tensor_t *A1392, taco_tensor_t *A1451) { + double* restrict A2531_vals = (double*)(A2531->vals); + int* restrict cage32_pos = (int*)(cage3->indices[1][0]); + int* restrict cage32_crd = (int*)(cage3->indices[1][1]); + double* restrict cage3_vals = (double*)(cage3->vals); + int A13921_dimension = (int)(A1392->dimensions[0]); + int A13922_dimension = (int)(A1392->dimensions[1]); + double* restrict A1392_vals = (double*)(A1392->vals); + int A14512_dimension = (int)(A1451->dimensions[1]); + double* restrict A1451_vals = (double*)(A1451->vals); + +// int32_t i1468A2531 = 0; + + #pragma omp parallel for schedule(runtime) + for (int32_t i1467 = 0; i1467 < A13921_dimension; i1467++) { + for (int32_t i1468cage3 = cage32_pos[i1467]; i1468cage3 < cage32_pos[(i1467 + 1)]; i1468cage3++) { + int32_t i1468 = cage32_crd[i1468cage3]; + double ti1469A2531_val = 0.0; + for (int32_t i1469 = 0; i1469 < A14512_dimension; i1469++) { + int32_t i1469A1392 = i1467 * A13922_dimension + i1469; + int32_t i1469A1451 = i1468 * A14512_dimension + i1469; + ti1469A2531_val += (cage3_vals[i1468cage3] * A1392_vals[i1469A1392]) * A1451_vals[i1469A1451]; + } + A2531_vals[i1468cage3] = ti1469A2531_val; + // i1468A2531++; + } + } + return 0; +} +#include 
"/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/csr_dense_dense_sddmm.h" +int _shim_assemble(void** parameterPack) { + return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3])); +} +int _shim_compute(void** parameterPack) { + return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3])); +} diff --git a/test/kernels/sddmm_spmm/csr_dense_dense_sddmm.h b/test/kernels/sddmm_spmm/csr_dense_dense_sddmm.h new file mode 100644 index 000000000..a9d6b760d --- /dev/null +++ b/test/kernels/sddmm_spmm/csr_dense_dense_sddmm.h @@ -0,0 +1,125 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? 
(_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = 
(taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +#ifndef TACO_GENERATED_assemble +#define TACO_GENERATED_assemble +int assemble(taco_tensor_t *A2531, taco_tensor_t *cage3, taco_tensor_t *A1392, taco_tensor_t *A1451); +#endif + +#ifndef TACO_GENERATED_compute +#define TACO_GENERATED_compute +int compute(taco_tensor_t *A2531, taco_tensor_t *cage3, taco_tensor_t *A1392, taco_tensor_t *A1451); +#endif diff --git a/test/kernels/sddmm_spmm/csr_dense_dense_sddmm.so b/test/kernels/sddmm_spmm/csr_dense_dense_sddmm.so new file mode 100755 index 0000000000000000000000000000000000000000..c2c5ca30ea045392ec3b72aaa875d7ea80b5ac1e GIT binary patch literal 14360 zcmeHOeQ;aVmA}tYVi5~T2_|s@A&(%a4Y6WF2sTN;vL)v^t0dmUfzSpNS(a>1ZOOGH z;?PV8>zK`3A)4Jnp`HB*JJXr&w%cZQVPToh#*RZsfbKYaZ|Q8^<)d-~B{2oEB#rlX z?z>l(p5?MTv$KEYCQs*{-#zEt?~ixi@q_+AbBUrb87%A$M%Z<(4+w~Nu}N&LyyY&KMgJ+xJBeQ zTf5jqdq^s(!|PTK8x}n4H*I5!KHqx&_Nuy7N8kJQ>BDFHvInzIj#CztemNd8Q;O%1 zs@N{8DJipC82g-i*jnPAIe`EXA zhd%pewDGaXYyXlt>u4?e+mpX}?PJHl%a7OApZK#VRA&uyP~ZhcQm+Q47QuG|D<=Q_ 
zS@46i;M7hk9&?xgP|W_kS;`rjMZONWipLxl%wlH?1ecqI!M8!)#ja$1s#=hG1Gi&m z4Pv0Y44y(KmtV*RO!8-twt(5$xRZ;qod#k5Ebueb(Fk(xP9je7haDVdHwpV+6ZXyJ zQGt{Em}op2dlYUJxJ%$q8!+lb@|Uq;IUxnUQple%*;&ZtFK1RU?}Rvep5u1LL^-#L zgi`|FDR50H1cRNu@mMgC4D~02K^E+ebthRc3c`XdZ5x9fk^V?$cOn_-Z`;_^6OTpO zLhU^fBdvH!Fgy?<1EHSo2O@0!=8dhvo_M@3*q@3eyL%%>Rz>hwWFVObM`9hpSbSHA zibFc#uLr|jUk^q@-90Sa+sA;!gYC$sfA`i%s6X80jiQh<;l@Zb-XCGTp`M<2m~}*` zf8db-;? z5%nbEo?2qU6kE-Lk&aL@gi^OB5~8+2)W)dWDo>3OsijDurLie^vu9NXANU6m1O-y-_?Tp@2K?A{H(<$~!+HcZR!O{trNC1l`3)kC`10;< zjC?EGE3nj(ZtQd=?0}G$`;Q!^2j`I%g4kyyF84>932znhU8>5}q`c(Eml-(i4Pq(h za@jmQf8RKhhs&an!nr&gvmqBMM834Piz@LnH4mrOmZ35amwOkb)#Tw~N^zCCJY23F zl+cieQ{FOYdAMu~N^8x-#ZiN+Y|q2R(ShST^YAi`7(d$Jhc+!eXXzr9X+y)wlBqFm z@R)5hCoy&FawJV%@5ZllSp#CkcTxJp_!PqRONdv9ALIN2;;CyUhB-fvcxtkV!<;W6 zo|6@J_A%!Cahv6ew%mSP_6!)roW$Dj3y}| z7EN;by;I}UD))|3D@^my?_t(5OJt?DB5wULV!O+TZJWbf3JH*e9bf=`+Od zorprh;JtTy?|Pnk$Ggqjwr#8S(1%Zg(ZaiGv~;ktLrYt)uOvX+!pJDgs>Dv;Ul z-bY|2;2!X$chv>booYb$xw|wy;NFQyot9o-p&_Cn;-W|^A~uRNXz9}0DT-~+-g$9q zYX7ive+a|Dn`uUoKK{tBe~PrqfIi-$|3cG8wX}~iX>-@5f68<9Ie#*yg-_^v+-oU2 z)u;!F!So)t8VK8;OZ%R)dh17=nU_$t!K~ubf9tC{p$!fzlraW03bY}!;s8aomXTjs zHf2`XTEa&ImVkS^mRUR6qMz50Lru%bX^VI8UBy@RA3puxz34OLgr0Pl4}Nfw#Hauj z-i-AYU@q^eZ0%<-zk)`9f9Ruh;2Gw8bQC4)>72a<*gq>{8-ev{lu#M?H4C!O10a_;{gdJxrL z;m@os$&@P_!&6%L)yYPfJCBmwV|_i-s(9<)KsiRCaAI$-FSB;OPk9wmkezIT-~}XF zU(2+X@5OM^`*-99e^EF+1)7;U6)BU5PhHuu}pV+hfw2$ogGWNNC{e*WUyUeeg z@SZx?5*}M)e`4<!C zjj5(L*|N78K#P7%)0--?m0)SmSG4Fa08lZL-XH^6Clb(+EBr%zfl!fHsV7{{M;}D0 zH?w}qkCh{19kTM=FgH$LH+W7-U2%j?zu8m&Wtder=+*~X24A!W>i4)SlFdzN>m6D; z;I=jD*80Y@ZyjsWt!rrjxf;{fx}o7F9a!p_BeZE`)7(nZO;+~itfZdDwuj2TT`)|r zp3k82S3!Xh*r|NRClC8G(n}7ZpPDK)6xb(|EOBJ z(Uz{r4nrTKbofQ%rRK$kawxGQI83k>{Wx_`jdMQBeivdx@3iPgosVwAa@3M(wgxi2 zRxboy>_wlOAAd;Nm1#o1{?@PmzD56B%lL-)f{rd9n;uX8^yUlxHBM*7LMvN1>9)6o zPigvDt!h%stayeBKyy{~R-M#e_UR`B8TWTK>K~k5BIecLC>CpXr8fA2HB-LWTfb;; zwXgoPKkfGHdC#wWr0Fl}c6UqoQ*GpflI+KyW4zeir(c@RU;p_{ZEm~1!;LL*kK08Z 
zs4voJFs#8~ptf^4hwjEa*1~&GHvOnJ@@tEh-r=qZ=;iKolhx8>!;rN!+4Uy7rOBl< zmFrEeYzjG1xhn(da<`Vg3(cLxBBU<~=v&YV%{3SvTJ3Pb=-5YtA+CWoB8NTLwKD5Ap{smp=1Xf<=fXDH z$h=6y`lw~F0XY|9kf&R%Kx=AG`mi_oeCW0YXBAd|J{vJZwc1ym3$X_}tFU)Et1t$# zZ+}LW!_J3Q(ayK~R4Ga|c&uwQ8$en#Z#~n}n{igT+oh#{2w3QZc&_kd*K*ugj zP5l702CMt)pbQP|hW;>U1L!zt8z==n^g)#ewlHPDrCen%wH<_x8yMY<)S)bgL7BtV z?5Nn_EZb$<$L?IR=Ehsx%LyiZOm{X6Ia()V2q3Zp&vR5yI>k0RT;D2bbW}WQS?^FY z)<#F=BXhitnuq6l9d(1H>m6f?<&%~%bh)>5JfM6-C-0^|1!(`E7J%F>)G%LYfEMNu4ez8NKfjY>S!<|^8H zC*oAEM@qboiia&;hdO92X>(L~d9fLzHrNT-HpudOlj(d>zZq@dbL>+rf32Vmnr7sz zQ5(?Ob{O_vNB=&JIP*FBO8$PH!{#mH?ehxK2GHlSPpK`&f$cY8zXPWJZfeSO#X%M? zg-b1PsRb^zz@-+r)B=}U;8F|xpSOU#*OvF#^4=Op5+3M|or2`)4xNJCAi3{2;xkh2 zJI!K|F7MH=5bNnRGgw`M1Jy&IZp4?2=Z=xyEr$?JMqVa zyu2HxGa3at^75dnoCq~`#z1rDZh2?DNZ?}vkhn`kWI2b0oV>f1`Qg5Wm;bORD317? z&*dxlq)5k+nWu~D39J8qM1D6n=lGeZ=v-03g@WE7=xRZm1-(nqsG#=?`mmt#`^WmG zrrXuZZSDA0mQru?-0G>Rz9q%E+P$@Po|;uv2Fg~TL`I_-C6NPlw!`vQA#XIIjSmI; zegOYyt`-@ku2;62?J0kG7jVm7O1zJvJXo!z7b&95SFU$lQD$*m<=D1pzT(0#De)*4 z$L(cn%D%FAzOtaSq2h_g#R^JiS;Y*z<%$xBS>3Fx><(zcN*GTGe@#=A^N!1zf|USJHR4aCl`n9*cH&uI#?0ZZ&lKlhIHj*&9lBdBVoDysSi4!txqZEc2AF zzEpxQXbQgnt4+1FMnkIAz4)rxS=|v&;R9cFT@8R_Di&GS8Hq*uyTf4V@opC*%D9`?|xdcOabTXJ1mE zm4($=$?dO1rK@kN_V>2>YFY)uhlof<5reB&p$M_mKy_yy>P3H_5uDnSxDqaeukl`;j1U&>g}O(*zZ zEEKh15Xki{KgEL8O0H{#xRuHEuMnTZeT*nG=8_M+`_J4^80LlqJ?$|wV3>sOs;E%@(XA9Z&130d)4Gi8gUE1Q7DGX-Lx>x zlG{bixZG{cxZGvU_|if^a29sW44j6sm7Se3CpZC={uQ z2^@SM_3Gh2RLsBQGR~c z25h!=_y}nVbFxRSL?kcFmM@7E%lABRrO147ph1e^%Q-%C+>#%T0uLSYyr|<1J99q% zk+5U-3%Z-*Gv`Y$$7kAkXcqiCz=catUh6PM_o-?DVlc;Mk$<1#c4qc3`UH3}KdUde ze35ktXOUv%d5hyS{j#o}MSczNV&&f;?3n$sh6HZ*%i1)H{iVEKGyTEZXOZ6p+?6+v zWYZ3ce9iu2PtIcJc_BZ;FH7KKCjYL}yq)dLC2$(|6vl{q(E9CnTken@S1iP3nqJmVR9Htz!Rz?9_;Liw}*Oy9XJUk zf}zv^BQL6+NHWslxovfQU2!4qh!za>_lI`Fg(}&<8@^g0_>gs^dV6<6C6D8tYXze5 zXtXCD>X1dlu?ya8u>`dFZ=$24w--eg-dvs_$`g#n;Fc8(HgEB6^auT$eB=@r6o!KL zhfs9#_c-4@o4gxanjneOX3(z*i<)l>Ab8SA+v@|H8@++x z=H}+D{D?yX*bHAllMCK-uJB#Ux%f@{?B!c6^e`mfJ;NDq+6I+e_}>+R`95?-UH7K*E^sf* 
z%YGI=L6c+O3@PC@$lJ51pCiv%`U#5s$hj|A;Qwf<4)?BPp2Y56^eSkw-=JMG7Q@Se z2WNK=Kk-+0cQDR%g%VxN)3G}SO@k)!4ncDFNBR@p@mK*9L|T8OCqxP&*4L9{9-g}g zQBP+aY?2%Z!=_xETT~4^XP%gBlBPc^72;msu?<7f~WCBIFfD=ZaRxc-+AodoI10?Q^3 z@;yXSx4VefKP^TsW0DGB<&E2SWbB`zYf#^rMnoZFW+kq}z~6cR7-m?^BZQG}Lh> zGsjaGum$e7`2JKFB=va zNGk8SAi@LXl^+sd^7Y4szNClqg%NiNUQ&LK0!-#F-wP#`^_6BMFX^8mAwPflCN61* z>HR_GEA3@W`qGbFQjb)$krrbr|DFDk`jWnaq3xncS$_GR(>II0Xbx$PjUz@_O%5?4QoqWCOMOWvkS^o-qTf{w&TQ8#du|r} zlS2PK!%XhitZxNi#z^=p*BUC#IZY#CmZiUYF=HgzEA%_RME@G0Z?5O-Lch#pT(Xi@ zAtK}XS^aXk!QGh;B5sx?-x4!MD%V_RD5?g@{hDQomup8(2$2S%e}Xn=1Qky`9y$Kx m`cLbV9D9i7g6$?F8ZafA^QXVzGKp}-Hrm(`8chl&mi;#j97$dP literal 0 HcmV?d00001 diff --git a/test/kernels/sddmm_spmm/csr_dense_spmm.c b/test/kernels/sddmm_spmm/csr_dense_spmm.c new file mode 100644 index 000000000..7f710f6c1 --- /dev/null +++ b/test/kernels/sddmm_spmm/csr_dense_spmm.c @@ -0,0 +1,190 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? 
(_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = 
(taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +int assemble(taco_tensor_t *A2535, taco_tensor_t *A2531, taco_tensor_t *A1455) { + int A25352_dimension = (int)(A2535->dimensions[1]); + double* restrict A2535_vals = (double*)(A2535->vals); + + A2535_vals = (double*)malloc(sizeof(double) * (5 * A25352_dimension)); + + A2535->vals = (uint8_t*)A2535_vals; + return 0; +} + +int compute(taco_tensor_t *C, taco_tensor_t *A, taco_tensor_t *B) { + int C1_dimension = (int)(C->dimensions[0]); + int C2_dimension = (int)(C->dimensions[1]); + double* restrict C_vals = (double*)(C->vals); + int A1_dimension = (int)(A->dimensions[0]); + int* restrict A2_pos = (int*)(A->indices[1][0]); + int* restrict A2_crd = (int*)(A->indices[1][1]); + double* restrict A_vals = (double*)(A->vals); + int B1_dimension = (int)(B->dimensions[0]); + int B2_dimension = (int)(B->dimensions[1]); + double* restrict B_vals = (double*)(B->vals); + + #pragma omp parallel for schedule(static) + for (int32_t pC = 0; pC < (C1_dimension * C2_dimension); pC++) { + C_vals[pC] = 0.0; + } + + 
#pragma omp parallel for schedule(dynamic, 1) + for (int32_t i0 = 0; i0 < ((A1_dimension + 15) / 16); i0++) { + for (int32_t i1 = 0; i1 < 16; i1++) { + int32_t i = i0 * 16 + i1; + if (i >= A1_dimension) + continue; + + for (int32_t jpos0 = A2_pos[i] / 4; jpos0 < ((A2_pos[(i + 1)] + 3) / 4); jpos0++) { + int32_t jposA = jpos0 * 4; + if (jpos0 * 4 < A2_pos[i] || (jpos0 * 4 + 4) + ((jpos0 * 4 + 4) - jpos0 * 4) >= A2_pos[(i + 1)]) { + for (int32_t k = 0; k < B2_dimension; k++) { + int32_t kC = i * C2_dimension + k; + for (int32_t jpos1 = 0; jpos1 < 4; jpos1++) { + int32_t jposA = jpos0 * 4 + jpos1; + if (jposA < A2_pos[i] || jposA >= A2_pos[(i + 1)]) + continue; + + int32_t j = A2_crd[jposA]; + int32_t kB = j * B2_dimension + k; + C_vals[kC] = C_vals[kC] + A_vals[jposA] * B_vals[kB]; + } + } + } + else { + #pragma clang loop interleave(enable) vectorize(enable) + for (int32_t k = 0; k < B2_dimension; k++) { + int32_t kC = i * C2_dimension + k; + for (int32_t jpos1 = 0; jpos1 < 4; jpos1++) { + int32_t jposA = jpos0 * 4 + jpos1; + int32_t j = A2_crd[jposA]; + int32_t kB = j * B2_dimension + k; + C_vals[kC] = C_vals[kC] + A_vals[jposA] * B_vals[kB]; + } + } + } + } + } + } + return 0; +} +#include "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/csr_dense_spmm.h" +int _shim_assemble(void** parameterPack) { + return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2])); +} +int _shim_compute(void** parameterPack) { + return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2])); +} diff --git a/test/kernels/sddmm_spmm/csr_dense_spmm.h b/test/kernels/sddmm_spmm/csr_dense_spmm.h new file mode 100644 index 000000000..cf0cf205c --- /dev/null +++ b/test/kernels/sddmm_spmm/csr_dense_spmm.h @@ -0,0 +1,125 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if 
_OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, 
int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +#ifndef TACO_GENERATED_assemble +#define TACO_GENERATED_assemble +int assemble(taco_tensor_t *A2535, taco_tensor_t *A2531, taco_tensor_t *A1455); +#endif + +#ifndef TACO_GENERATED_compute +#define TACO_GENERATED_compute +int compute(taco_tensor_t *A2535, taco_tensor_t *A2531, taco_tensor_t *A1455); +#endif diff --git a/test/kernels/sddmm_spmm/csr_dense_spmm.so b/test/kernels/sddmm_spmm/csr_dense_spmm.so new file mode 100755 index 0000000000000000000000000000000000000000..398362532976cb094350fa44b666bd3babbcabb3 GIT binary patch literal 14520 zcmeHOdvIITnZH+35*Z6g2_cwA$xRT&X=24LAy_0Sk}W4!SsC2Kp$3vFvMfKgw(PZ~ z;y@ZqWK(Vxg)B`=c9ufA!|t@Z-ID3j(g!oO^K{Z>aRN!`K)c>41-a#6QZ|?Z(f+=3 z&ylaM?RIDOkDb}s1M+vi-}gA*dEE23_vFEv@Oq0ZOCnjN&q>79U8W;`E~qT$84$lz zDb2&*rP30nJ7-cfEmwY>4hbqr3InK&x%f?dim{*y%Lw|2MaTDXD(r|QJNNav+m zGJ=lqst9V=)g&7z{f4>ROg&srP|~GT_uFI><#%4q*y&VH&@)m 
z%bTrT?7Te$<<((zTL>Rk{5IUNMf%{xNcZJs_S%jA{DAcz7o1!@aPztjDuU!L{K(Bb z{0=Izy{Oz$?66hIkIYeIl3jrxRky0<>|2Mvb?7@k{mo-LQ;x=t{^m!DzxP*14!-hN z4c8sK_&{^j{`gBj$-d{TEB?EazkBIjXYVh*QQ;f@S~E;%O*bRp1v5x#m?$&gJAlok ze`*%|?`OfIv*5p)1>ZLd{-arN>IVfsbGizGnc}a6qGDDCUpb5ZCdih~pnpH~-O`m( zpQ22WhM6CSq?G8eu+1|4MbaZC{p-2@;8nW*cJA$U&;^o18o5#j#Ivq&{e6pcd^vbZ z4}rR+MbqM0&-Ls0b0o(5y#nWPex$pB4=T4vYAE6WCtJAwc9S0pV}W$J)WvaOm1DR= z8s+hC=7K!O{XBkE7(^m19i53tPcqt_j6|eJTcRx~MVdjRNT|Lx(iHEGx3u*n6{I^rEY@uU>% z=#pUH8EHgN-8(kNqusIAKr^Z|9j=NucXr36j%a&(XH05}Q*%U^aWbCh>FkaqrRMH9 zuU1#I8+!3}Dbmy0))9&J^u#+@<1jXcC{xL})ZW(Ef~>cv(_2AIjB=}`NW3YUjG}Ih zJw3clBB-<80;{~`dZvOh;ZRj|meaAh;N(B?A9cM9 z{TLg&lH>J)FmCuzP6PSm1oAdZadM0qO4%g_#L`!H+K1aax0@T|~+maK6Md(YOJ3 z2xZU-1MW27CA?f(LwJ>#R59SR)`(PUz|nApRBph<+(N=i11{zY!u9z4Y=qM(r-23e2OGk>N4Qy@Iq44Z`sw%oPkA(qz()wE%^~OebjzL z%HQ%Bp!|*d@z=H3j~wx>WH&aNN4oKA#LL8wFn%}j)P-Y%jK80FYTB_wjPE9%nsn?S zY{km|mq-NI@!NC?aJ4Xgp zeG{to+vL?~s!i}iO?COU{OF{~wj2+@|L~aYk<4`l}?g5_r6snk0@#pxQ&x4oA z92JwKKOBL}psv1ctQi_QANWGxuAflv1-1n0e@dAxo7IQjYCulSEDmQnl_oV~yRj4k zRqOYtz$*sT%m#NjyT{`P7tZ#3XcM4nPk0VND-$UVX6`EwXIe^BE$G<}T{}ciLloBb zcuL6xjud9n?{R}>Hmt~YT@UbkP)ZNFT%qxb{Lm$v8oQW#?UQ`If6&$M#0Usv*XM_R zV}rNrUSJVU0aR61J zL~*EDTbe34%G~BCJB}Z$Be2-M6@PkF}nv&`0?*|K%kfkD_+YFj;PU7?X|_1vL9 zTGB_XuF#R835)NrE4ve>+3UUmLuT19G-`oCy&6Ml0qq?%_L7=;mPH@t;fAvd&r>1W z1KO)<)-gAnbv&+S7ak)Qq3o*RP34!M+TdBwQ4%FRb!v8%UA5{> z2oJ9SRx{7D)~n5W#vy(I5cR(Lw09)WDN-4{fQ?Y#=zz}v39x7(d-47wyV*%|2a0iLg4hPZn?cuELcqn!x ztl2!segcUUihY8XKBgdxYB4CBY4?=U_Ubwzz%tMO}od>!f}25aM7tH=l&vFe}WW9+YbfRSMMod zvQRdhcQ3g>E$P>ttG=Yi4W;8n@AaQ@^}EpCFneJ7hLW|;f&35A>eY?N(7Z4d$+@L!PRj3f@}UF63iv2hDqJc zuF_%1PC@1*S&dYiT``XoEP#gqlQw)2WEYT{ad=deO=FA1t333aP^S>*MHHCgpZD}} zU@6tT7oCaX7GA{fZHi1m=JhaYMZ*&LS3~lMDxX*7bJVkPJ}h4f%O6$@R-Bn!51QfL zA*Bz?&91xu4NPhM9GEto4!D*r1m?EQgWwod3g{`IVUoLTa{(U(Gz2KfAP2el#UzCJ z$c-+myYC4EPEVchdG=g}HAL#_r-Ll2NY5SQG{}P)OcvqH_w;G|S$%AyHU}ZgI_3qk z3-KJBRAXnTNgOX?0G$QS+8&J;JBdbGz!$Jy<7pYS&xb7|zsB7;oghFx1C$LP$&R`&(MNH?J`gt{W4k64ZmxbX$ 
zoZEIjsGX7zW3FZ8A(}Jl0Yf}Rx)K?RF+yRx9Vf8VQ3PU`$j$o#=_9sK`lzC&kGO+t zwtDJa1KLOVyqd;DE~6ozMuaLFvRvNjX;sq`Zr8y4Kj!mkeZ3(KTB~ZcOK5vqf#m1e zN)gcBhRSzf><(wJN~qvQ!TGT`(6Z%v^t&IT6wKkYVA4>e1v9tzB3lgy+z{se0}L%j zDkM$n%{I13y9Pduy_fpXQCh+8-4@sy*cSL=AcAc`KiuonVkuUCbjDhY>B=z8j5L@V zf>>?TpzoIf*ONm*->+Ry4F-yaXwLLu%sn~iJFXt6&92%V$_9@~uBQ$Mibhn-bu@|k zhWxH4PmlueqVt%WbS*ZC(X2`{mECvT?|SOEZ%925$~tz}X6ui!hAU-HlIzLCMdwtS zD_rWn(%e454@b?b}NW7I(n5*}-)45pR@GR~;4`PgoFU>evg|uCVxN!cb{o zU{_s_MHEAuRW!70TwV4-XxV$zw>fkO-=AS5+tu`8j6C9Lkg6tB--Xn-SeRqW5n;vP*aQY~F7iH)c5;5g?oa!O4gcKmJ=DuR6AVHRNMdNgHL>?zcL4pFS_ZOVfnN?>>K*Z@@2#5qKb>@oJ>0kJ8Gq7 ztaN_3cbl4BJ3Mw1E2j@Qc1?uRW>g=KnR(0S9@#fpzBYo#+UIzD=hEK?X(t`Tj8mfx z2ZnNsYvkd;>4{KmWR2tTT~`IQZs@jooc7?ui!&6Zb*51uk zLJCTc3wD&%_S{W65YmoKX;S$5Uk0qvQe|$A4;FFg?&3cCCT;G8QbUR z(Xdx(HeXdHxK66pY-?#OyQ?y`%7MXZ4Or@jKc|k)J;1yqnU`G!FR7oRqfyycrc6P2 zt^@U3HfOMP`DkEp*9|oluUEXS@8i@=b!qyTyfL#@!E|fS?SC(y_bq=gr)C}~&3zRN zGWOgqJyViP=@}*0P8sdrshF|t$Zw|e2Ha=-w(QzI+Q$vOY0aHuTGizhgZHB6jOR1f zf9O2)Tly`mh6$Wp8p{o5o}j0J9eC^+d#2R}@8vkso%@(drS8hCw+A!jAb7<6U~!NG zp|(~tRrZQADrQx;n%(MAf|*{Jvo8KMCc4s!lUSHnuzt(kh4}v9+65s!q;}ULv^9!} zplvHol9ON4hS-m#AAYbctqWpog_%VO3{*0(le9cnOjSkiHDF0_uq7ciAAQk^s!y&4 zx%hZtp8A4Y&D@8`Zy(zZP|Zy8hlr>50%>{^fVJyd>zZCrdav^SyZJnAhL1vrP6IE4 zK7p0zWfcE1_}4MNc(4T3qorsmY65KsJqp?fx)9UQe$e2he0~)4X;25*gO(D-yqW}k z9rO{_qye49rNt_VCMmL9exdnhqhso)4kqVve8w1pS@4I{hBqOx!JRn zV6w;XlO)7R<5r|FGF$PR04~#8gDR){QA?GxWUqCDQ_0$@oTYo_tap|_JU8I1OwZfk z+-J2Oo_CqEGTDw?!fxc)xuu z*B4{$8qSON*)-QsqH#{?KQHAwiH`H~3ePXkhPwf%o!^h;xwZiFe1MJWRJNemmF4bjx^AE~Z7spW+q4 z!GrO%rTE`S;=8%I?1y>9zRBr7aQY)ok8^sC(>FN1$mv|(F^f3;G^eXM?dAQkp}P7u zrF2VUDv?Ynw|Z~!mY3a}VqC?pib`+!s%1JVtw4l&BkEls2JUQ=(>>7F8UX5%ZSdpO49Mr6&9%3JW_G-=U@phG5joS8I|RaoR>>75;@2J zUhhi%|DNvHO8R0Ji>-`xCYsw?R<_+-xf-_J$>wNJvLl*o^~Utaw8$bVVSP!Gt(VF7 zoU-7%myG)WN_9nr-jGUJ2fkXilr?px@M*5BvK&A%m58rvi6`RSZ85O)IdENXh4U&W692LfL(1dsiQa6(=Gj( z`mCHsY+*0P$8x?p)7XV?(oVuTo3y$T^DQoUs@i4<=q3Ew#JS+pk^p~jI? 
ziJrEWM7&99OC+T&eE4nSv`G5vr|K6=e%_i>aYx~P(Nui;_j{6Cu|pR3Z>H+IqzSR$7z9@AXvF$sfZ=Dp zB-Wj&_yXJ~Qj8GGTX7Rcd`C43t+e}9X41b>67$+r{pt51NNz2xHb!m2#cE>4XQ(Xm zBUV+jzF2+DxL9?~_@|}#+obpMW`PTU1@}e7lOm>P@;sA%DcA2Zjg!xC z{E!J>$MM2K51!Iy{A9zu6Y-ceGJJZ!YrrW^u@3^v61_=~XBa=v0-q^*g~zYLtU z1K*IavkivBe+@RMbiYdBkNi>|YV|Do@mcU4v)~72 z!G8#RruI3?^rzqV`W?s3_q~Q^@n6c?ZTda4*PuVs{jePOZ@z!_3CGR%w-)ktGv6mW z%lZ-X4{uL)UPhbV!1c}dz19GqsU5a3ecl-c(gA#_ffreg>Cdn}PTkl{-I_qwlT0-? zV*@vNtt^u4h{Wh(Sr6_)HFZW>+B+Mg?U5$zsCpvNRIfz$sM_Pnc$4?m)xOG^mDq)~ zNVK~u2j@AXaEP%S2{CNB;eOhibA8-s?Jrrv?1`-1ufz^O$TujpA zZt>z6;bn)9vbH9PajnswR>_OYny}Pq62}?R`*OUyr>!$Fg^57c9dD140nc@{CnYZ{ z-HWWZr4wwD?sj@v*L&lw{Bdk;g0bLq6aBf=HASA^8%435N%DxG=;EXo_a-GTp2QA3 z@WA4YsTP<-6D^oYINO$J?iBfGV`F#x%YwrpDlYP95Rs)zpWK#v5)Tr4St z_fvdXk^I2Ox3CxIF+s)qD5$VRK8#5PL+^%!y*RfCdK*_3@e4gcw?IzsnglP-cY=0t zdy1cQ%=M@C5i(L0=RiRR2}Ba*3wu%j1Y~4iMM_BGeVL#NuQ1tTmscoX*!Kb>mBRZx zets19y~TStsIWw38B-4!qrEtP3OZs`Mqa#s6qLQ!0WQiH=Tt#OeT5sr3;ISMV&7e$q99Jq_TM(yi*vM~mBKN%7xfly z{|Om-g(b#^IB$!8w_>iZuov_>*isV-dvPwe^P{HFg9=N67xV~F>N{aC&hO&i1&vVS zB8hqnI}z7z$Vf%li*x?yVzQDXh5eoUBkTqJSBQ-Eh4%s6kjA$$2|GdQgR{|Iyf^4o zbSt4Je3gqiF4I=3%2 z83@~ literal 0 HcmV?d00001 diff --git a/test/kernels/sddmm_spmm/fused_kernel.c b/test/kernels/sddmm_spmm/fused_kernel.c new file mode 100644 index 000000000..1572bce5a --- /dev/null +++ b/test/kernels/sddmm_spmm/fused_kernel.c @@ -0,0 +1,183 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? 
(_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = 
(taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +int assemble(taco_tensor_t *A1459, taco_tensor_t *cage3, taco_tensor_t *A1392, taco_tensor_t *A1451, taco_tensor_t *A1455) { + int A14592_dimension = (int)(A1459->dimensions[1]); + double* restrict A1459_vals = (double*)(A1459->vals); + + A1459_vals = (double*)malloc(sizeof(double) * (5 * A14592_dimension)); + + A1459->vals = (uint8_t*)A1459_vals; + return 0; +} + +int compute(taco_tensor_t *A1459, taco_tensor_t *B, taco_tensor_t *A1392, taco_tensor_t *A1451, taco_tensor_t *A1455) { + int A14591_dimension = (int)(A1459->dimensions[0]); + int A14592_dimension = (int)(A1459->dimensions[1]); + double* restrict A1459_vals = (double*)(A1459->vals); + int B1_dimension = (int)(B->dimensions[0]); + int* restrict B2_pos = (int*)(B->indices[1][0]); + int* restrict B2_crd = (int*)(B->indices[1][1]); + double* restrict B_vals = (double*)(B->vals); + int A13921_dimension = (int)(A1392->dimensions[0]); + int A13922_dimension = (int)(A1392->dimensions[1]); + double* restrict A1392_vals = (double*)(A1392->vals); + 
int A14511_dimension = (int)(A1451->dimensions[0]); + int A14512_dimension = (int)(A1451->dimensions[1]); + double* restrict A1451_vals = (double*)(A1451->vals); + int A14551_dimension = (int)(A1455->dimensions[0]); + int A14552_dimension = (int)(A1455->dimensions[1]); + double* restrict A1455_vals = (double*)(A1455->vals); + + #pragma omp parallel for schedule(static) + for (int32_t pA1459 = 0; pA1459 < (A14591_dimension * A14592_dimension); pA1459++) { + A1459_vals[pA1459] = 0.0; + } + + + #pragma omp parallel for schedule(runtime) + for (int32_t i0 = 0; i0 < ((A13921_dimension + 15) / 16); i0++) { + + for (int32_t i1 = 0; i1 < 16; i1++) { + int32_t i1467 = i0 * 16 + i1; + if (i1467 >= A13921_dimension) + continue; + + for (int32_t i1468B = B2_pos[i1467]; i1468B < B2_pos[(i1467 + 1)]; i1468B++) { + int32_t i1468 = B2_crd[i1468B]; + double tA1459_val = 0.0; + for (int32_t i1469 = 0; i1469 < A14512_dimension; i1469++) { + int32_t i1469A1392 = i1467 * A13922_dimension + i1469; + int32_t i1469A1451 = i1468 * A14512_dimension + i1469; + tA1459_val += (B_vals[i1468B] * A1392_vals[i1469A1392]) * A1451_vals[i1469A1451]; + } + for (int32_t i1470 = 0; i1470 < A14552_dimension; i1470++) { + int32_t i1470A1459 = i1467 * A14592_dimension + i1470; + int32_t i1470A1455 = i1468 * A14552_dimension + i1470; + A1459_vals[i1470A1459] = A1459_vals[i1470A1459] + tA1459_val * A1455_vals[i1470A1455]; + } + } + } + } + return 0; +} +#include "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/fused_kernel.h" +int _shim_assemble(void** parameterPack) { + return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3]), (taco_tensor_t*)(parameterPack[4])); +} +int _shim_compute(void** parameterPack) { + return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3]), 
(taco_tensor_t*)(parameterPack[4])); +} diff --git a/test/kernels/sddmm_spmm/fused_kernel.h b/test/kernels/sddmm_spmm/fused_kernel.h new file mode 100644 index 000000000..e67e5a761 --- /dev/null +++ b/test/kernels/sddmm_spmm/fused_kernel.h @@ -0,0 +1,125 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; 
// always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +#ifndef TACO_GENERATED_assemble +#define TACO_GENERATED_assemble +int assemble(taco_tensor_t *A1459, taco_tensor_t *cage3, taco_tensor_t *A1392, taco_tensor_t *A1451, taco_tensor_t *A1455); +#endif + +#ifndef TACO_GENERATED_compute +#define TACO_GENERATED_compute +int compute(taco_tensor_t *A1459, taco_tensor_t *cage3, taco_tensor_t *A1392, taco_tensor_t *A1451, taco_tensor_t *A1455); +#endif diff --git a/test/kernels/sddmm_spmm/fused_kernel.so b/test/kernels/sddmm_spmm/fused_kernel.so new file mode 100755 
index 0000000000000000000000000000000000000000..10619e0ca4f31ef30f7d2630d603c67321ada156 GIT binary patch literal 14512 zcmeHOeRNyJm7ixRjuIO^C757}OGFS|r^Jd~La-qzk}W6CQ~@_}+VGLevMkwNY{}IJ zIMhu{6;mFHXj3UDAe^Z&FFqtgu{fxNwr3T^=1@-kJ1L9$g zY%#u9v8%c4qPe1Zx&2odkfe+`IY4!k<8$HNoCS6AjHC~h8Tfudr5>@Q=eWu!cUfSIIH?}N~zcK%P3LkMma z^)2KVoA8IEqC32ARj^^fXY*~_+1tyX)bG7ArXD%F_{(p8aq)LQaPfF2RYBz{K4fMw zJ_ntOZApDuMWwY_d1#SSp|VTypI@BR8S!>P)yfxVH3D!%^FBL|=V z=#H9$=RX!{ek}as?`B`Kw^w}m+28*7Rr}EQKikkW@o)sH3#RMQ;N>MsYk)aR;Jbj8 zl0UHk{!a_wKVJafzX1NN1#t4W6CZQBe1U!iAy}&Yjli8|Bj6hsu(K6qt4he<4|xZ> zjEy**MWudjr;?$FC#jp|@+;UwCi$y`{6U&TNS*wfu`{5{StUCpIw7BRr;s0{$%1qp zcuEg|I@pSN?b#&c_lxJK+7M@V3A|pklXyz>JS=BR*p4Nfo&DM_=P}WMiWJ+eATL;7-wgO)3NeJp-|5Adw8llYsyW^hNuUED!-gB-q!_VgrMLo^Ub{O$`K+z436cJ28tHJQcBl@IWG* zWTAmU2K`u|3yq5J+7=GRL%rSzx-=hd4o70~FdGQ=_s2r4J525faOGq;nux^%NfwER zMYjfnama=HSs>BdHxLLW65#>v9L|Q&%2YDU`unzWSUh^IpC}*FMn2UHTJ0SSUTH-D2@4-`2P{*JsBfhGy?-GWVd_`cXclBW= zm$7dMdAZNY{mD~7T1a9)lepZ6Z6=%v`5~v1t4VpuPpvd?+Jne1GUW>J!bfYS0LQ8^ zn=TaK_F0s%YEdukA)-sX)LDR&?`5hjz-eS;sxQFh+CpWG1-RTB3HKD>h5M^kfK!`f zYA?X0kEm=%0WOX@T&1%BmqS7Yy#@H?T+;X&EWk0~vq?+8V$(8VzN-t<%+>Dq^hSBRhF{3nQ~A)FrL{Qbm}Yo`x!K2AKj zbowCY|CV@i<@A2ecM(snnjYc&oy3!irUyB{m3VT^bSLL;C7!0nbUWvpi6<9LdpLg+ z@#LE6dd`3IGAE1BHp-Lwjk{fMKj3%0?HR#A;~O2Ww~s{J>dL)F;Yy}uH!gvJ?OJw` z0;)A#(DdIVuYjwz!VbCWlAZafIhCD9$V+oP^q(cc>h+yya~TC1fuzFdXec4|+hmf3hG@Z7u6#hiw(V~air zULkdKjIq~`z@*O*-#HzDgu#11?ETR9X!N|>y&c=PX%DG9lQ#rz5+BorjkI83#e+_OhaDpciF}-TJ;0213-QHQ?7$l3aJsI*?le>RT(<( zhm8xOXa+QQ{?GaR-Z6D=1oOz7^|{cwSAXUmf7v?zYb#pQo^8#OxhcrP`WYfVS5nJb zpP`9Fy?Y$gug`4Jf8VA*lR02?F0+@{nAyi01kD#o8l}`~C{2QlPg%9lB$zKyS+?d; z^@#}$q7z^tc@P90cpgN3;&}}gxl7Vx)Xkp&rOL-ZefrcE{S_}N-^-=7<33k4aB68K zFp@>KkxYd2^dGJMzV3Hz(3Bak{sKfaf7W_j%Wn9o1BTMSzNoz5JvpuENtdTB zYrREFzv_Um9zt0r>9VzD*E|OuKUG%G%aN(YLu1j$VLbh&0^5UEjII7rdaO);9ag#_ z(4kMHU%IF%hqdu3i}ISL9N&_ydG6#{zmg|IJ0RAHO7>At^bM}o(cgJ9OVvl7gj-tm zSGA1KN)j}6ZlMP zaK9&_*8U39?^!Lg!4*-f&NQ5A%dR`xFlIb&5!GQWPajqyUiA(u)6!!OZ_~+$ddJa- 
zx?|$LUq;l}I5i6c;%^#uX%RK?yp4*^sXKnC?)_JC&0gstJoOkiZ?By=8) zvA%(J-C_Mnwq5Zy{S>V_4AqI<_rjx<%4whSCx}73NV2R@(od4e%9tH zlFU|>`;-gX7lh5{c{SiH3ZQ0~`ht`yI z)waQmSAF!jc5{cT-lINx%xil?-TMn@rSmrR!PBIkbR{9$p?C%(eugW=tmhU z8%JfW?3}u+mC+#Vz#yEY0cbeYa2|tj%CF4K`QMQ~roiQgF$O!Vm#V9+muu<6j;7b| z`;~BgH(cMDyc3?LX@YiRT{9jI!`B$l=dx8wdQ55Zxop&POa<#{(T;L7qkMM_M3o8c z3*?VVj16W%)pKa&Qj{)jOCPpsX-qYnA02k8qbV$41XL?nY!0GQnuyhkAJ2Q*cz~^9 z6#`bNr7t+t(Is!^^C+Bzv_ev}DtORl0w6p^#8~Wvuujl=rhe+- zw@`WJNn=SxM`wWg^c)&m3G+@$s2{`Xb-Fj4hg;wCT^mdtx1==M2byar4;k8pftKKt7jz!>A8iochB;_J$f?4LUi=#yILbW`Q$hgY5TQU86} z%JNoy!aJT@*{V!Gbq=pSL*~6$gvtEO&6VHYT@5?>>D9;57nMCP!rbhU$EP12 zeo3E%QKLROA9(dVoerM%pnv4QAGYbI;lf5NrEC0HQysa;`Fy@D^qfy`)bu0%?3&MV zSLhwCYQO$!PD3duEx$HRZ?WZWG=Mh!DA?*;Em$oBVQjqsoU}oZft-p0Jm_m$NBP?A zL}9I-aHx;$L8&*pIp5ll$67LK1xwbe&t8+hprkH+nzonR$v+xqoww@N``gmTt+b+6 zCpWcZtnb${ewVFTw>C9rd>dJdZr#9F{^pFeadfOj2bTKxr)^{uG;WM%iPmDKkz z(CF;*1(RH*^A#PUGK(?CM#pwv)7o&R;djQmqh(rZ)6XckW;Qt6Fhz1t{vn@lx^7QS z%Y3Xh_b*_OvE}~3$W-S(Yh;|ceU#CELDfu;AioU}GBU&i0jlXQky~L$j z>Kn%HGMzv9_{pOkJ@)?0N9&8ub+p~`bDd4g_PgqR8CzzD6}8mk==vi(Mw)KL6S}EZ z%Xn~%Tavcudi{bD~PVcCK~m-YVqCST7x#-iUDbOR$UGG zh}w0c;S?%@4(66$?{;;nD=;A;e7CE{r(>fgK@Tr?l0h8{7?5^ISYf#fPip@-78zG> z?&Rw<3J3V^iWRMfpL|nQSk1_9aP8cBmb(_Re7xzTtYtn7C)@|`Y({TynXcxA#`9CE zdBQUL8X}Ng_lf(! 
zfcAs>F;XL-2S6VK{Q;K5DbO`o{3@Y;Hz@6!PvPW01o|TAKF})Iq@>)xl_^6G<;u#% zw*Am^0jr?`w1+knW_N6|SKq2u+-DnMx4i4-_g?Q>MKI}O+Ox+YN6(8)eq`>(=OEhu z3f<1`d;NpuUVCGD@n-u}nWeRSsl5@H*IwIfcS06A z&34-d=<-&N&)4zsVDISxWnR1E!7{JCI!*SD#R+rMeH{kPf&EsA1q^v!EMOw{7j3tMMk zs}p(h#Xf5LgBGvdnYNa7@D`)1cu#?18oz3cBX=-*l`iX~B1S$vgt9d#=Em{x!hPO@u&PtN4?;uR#p@1%D;@Qw%G z@xcGN2ju%^`F>fxPo^cB61@wjBzbzbO{vl#v%ky8cb@Wh7x8oN?C-f(3VFUZBL6PI z%lFxQPXtEmF{M9V%*Tk%i27#t8Ih6ie0N~&q$J&0@<`4gfeZwlTj?EQZe z`MbHf-Umg;J}2lu3;HcVj|qBO(3b=~FKD?Km=%J)N6__x%HJtAx3t{ktli#)-@;PP z8{Id!>+7yhajs!^L!-NX-D(46YtTZ&iN+AfiM!C|c_-uzr_xab6L&AZzDYS#Dw_eI zY%|+a4txl>Wj7Umgt9zYt&1;GMp>p@YrnL@V!zV9{qkjs1JkC=tyt_gRoq=3+*6dd?3XYFiHzgF+r8HKpNNOn z(l2JA(ArQe8tLm<+jo8Adg#WJkzgV@5KQ*EL&jseq)c|g@?)l0mMV9iD8uhw3gQ8r zEe#EZBb{{v_|>wft~-{(PjhvR^#GEoXn12!I2w-kg}~C!fg6V!*VnCI=dA0I|8IYv zvrc{@U8%nQ?mn0?n&zyFL}PXQN6EUucq|zXC1Y`bgMA@3FceC}*e~gwr-&#<69enlp$XB{P+dw^Pt1?9<7|tV=y3SAR)DAyHI@A`C`RlKz!4MTLKrv0#`c#JW=`Y{4Xu>s>K! zVRD@-#;wdD7si4D3x02u>q-HPUn$F(T#t(J#Z0bi#kh^h^{*J8|9d=@Td)cbis!yq zzJhs#H;eJg+4!PjeE#qERBpjdPR8F9%d4ziF4zSH7M!l-@2dqc>?~t)-zmnIBc8}v z2=TlHkudUi)WSjwo$8&X+OyQ&$NyRR9S zyRI33cQGJ13%hC_PSe=RUdx*cQ~+f-CCDd5F3)7Slzgp_A2iLA_X_-w3EwF2*^M4N zrET~q1#u_xG3(;^{BhTTQ#<832rN(ZrcEg^f1U(hs{Qo7p%i`!IQN4%knlS@urmVp zhE&v1=EQuf_qD;;YPYTHXcZw=aMv7r;Layp$gf za{2jjUB?7&j_aa#W~KW39`2v{@nqkDe5p9Fqr(0{3_Ybz{>|763)p{4$d8H2={cvg zT=>l#Pxc2fj^=o>tA)Hd-m3|?lUpqQ+Q#LJG2-q4zG^d@U|9mOP~EXWPk*c{*dOS|i7F8YriK{BQT2zD;coYh>zf)&3-N%p zKrkK;?m`e%GQJB@T0z8`b*Bafc0r|p;}LE}qOnM%KZX!8(X>b^5$+D$6OKp2{b;EO z<8lYkj6fuc7%nqXj|w(z^=@emwBF{U7`lM47`QixK2T8C927}KLlWyw!~(s+Xg9^v z`R=&QyQQrKk~qEwS~X!&^KAu$&^u{+v;X#HuRn16rcK*gI|3cvW`C=xMue;JAhg+5 zP?Z9t*fozw29!}4*iZA=JjXcFY9@7PLZESP%_ZP-Ag?BgFz5!F|P56o%oIjQX2vLs1-KK{FNz zF*L;n&XW=mhkP7L22}EzWvHR3kKzLZMS+&4?(i^9=1%Myz-$0b#tqslb5Yz{+_*%z z`PHMYubXkMH<;*U?(SVtXc{z$%M8ig8;&RXV$mWdfU4D_I5*Aa)yfWR2q^p56i&;T&u_uSwy2KY|@RWN#@4P8^GfZEZmjq zfl4sigLOo(ebGov=7U{b@$kKp!xbtl^KgjFQpl&!6_!e@eE&O$PNei|1j{;4@;)S~ 
zOW=|iS9r-W=0+C3V#cJtyf;bOEkv;#^CaKb2o`_2V7w!e`tp7yDeYU7s9tO(v+t$k zdB6jVY)gH450g~Bhk^)CWTP-?VCbEY)R*@)NpBLuvi(v{((NdxcTAF(_c}=jg+8^P zWX%01|Hv{DmG?hM#|T7{^-F!(|0v2xznO%Pm9|A@qv+wc5y-~(_ zn{Px^S7AzkDb$zuPDv*Vg^`!<9VO-ObAZeG<^5Dr*s?dyb?-@&&0d zAnTR(vL=1`-c(ZaHQL<&M}@u|e|g@PbXZm-s9FCXP5ScwEU8Bt7W%T^((Koep;uRO ze#m>a{CgF1f2F>p--a&T5v9JokK4o*Q_4YvC&^2C1SpN2)R*@V`S(DRG&o4I-%?Mu z>k}v=5vecl_fsoLi!mqvp8JyelKucih5EDa0fZvW@4_VYB&9%%LVbC^8gd$1Qcl`1 z^FKoYVY2=5UNz!0G|ci;S7Dk0Ls)$wh`iKaZNjC#q|+#u`9jgJIt|Wj*DQNs0sR+* z{znWmv)^WY;~qO(Je#@kY9qU87BMnr{pA467)gFy=*#=KG-H;fzmp-%+|M&Yp91p= zlVl}bjf~6}%DuYE;BF}dkvGe(TR`7^jiKoLEBa*5^oVnfcA@VO6Nv24UL@yJHJ4)e mi=4T+Cm>EbuQRx;oS34}-2P?2O(I +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int 
midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +int assemble(taco_tensor_t *A2531, taco_tensor_t *cage3, taco_tensor_t *A1392, taco_tensor_t *A1451) { + int* restrict A25312_pos = 
(int*)(A2531->indices[1][0]); + int* restrict A25312_crd = (int*)(A2531->indices[1][1]); + double* restrict A2531_vals = (double*)(A2531->vals); + int* restrict cage32_pos = (int*)(cage3->indices[1][0]); + int* restrict cage32_crd = (int*)(cage3->indices[1][1]); + int A13921_dimension = (int)(A1392->dimensions[0]); + + A25312_pos = (int32_t*)malloc(sizeof(int32_t) * 6); + A25312_pos[0] = 0; + for (int32_t pA25312 = 1; pA25312 < 6; pA25312++) { + A25312_pos[pA25312] = 0; + } + int32_t A25312_crd_size = 1048576; + A25312_crd = (int32_t*)malloc(sizeof(int32_t) * A25312_crd_size); + int32_t i1468A2531 = 0; + + for (int32_t i1467 = 0; i1467 < A13921_dimension; i1467++) { + int32_t pA25312_begin = i1468A2531; + + for (int32_t i1468cage3 = cage32_pos[i1467]; i1468cage3 < cage32_pos[(i1467 + 1)]; i1468cage3++) { + int32_t i1468 = cage32_crd[i1468cage3]; + if (A25312_crd_size <= i1468A2531) { + A25312_crd = (int32_t*)realloc(A25312_crd, sizeof(int32_t) * (A25312_crd_size * 2)); + A25312_crd_size *= 2; + } + A25312_crd[i1468A2531] = i1468; + i1468A2531++; + } + + A25312_pos[i1467 + 1] = i1468A2531 - pA25312_begin; + } + + int32_t csA25312 = 0; + for (int32_t pA253120 = 1; pA253120 < 6; pA253120++) { + csA25312 += A25312_pos[pA253120]; + A25312_pos[pA253120] = csA25312; + } + + A2531_vals = (double*)malloc(sizeof(double) * i1468A2531); + + A2531->indices[1][0] = (uint8_t*)(A25312_pos); + A2531->indices[1][1] = (uint8_t*)(A25312_crd); + A2531->vals = (uint8_t*)A2531_vals; + return 0; +} + +int compute(taco_tensor_t *A, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D) { + + int A1_dimension = (int)(A->dimensions[0]); + double* restrict A_vals = (double*)(A->vals); + int B1_dimension = (int)(B->dimensions[0]); + int* restrict B2_pos = (int*)(B->indices[1][0]); + int* restrict B2_crd = (int*)(B->indices[1][1]); + double* restrict B_vals = (double*)(B->vals); + int C1_dimension = (int)(C->dimensions[0]); + int C2_dimension = (int)(C->dimensions[1]); + double* restrict C_vals 
= (double*)(C->vals); + int D1_dimension = (int)(D->dimensions[0]); + int D2_dimension = (int)(D->dimensions[1]); + double* restrict D_vals = (double*)(D->vals); + + int32_t jA = 0; + + #pragma omp parallel for schedule(runtime) + for (int32_t i0 = 0; i0 < ((C1_dimension + 15) / 16); i0++) { + for (int32_t i1 = 0; i1 < 16; i1++) { + int32_t i = i0 * 16 + i1; + if (i >= C1_dimension) + continue; + + for (int32_t jB = B2_pos[i]; jB < B2_pos[(i + 1)]; jB++) { + int32_t j = B2_crd[jB]; + double tkA_val = 0.0; + for (int32_t k = 0; k < D2_dimension; k++) { + int32_t kC = i * C2_dimension + k; + int32_t kD = j * D2_dimension + k; + tkA_val += (B_vals[jB] * C_vals[kC]) * D_vals[kD]; + } + A_vals[jB] = tkA_val; + // jA++; + } + } + } + return 0; + +} +#include "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/sddmm_ryan.h" +int _shim_assemble(void** parameterPack) { + return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3])); +} +int _shim_compute(void** parameterPack) { + return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3])); +} diff --git a/test/kernels/sddmm_spmm/sddmm_ryan.h b/test/kernels/sddmm_spmm/sddmm_ryan.h new file mode 100644 index 000000000..f0f9e372a --- /dev/null +++ b/test/kernels/sddmm_spmm/sddmm_ryan.h @@ -0,0 +1,125 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? 
(_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = 
(taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +#ifndef TACO_GENERATED_assemble +#define TACO_GENERATED_assemble +int assemble(taco_tensor_t *A2531, taco_tensor_t *cage3, taco_tensor_t *A1392, taco_tensor_t *A1451); +#endif + +#ifndef TACO_GENERATED_compute +#define TACO_GENERATED_compute +int compute(taco_tensor_t *A, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D); +#endif diff --git a/test/kernels/sddmm_spmm/sddmm_ryan.so b/test/kernels/sddmm_spmm/sddmm_ryan.so new file mode 100755 index 0000000000000000000000000000000000000000..c3deae084b90146089e3f8a64d8e5d1ad2488095 GIT binary patch literal 14352 zcmeHOeQ;aVmA}tYViBu)N(hMqB@yA#+660i3BhiXV%d`Olu^J12SN)~WLdI3wIx@Q z!J(NXtP+;DLXhp!h5hJmx-*?&ce`z-OBc#?7dr{T0lH%-v`p!4-9kZbpoCCpHqdB) z=e~R8r)RtD>>vBbPHysa?)lww&i(#)_Z{yIMY?Jfg~{MxpJl`yy1+!*MZ&>#A_CIR zT3IcA*RspF?4sGEIWg}F6OxoM9|tIp#dywK%2`k!k4QRJW8(V+m3qXIo_DpG?p^KU z5lN>+RU~zpV$2Cj&!kAVQ%%T8O0pEXf0gT__`wS~JBQ{>m`N)2J_kK2=l?W#h2V`M zzlGYxCE7z$Q5{~l<*?zvv*r5jY4xGJ048@==;TYJl%DFcjDJSdfPMh?AKb_CjTS})dj;M6nNP@sUHXCn+M+o 
zY(DvKE`T3g0H=2H;jxDq0Q1>jvOqaw3&^(u_u;XJWeeE35rS*&!r*%#?`0RU5udLr z^%LBVn~jTs@+^1?{ak(p8?(uuLfSIsW;=!aP7+4=7d(WIcsR@C+&wMuX4S;66!yO$ z?Ay!Z15S3j#k`=gN8uWQ9~6!Hs0pJ^B!2-LmlIOp>xBG>%?>Ra%P>;+3tWh^CphkA z`$YL~5(&=>d|KeTRES3VhtsKOHW$m}qEQwdObzB(Gzr3@;ht^LzCKUn=ukR663vXJa)ZMOGpl*`0v=GETBXPMux@>s2kRR~*xSEug{PY?V zr@cWeX@dTd*r`DFC!Gg=Zi_)4cxR_F0 zrPYGVwSyAcEjZ;ZgKoiPTToiJ1s6vRuCl{|UnGry?zG@_7F^H2>Cy{}E?wzk`hoFW zO?gVsA9Ee8NR+Qxi=^@u8h+I^?T8USK>b9=ll}lsmW#z zbH0XnYO0w-oIkq)JT=A4LC&8hp1N>mALoBfJT=M82h;#wmTGbe*mo*PlO^tB1jf zi%f%VGAN5+x3_QeD%fG8>-7BQhoAtHD34n&Y(5LFbSbJ>((!YA(L3N3Qb)xYd-o_z z22JtXXOfUG`M@oKo1dWG32YDaY`;mr`_0F}=!F~VgN0~opI&fY(LjK)UwZ_!WxTMZ zxe#3+Del(}aXg}RL)sfD^fyEbdws=`)kaX;lpj}hBckmu-<=>=7fzu!_O;hlQ%KJ8(ia-X&jw6x}2xx9Z|-G2nbC0Kxor26(-6R$d9DHkzb z4HP@eA>%)gpH{ffEKXrRw-7ddw#_&Tn-FMd#f* zU_DsKX+tfqsB2gOLdwf+sDgU`t7sP;fF6mvA1(wRb_Uue)Z!0;=1WS@_>GQovPb9MbY2Z?dEog{%lpWo`C#b4TZPE?n0o(76eFO1 z=ZE>-n&vpIdO!ZIde0lcLI++^_x~F&d5_kh$0tq)RPXX*kXUkBz4ci<&+Cq$R-aTG zUPW`BXn7&2E`M!XanrGuanwHvX=iQzh>}!qb=IP0-ay+F;>VKej>$WJkyO(Yg&xh< z)}t*?s@a#(BReKft2>@i_x}}o4ApK96*ny|E?2h2%X<8U)15GP7Dc$#`C_qK3ADY0 z0vv_HDdj7bgZ_H^f^}u&AxO5Dvc0GSo2|99GtDgVBs~$M}`*Jyt5u;#)OA0UxiY)ult4Yn$;CjLnkc0~)(%c0lTZ+aO-@ z0n#x-X(ILMSDQ0yv|YdZ7|ppa+#a|saC_j3fhguIL-)Ld&YS5*Dfx}xnvb;uUvfe;OzQiOzH6-x49qMz4l~~>;#MM#UW!dFi~0)QYHh> zp9#mOHo70$vodHLH;y!(&YxBGzQPv^(_YXR-~EP#H%{@=RhCE$+xY1=_hWl6JK09| zi|f4@%49yEoZ{wBZFE1jryh2UA2%M)pHue!4CX5OLF35o*NrI{HS?qOC}5OnHF~lg zt%4=V_+i-iG4`QW%!I2Vm<8U_>;F|QhvPp98m+og zq|xarJ!S%78de!^) zA~jIlQVwCIDmo81d2X0QCoj*RQARI(l1|^H&k3`>t%mciaQnH>_vrfagiIxa&N5R>8V7$`+Hu}9MU1X(>TfsU2D|;$dMxVg8i^{%SHA;dg z6&CG6t!{j9V0_Qzp_bQL-ZIy9z0lc^e@fX}*yIai!!GrmDVN)>+*{HMUuh`a0tOLR z>3TC#UkaHKUuhFXjGt3BGtG$e{xG+O{F}O7*wR3YQ|puUq6r%GLKmKPmtHUlZ?{5+ zt59ERhdxH>_|xswKJ4lCQYu(1+fFKgpDKW{oAo_g^OLzNO9O1 zfS{K>9n|>oi=@3+?<2-Q&9gaV* z8}I0ir}g4h-=hNP=we@>@r3bg&^Qq(YJa}Xc>UzXVqVc&U|`+MKjkbgUlnLuxu+@E zb~04Z{JUQZDR1e<(}r6M$KTT@Uau*oe~0nn)=oY%o4@hXD|<9IHU@0TyEQL$ps`Y? 
z!LSj7f!YpB%q^J5dVDv^W*pHc-f++|)EqIEYlTjyqtk^U>*#bFoo+{`SLs}Cbb3qg zy-z)s)3C=b*Yv{8Xl`1tjEf`2jcA3gW(*I#WxQ&1?4!!IyfBkRZY|}Lf64cM%%cIz zu8#exWt_V3U3Jeb&3fUEW<)v=p$=_DTnFty4!g0t6*vDoDjY0eFV>yviXPhV0!YJh zsuy#p*a{5tLbnrWbIXY!w%K419n`Ki_Gn(}Di>y`-tvOF0#UUwqWRQDjDgY)UOAi= zuqe9uabvU=rOF>0I9ht{J&YmCdPXM`y>N?HFWiaTubcV7N2I@rbbf}#B%o8Pa*1PO z4wUX;x{<~Yg2YZt3C{)Q=scXyJ`IfJp_6i=+uYh@*|+lSlnL( zWoT#(`s1MOpwpl|pcMGf2UYI6ktt(dmYPAV01V0_;1VQ!=TLL?ef%bRqO6_ z?PJ$nyzx_4YikK6eN1=Ojxy6aAwvX_+wioa?2~kg?(ldYsOj+3-|yJs@fDpNo`(At z1w767EDm^D^R-(%rxZudvBc8~EZ}MA@c1ALoeqyHNauc8g}vt7pWaD4y2foRYvs10HH~+B0_|PW8I4 zCg7>R#}V-O^3IwbPkn$Fn=xvGosjK;EWfvz%@_5X(*{A$KE?4j3fiD^PR=^D0j+I^ zVedus@1uybpl77!AC`DrfjZtk&mnCLeJ=Zy+F}~mejD~%V29o5N2y zj+{LBe4Ge1cFII6=U#bdy;9&)0+6^@L}WR~g`B**mighngO~rXC@7BDoUi07_?$?` zk(H;5>Itj=cSL?Sx99l2sOVx*!4-mjLeLF@b_sg3ph-dR5cD2F<@b**ot>ZYHEi$2 zx3W>+wf<}T%}rO2a;{}hORK+meWQu8t571d(ae&_fx6IP_4AN78_~swf_p!J@7k+H zMyczSo9yMk)k{}owesEqAXRe@LX8u@Lb~AzH+JJ#W1PyD-O?R>NeJW za@A60S#5j$LmjIWl+Ll9nRxiZ8i+YHR#*2~Xu?VyPYr)fQj`xoA7csv8OMLWf1UY1 zn~ATZ?{4w1!IsSIz#WzVs+Q@HMqI1IUf05}W%IsYGTl4wgPHZXRph(6nK_uc=@D zzv0uqCix_LM+b-c24Ti5ny)FDN;mP3mQ5p>bS@Fkr859W2IFjaES}AgufH z_Sd1(P1iPshP#8!-GbpmL?ok#(GBZSgw*I*Q-5kyv=hrFa`CfB=aeD3#pN_Vg&Y+5da z76k{ZV+TcRT42b#EB-{)_}uUNlNrc-+A*Wb*s0HPJ!4gl|6iiNz+KV<&s3Y)?=D)L8mgF3)6nKKTYAKW-Z*pAtBRR#n(6 zaCttFi8mplu-SH&)2x@{bNk&8cI3H7+M%0yWuEc#DDe5pPj__l;ZwkQJBaf0yDnf0 zwZkuwrZAPcS0G{u3*}29^X1Dovw7wVT`w#1;43*kcid_mpF1y_c*D+}kL|*a-7hG@ z@wxM*m*aEo+_3;&04`jJEUm-X*Ma-05R>`-0`foQIQ*T&z(mP-C;^|(&+2V1KX+Yv zZvp$y;fz9lP~%?HF2CPHbScM8%=+65+-C(u{5oOB?w56!!0monoeS6}e={%U#h9-; zms=K)9|BJD_I@P)%lZ7riVN8Jj*#DH^UFFeaJ!$_G;dEgv-^p?B;==Ueq3(>udb2V zaJ4w#&o@ri0ADNG)=D;V`FZxW>H|sjkpgMi+-Ne1gV(IPSTr{rjg!Mz7M@Uj>1h8@ zx;Hiy?ZZhZ8;y;QG4i4sO5_rK{%be1wazcZ9nqq(OeVGqE>yY9F8FH2;6v6oIy}4! 
zDi)4=u2qSqlgXiUtWOp#+t)Wd9L?;CrBF`c!sU;m5Yc1`zF5&{*NuT~p=jv(Ao;;X zg?99g7|KiTTeDDdG!>UvUp5^bh^6|-CoXvF^?_~SPDtX284c;eq8_{v5Zvda?JbcT zIs%dC4P9L~g?gerfsRPXmLnY0xVu@U6qKa^$y=?xUAm5u!`L5k)SK&nM}BtDC%-)M zK{Gwvq)VIhT{9g3ZGLyNxx!(s;@3CpsAsiW?N&$*d!`58tPLu;aJs7ot&Vf^`s>Z+ zUFB0~$$kOnK$|z;94X-|$lG&X=SH5hbPg2xkw;&&%IVQo9qv-e{MlW@=vB~M#-sx> zmcko?9|v|nKjt?L_A$;4#Igg--?u9TO_S#E;y`kDBr@5-bgGJpA}x~`ijjhdjSS_O zpXcsJ)Zd>5nV z!H4bG4};Km`bK)Jl5NrN!%ehysji<91S9)Bo+vFnckqc)NE zWaC0WQhC1x5gsTnYsiAJ>Q4)ONe^3v5f{gv3g!1Fz-9jOeNa+aUTH@1lKwFgtoh3~ za7kaYy*J4GrM;p}U*1bfYE$QJEp>_hnb4R0e^8toCCyod5x2MBH*EUyeYHZRVPQws zTblhxMCj-v$A^5cm46RnudmdX^fBntJDJp%_pC1QN+;zY!h_@`JqnciPU_3|Wcl|t zQ`9*Kvfffpmg|p@Mj}#QzE4lDAuYyy{CD<8>Pz|@lC1ib_imv$FssUAQcqHHl(Oo} z_wxavFXf~eJO2t22$SWP=i-qC^eHcEmK>NnbOsW0ga(q-H#`WHTvv)i@H zo>@TugwX$@X{PdP*LMQ2VTma`uSRuyUq$CZkHvW4m(CFt1mYdeI}{=+GUBiO>r*UF7zkGcLX2MHav3t n$@QPsCpq>ItpwMVW;9?+wC7KMyJZvMiecK=5jt!NHkSQ2tzdX7 literal 0 HcmV?d00001 diff --git a/test/kernels/sddmm_spmm/taco_original.c b/test/kernels/sddmm_spmm/taco_original.c new file mode 100644 index 000000000..4f084ff5e --- /dev/null +++ b/test/kernels/sddmm_spmm/taco_original.c @@ -0,0 +1,166 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? 
(_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = 
(taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +int assemble(taco_tensor_t *A1463, taco_tensor_t *cage3, taco_tensor_t *A1392, taco_tensor_t *A1451, taco_tensor_t *A1455) { + int A14632_dimension = (int)(A1463->dimensions[1]); + double* restrict A1463_vals = (double*)(A1463->vals); + + A1463_vals = (double*)malloc(sizeof(double) * (5 * A14632_dimension)); + + A1463->vals = (uint8_t*)A1463_vals; + return 0; +} + +int compute(taco_tensor_t *A1463, taco_tensor_t *cage3, taco_tensor_t *A1392, taco_tensor_t *A1451, taco_tensor_t *A1455) { + int A14632_dimension = (int)(A1463->dimensions[1]); + double* restrict A1463_vals = (double*)(A1463->vals); + int* restrict cage32_pos = (int*)(cage3->indices[1][0]); + int* restrict cage32_crd = (int*)(cage3->indices[1][1]); + double* restrict cage3_vals = (double*)(cage3->vals); + int A13921_dimension = (int)(A1392->dimensions[0]); + int A13922_dimension = (int)(A1392->dimensions[1]); + double* restrict A1392_vals = (double*)(A1392->vals); + int A14512_dimension = (int)(A1451->dimensions[1]); + double* restrict 
A1451_vals = (double*)(A1451->vals); + int A14552_dimension = (int)(A1455->dimensions[1]); + double* restrict A1455_vals = (double*)(A1455->vals); + + #pragma omp parallel for schedule(runtime) + for (int32_t i1467 = 0; i1467 < A13921_dimension; i1467++) { + for (int32_t i1470 = 0; i1470 < A14552_dimension; i1470++) { + int32_t i1470A1463 = i1467 * A14632_dimension + i1470; + double ti1468A1463_val = 0.0; + for (int32_t i1468cage3 = cage32_pos[i1467]; i1468cage3 < cage32_pos[(i1467 + 1)]; i1468cage3++) { + int32_t i1468 = cage32_crd[i1468cage3]; + int32_t i1470A1455 = i1468 * A14552_dimension + i1470; + for (int32_t i1469 = 0; i1469 < A14512_dimension; i1469++) { + int32_t i1469A1392 = i1467 * A13922_dimension + i1469; + int32_t i1469A1451 = i1468 * A14512_dimension + i1469; + ti1468A1463_val += ((cage3_vals[i1468cage3] * A1392_vals[i1469A1392]) * A1451_vals[i1469A1451]) * A1455_vals[i1470A1455]; + } + } + A1463_vals[i1470A1463] = ti1468A1463_val; + } + } + return 0; +} +#include "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/taco_original.h" +int _shim_assemble(void** parameterPack) { + return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3]), (taco_tensor_t*)(parameterPack[4])); +} +int _shim_compute(void** parameterPack) { + return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3]), (taco_tensor_t*)(parameterPack[4])); +} diff --git a/test/kernels/sddmm_spmm/taco_original.h b/test/kernels/sddmm_spmm/taco_original.h new file mode 100644 index 000000000..71ce53402 --- /dev/null +++ b/test/kernels/sddmm_spmm/taco_original.h @@ -0,0 +1,125 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? 
(_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + 
taco_mode_t* mode_types) { + taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +#ifndef TACO_GENERATED_assemble +#define TACO_GENERATED_assemble +int assemble(taco_tensor_t *A1463, taco_tensor_t *cage3, taco_tensor_t *A1392, taco_tensor_t *A1451, taco_tensor_t *A1455); +#endif + +#ifndef TACO_GENERATED_compute +#define TACO_GENERATED_compute +int compute(taco_tensor_t *A1463, taco_tensor_t *cage3, taco_tensor_t *A1392, taco_tensor_t *A1451, taco_tensor_t *A1455); +#endif diff --git a/test/kernels/sddmm_spmm/taco_original.so b/test/kernels/sddmm_spmm/taco_original.so new file mode 100755 index 0000000000000000000000000000000000000000..f50931baa4193cd10932136d3af5a561d44047c9 GIT binary patch literal 14304 zcmeHO4RBOdmcB3DkU(O0zz9K6T5M;6lBOd869G*p9r9u|!a#7)0Xv=kNm7$^XSzEO zGzH8CTzi{#m|dr|YMfejYj(%lof=t}(OK6jgF!@BtD=q!yE;4U_}5u+MJ!!*aOnNc zefM;_Uy^aFwrZ=kiVLsLJ>U7cKkvSK-@Wf-Z>w*8iKYoo7IC{EZqpnCX;OmD6)FSL zBpO61ewT{trS7ar(Ue?6l>spo!XW`FV>Ye}*Gd-DAu~(|N(_8RQMMzN>?*G_###Bo8vAiudp_256yCDB%``4z}4LP*4ZOJI`YJ`@BZoVU`1E|{>Ve+KmE?w;nUyQ 
zw)pTD4@8*#y;l$1tbHqM=@D%=arGLcahsN1_afLXkaHTGum3W2Nr0Ty#30_zD z5ryk);P-d;$D;m3G7wMt{lecH?M(`Q1Vs4TJ2v=3;dr>aH<1j-J2tfR#iHSkKv!Sb zkWDY~2ZsXWAkf!)e^|6_+_1^t7mE$}|(*3M6*qa{GA1= z{lR}FVwD_&Lbzcye#6%=mCh0&g^kw`ZxQ|ADQWQ6tYpMj_4ZytGyG144PM=g6|6+; zQ~JEF@UU!}OIAo~{a~Eey)qLnlzykfATOOn1vsxsBy$(w=&*chD8PBHAYoGhPGxh_3vliWl5Hx$ z)zU9bwiV#q^(4?o&X(!vSvwXwgnnQ&S&|#qM~;_`NgMjft?Gg(cVh^@_9aco5#K|i zi8DE*8}B7vBYs@+n~A4eFfl6mJBg>RnK&Z(X5y(!CJsyfHsYzt6Ne;!3-NTTCiY4G zCgQ2-69bZ8N<1}rqEqruEpUj)A@DNUf41HE<#&C~FPruWVgFf&^UJY_%f4j4AzUK# z%(_ouuvO2@ngG=sFX*ZFlZ(&=n~6miT(Ki}X3}KG7`1wmhyCA?VrJFfNH@za!CpUh z$)cyu>Bq8n=-LVW^-IZWIGm>(R**xkg4y4`bpfb}!4>+*x(}~HZY$NNLQk)22A;hZ zP0Z@}IWemdyhi3|m=K?h!KK$w-!Tz^hQWKj>ACw^x*MLYo{p_s^zWWM4Mq>{TATJa zg!Hua#v0;N`<*92>qhl-o711Re%z>8dcUk0YV#PR>Fv%cNP7;vZGZU37;5Q9 zj7p}DO84G$$k`XMfAX>Y$#Jj!$us(6dhiWBQ-4ZNC7rqsEm@=% z%5T%t0CW^E8lc7lKU#2Vd;IWj?xhd7*?C@o(Mo4Ku&nzBm4<2vNtUb}5 zx~QjyolWh>KC*a5-qpOR-*}h(s|UkEyMS^VMn1q$8Fto?t|wz%>3Ka1*oi{dW}0)$ zPI48>bc{{mElq?PTkLkL?fN2C1Iz za8~P&dF)TVHZtsVSkBukt6n&7zo+Um`-8_IZ$0ofnvq;cdQA{|Ulfp*;DXk+)4Ij$ ztU+17Men^__eR8C^`0*?{36B)kKc&LUTG~Ic|nWV@3EGm&Xu0VapaFj?AuQ6`nQNZ zb_}i?8atfMh&^$-j9_^FXZ!viQa_@xt(mpfOqI4Fn8RH;-vW1+P`vjok=dkq8sFlo z?1@6LN_z!LP%YLgDb!nU=#7k8pz3LS(WIUCcb$fwI->_))*qvWzb4Lm(1a6^kE4SB zqK}-=^fgImnf<_Fj7}N{Zat0ha^5~~@d zrebz$>ZIpbc1f#t()0R-_Tc!MibwY>^rlXvURZX1*=U`wraYr(kdD zh2alU<8W$}NAr~@m803}`6e_KQ&#G=_S7k?OAUBDmiy4(mDyAOmCLmUf8|Xz=&3PZ zX88}Gi8%>l-Iw|#`wK`x>DFMrX(`M8!~oh;$H7)-9|J2NjP}%v033Q|?OWs^yB7l7 z;^nOeFg0Pefv_f(sI))47gCynS}|v3tOu;JG|Ye>-7s=N8@%%QO0xDSw~jkg*8AH> zUb6Zchn>~Q^(|@Z?K&RXvgVYvu{rHsCt6b0wKSkAo72{Y1EVb|V1qw@zKpzFyg-#p zI?2nPyqCdeu?(ZJrweYqoEKHKm zrgh}k*XZe;c+8s0eCbMGdK*-macy>|t5E^E_ii0?ZcW|GbhB}PlFsV7Hvl)KwmaAA zsSkXhmEKgysi*p!ULTY&0@gaQN`HnbdtuvgipmQ=3Ci-!ahu1s<={$+uAf^d5zdr#C0<8q1miP!!=S=ex!zWNr@eev**0C-M*?VyvfgX)ZPCjI|<9Y)1K1*+L&9=(#l{DL`AGWmF92sk~t>)k?kIntyY>%yB zq_oX;sKoNTWv;COn8#MrY;!;tHqEv&54}&*lk-zt`o&z1-*X8Y#pMOY`%J=qiR%zB 
z9h4H$EUs69$$j%=U0U;XS)Z@Vowfmsrr8?2Q)&VqHMr{0KS$8toyb$W4wiUq)el-c zHphsyq{CM2k=4e#Kg6BTeGj^_vmh8L>9myrcjFp{>;hzR4{q|~we8bN{(i2_(Zc>b z<AnNJ=JQ{5MDY1K z?F}f=yDX*8FXduH&#ChG9sMNjfsyzOeH->1l=v+9pwj0vbAGR-ErU!Bha}2O(6I(3 z-na1C_Cf;|Pbh%#tSXr6IjZ#d?4HZR_DI(MA*GM)k>t7DbBga!a_r%yTs2R5{l6pp z-P~N_)2g8-6g{Qr2a0~7=xo&i3l#mDqN^3h{z%xZL&23{)&fO^nVmO5g!H)8^w|=o_6^CWk=9egIFIn?p{|;YZtI_NP65H*m`y z65dBynXJ~*OO(;(X*b%gEVtOMv29&APpic6C~;{P+im4*%D=v7o;JUqqLJS2 z6}>APR>L-)j06(N{y?(F6*L~lIVIc(%bP;8%+>CFr3ByBG<^4WwA9rZ9qFj;$5+no z+E8o|AO30^+yIh;(eS$Na5Nn64T7bQjq8RQR@bhschq+C|BbgeYWa$E4fghhdf~>X znxi%njn&GJn6(4(STY<;#^L}6dV`{WD42+guV~MT;^wT7{#T&UwYS!`_HXjKHz`IA z5hX?y{j2Lyh3Mc=ZFh7~^^-`1llY99NcyRB#9%bh+Z_#u9KF$`*s6x#4n^mP?t& ziv8GyK230i1*a3d&J^p<%daoR_kRf4?mhT4;4p>qaqtmEd`;7+)|Y zo`d8T!K+H4HsQQ#m~mc3%{Z@~W}H`9GtR548Nap|P@F|vHwCA8$|^q1nFV$LC3qX) z`^(cad7e(cM(Gcj#=}htKVrhyDLlWxgQv6wmxkyhj#HufQR>2_iK+KJ1)S>H#4ie@ zj6^5++BDMYHl_j?5Ua%&^#Qe#Ae~<#j5{QHf#r)lUv93U$KZG})gg-DiBq)liFPsdA zT(_=nY?xk22B7%^@pxc2La37Q-H6c&Aa*P?*x$b!CIy^~SSwPEMIwE%K!~d*W4&VW z-fjv>LseB2m&=cO_#;t-WBL8-H+wd;`djbvQV5)1Iq>fcpvn}%H3>xqqd~?(iI~49 z5Dif{ocEr)JR90upozChf2*!M>fX(O5IM)b+k6|FJwE@&_3O8^cKACy&AwJsiHJ>; z5oh^YP>~9xK((eOjvk|!u|E`}H#PK*LhN8qA$b&rW(2u$R2zq{8L@w+kh{rJ5wn&L z>6?tvEA(0%QAn|RM)2LF4_WhqFu7@?@+ONZiX1E`X@(F$Q|#UpEfo_Iz$yrnYLV2Nju^XF3Ly z?j76ndp`gD%sAZ~ByKm`alQTsGE!lC-Y1_~LRLaJ)dpw~-+ou0QWvdK`v^*`CTOOlQCl<}L)0XZvL)ob8!T zK+gF>)$cnD&g|E$dtnCqW6J(M!%hCzY;Of%#z=X|^@hp%yrq#b>&~CSKCJA!zrvpW zo`rmx+xe=pFDGF^VwUMLWH?`-CVsWl;O-~{kvHp7jExy1lLa>zh7N<|f6cnY^W2eF zLZ(64AECt=$$_g87mp{N|K|YXv4?CvW$EApX@yB>E}ypdCKYM7kQR0%{w`y1Q``FA D$wJ#M literal 0 HcmV?d00001 diff --git a/test/kernels/spmm_gemm/gemm_default.c b/test/kernels/spmm_gemm/gemm_default.c new file mode 100644 index 000000000..605cc491f --- /dev/null +++ b/test/kernels/spmm_gemm/gemm_default.c @@ -0,0 +1,160 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include 
+#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + 
int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +int assemble(taco_tensor_t *A2039, taco_tensor_t *A2035, taco_tensor_t *A1450) { + int A20391_dimension = (int)(A2039->dimensions[0]); + int A20392_dimension = (int)(A2039->dimensions[1]); + double* restrict A2039_vals = (double*)(A2039->vals); + + A2039_vals = (double*)malloc(sizeof(double) * (A20391_dimension * A20392_dimension)); + + A2039->vals = (uint8_t*)A2039_vals; + return 0; +} + +int compute(taco_tensor_t *A2039, taco_tensor_t *A2035, taco_tensor_t *A1450) { + int A20391_dimension = (int)(A2039->dimensions[0]); + int A20392_dimension = (int)(A2039->dimensions[1]); + double* restrict A2039_vals = (double*)(A2039->vals); + int A20351_dimension = (int)(A2035->dimensions[0]); + int A20352_dimension = (int)(A2035->dimensions[1]); + double* restrict A2035_vals = (double*)(A2035->vals); + int A14501_dimension = (int)(A1450->dimensions[0]); + int A14502_dimension = (int)(A1450->dimensions[1]); + double* 
restrict A1450_vals = (double*)(A1450->vals); + + #pragma omp parallel for schedule(runtime) + for (int32_t i1517 = 0; i1517 < A20351_dimension; i1517++) { + for (int32_t i1520 = 0; i1520 < A14502_dimension; i1520++) { + int32_t i1520A2039 = i1517 * A20392_dimension + i1520; + double ti1519A2039_val = 0.0; + for (int32_t i1519 = 0; i1519 < A14501_dimension; i1519++) { + int32_t i1519A2035 = i1517 * A20352_dimension + i1519; + int32_t i1520A1450 = i1519 * A14502_dimension + i1520; + ti1519A2039_val += A2035_vals[i1519A2035] * A1450_vals[i1520A1450]; + } + A2039_vals[i1520A2039] = ti1519A2039_val; + } + } + return 0; +} +#include "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/spmm_gemm/gemm_default.h" +int _shim_assemble(void** parameterPack) { + return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2])); +} +int _shim_compute(void** parameterPack) { + return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2])); +} diff --git a/test/kernels/spmm_gemm/gemm_default.h b/test/kernels/spmm_gemm/gemm_default.h new file mode 100644 index 000000000..769514531 --- /dev/null +++ b/test/kernels/spmm_gemm/gemm_default.h @@ -0,0 +1,125 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? 
(_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = 
(taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +#ifndef TACO_GENERATED_assemble +#define TACO_GENERATED_assemble +int assemble(taco_tensor_t *A2039, taco_tensor_t *A2035, taco_tensor_t *A1450); +#endif + +#ifndef TACO_GENERATED_compute +#define TACO_GENERATED_compute +int compute(taco_tensor_t *A2039, taco_tensor_t *A2035, taco_tensor_t *A1450); +#endif diff --git a/test/kernels/spmm_gemm/gemm_default.so b/test/kernels/spmm_gemm/gemm_default.so new file mode 100755 index 0000000000000000000000000000000000000000..9de7a7933e926848e51c4a98410b02d55dc8517c GIT binary patch literal 14296 zcmeHOeQ;dWb-z!$veuT^UE5$}F!*5>A_H1WAdG}<&`K-&iCKeU%fTi#PpkcqG?8}2 zei)g!wyA8&c3I-I$-qo$aWm-*(=x$LX-McF$R8L(Nk$j~o{}b-mXB3%2jTGH0P*&B z-n(bD`y_=?YHNi-#zEtd(XS~zI)$0+S?IWU!f?1lTF+vi0hhfBCWDu>nfQ6 zX%)?)3coAFRfcTkbkU65Q;SWAsSv6GP#yDdox8$dK~*EebhyIAMiy2RmL$HB%K-UW1Fw zRN*?LDvtRL71cHNHs#?;RiQF!%Mx5|9ltzt_^%H?_56SS#qM-X@6i6}!_`mUckIyV z`?fAS^y#;wZQqN${-QU5PX2ims&l4FGWl{Y2ub(*$^qo|X87#IJ{4N}Hv9YkO*ell=xc z{$(|yUgC!&aL_bkjGaYdzSzeTPvT#b@~sv-G|m=@i$s^irMP(Bz)`Pk=Vn>(n#2!E 
zTw?`Y?;DE8^kgcSNa?!J2Vw&$p+`Z4-r2oT4@VM_zJX*alIY&pJ{XThx`Vxg5wmRe z5!ABwB4MjNeL7<&XZW z5H}lRP>2R-jbD8&meNWQme_b5@is98o{|E4ZL*B`iw6cSq7{Cp#L|}y;0abC_DOku zuJEvIT|in$^7+9yKldCKTuAvIRW;ODp81L8CQfTU^`29{1TTFaOqJmBQERB2E5WPT z2xzUWm)30AC8Ja=!O7d4>Pm2aCQ(^K2@Z!9Q*#Nt;AE8Cl4C@jl|Qfn%rmb*AY)$ zpBy&$6~t4QCwmP3$4gWZ<>gPfeze{5<%0pwm#zB*m~PLP$D&@>^8Jtpzg%d!bss@t zi#w_Ox}05?;Fq4-SBPkb#K|MJ$U9RFk0yTWr3{j4r^Ka zHFX4J_In-&Z5+#PPy^Y!9f90_4>>Wb*J`;(JP#x5&&KQg**;Gov)$v)_&t6tTh&+y zWV$^LAgKVL9xW5_Y}JtK&$pc~6!wp~_WvIDUJD%yWT6$vLRdR8VT-yxe4E6&d^x|T z;Ln6TyE`&x05o)lUeq%9@Aqh0ZrO{dSNL)r1?|YY_Klf;2Rx!>_Z#iWK5MkFdcIhKy&Ajyv;W{q4+an{*@;**wU2o%x4feii($wzIXn$50R-Q zVbzzjH;^Oh8eaz6WZajI18--3;d=B6v~P5`r=imI&i+-C@h(W6S|=m~e!wZvYlDfy_+ zwQc=W%Ue;`)|2=DChCeGfhU%=bbD%}uH5er8w`!Jkf|`$ zxB34diqd~Ir?hQ`^q%`>w`-p%xxthiJz8ltVxF(%b+aAQveeu?S3=OQyrN9e{eGN0 z3(tXUv!9}J@~z{nYhlevZS;tutqFMCt^->!Vn;EQ6b(-Af<&vPq&yD44P&XrkKy&5 zU;m*{pu!#+T3eyN-Q%DM;Lh8>pwW)ILI|7-bKMZmF9S{kmL^wNyUCnw=_c*L<1`Py zxx=^Jx5Ia@Pv?0|H+u43OkX*#&09bDbu=x1`13*`TXp@sV+W_lYi3*vz1dRp*zOe{ z_-SSG=W6D4WKQ~ykN&rsd;SArT)x`%+Zx+M?Y8gzHKZP<`Rejz5D$V zJ~M@ul&o)9#X@IWYM$GB)dwwPKiA~;Wv0lyPnj~zPpzqWZci=jWZqnPV)VSS_g64i zTw?v1qr2bBjKio|AFW5eOo3LT7h2%~EJ>N4cV^zibEg>-=muIv-1$#`Unq2j-tuRf zwal?V?uPF}5^EHOav<|z{xnKKX|7@Vwmb4KnLuaeIM~|!Pr+)q2eqBFkhH6q@4qAi z`6p3;yLUs!fr2qnQCOErx?PX%MX4{hq0rG-&~o+zcB3}Gr}l%ZN6#tg#V@!?J1EVn zcVz4jbdJ7c547y|)TY+AXYIGqn&D{6*jw7N{&k`~V_!=n%-xo?Hy;>l&j3sR^9v5L za=BpzlNwg`6s@G6$72?qJzX{;Fwbex&Q@ueFAj|Dxw@nAoyPw%*L5x1UN`!paz}Qp z+BtgMkx!i~6k4v|o7b}6uFLm;^dTi)^#& znGw4d!b1DLEx*%{YHw&9`zG#O+4FAnj8^j0z^c_aW|+wCZOCeNExR7uhn)@nEFKrc zwt?LQ)}3v&YninkJTC{x12kzg+;9ih$=b$KjW27N_i#V!{3!7F0jhqm%ks;Nh6g;& zm^?Mcqbgm68b^<7$MT0hqi){q*0T4bmfI(-{5X}<8Tsx)t79PXF%To zrFQ}EfGz{&J(u#pCZUYDmCI|Y90wuiK^=6)KKyB+a0pa5-RqsTcetwWckC0lU$*AD zn>;HBCVdP8QHS=?_~8^lW;-qw?H#AxRGZWNor*SR?W490PBmw5bJjgl>2o$bG|%U3 z9qjPvVsTRMtuSzm0+9nN8!qBxuVGg<;0b-0?~9|!#0gFN-?kqV!)_92_k zsgBw!x}CK?quJPlqqqmMYawem3k9PUJx&L3D%*{+E|eWaUih6|6%Q?NI(*eeUmr)= zag 
z2l)LrzwhSv+t_*rmZ@%s+HL+0-{OJzB~Z@*mfyzjkC^89|C&Los* z4^8Ql^MyFkDOn$%4NlS-8HwMaZ^c=M62D7+M9T9!bJ~wnqN9qDRMj9-O`JB-;(3YR zZC@(!rzF65URKQQ9FubV?w;$zG0kZI!%`l{GK1%OrzGDk%W*a}%4PSY)&D=j-_5Nx z{zP{4grsju`kthpN;*$EV2PyHO1fH7{ywsyz5Q0TZc8t|ou$=Vyf=Fr>TgOLT;ra` zW^Y5&N)r_~pe3fW%o=!r&UF~N2lA%#9L5l+*$?2y*6xteDf}p#t@e~Bz5(2}hYIhd ztdZ>Ys`Hdl7An^`7gyVympivyx=?XrcvN^5oAcJ{HPv5Rx=>kE)mr;l+foIsvo#45 z?_68~F}p`pSKkIrSP9{(FpkxV@`dvvp&*fQ{P%iSng5fC&?@>K7YeNk#beQdzEuM^ zHLr$lA{7lLQ$xX2zc*w~$N3f93ES&Ju`N*Ud8Gp1))ajASKAvKO-HKrL-@+sS09e2 z@!_w&xdA{b9gD2%i^L*{fe={w*tl+_d3F8jCbhng|KD|^TF+OsH$5;I9)KCMX=;5m z7OyuxV%85Q;;Bd|6;A*h9teq{kx()rzM?;?%Dc15u)hkOuD_+eW2npD&?OmTh{$3z zQD5DJCdASs^?k9l^pi+NQuvITOzGqtk&Yz?`eKo=IuJ{VEpqtnkaWKIZt8pX1+Dw8 zdXk}#EUG{eZbn&R|FTS3;a4&?+$N4MDHXP15b(NIj@tyUXXUtE@VZxyR|;M~%JF%E z*PU{_O7QwsjynXebLIHV@Ap)0!)kj-&iiutYSAjaS&r8fsmR!WJWv( zmD>bARZ67^=ck4h=clL@=clI?=clX{=clU`zoHzFoK0Lg1E+P$E!kd!Wt?0m@xvB;oy3a|c<_`q<5Ccv#BnOrK1#i~6fyI@XMj^XTX~~E z!btRnU%}fp-lrHKexAiOTl?vK?riue;D#ThedFCIu(|y34$2ha5E@R{TGS_7XsFIE|jojD(AAE?X{w{)k0&zvt5zcE{TM&`hC zz!y=cE$K<%ss$tVU*?eim4V~FOaJix^S8ifi@W;Bke|6OeKv>vpJ9(eaZ%Pdt;GgD zv!OQvS7*Y)C+(;*o{9IneG;FLILB#q%whjxqhB-Q!?r=*T{2&|2@z@E8aMX99QL1( z^455)e`4I?=l?MLRwE8c#~+jxAC>af_^p2fKAV3&Hsrb0)3gQ~`xVn{F;@qCw)iv4 zKB|1{fK5p_SEWnjTCfg1Znxl}hYFj8+h_W8w7B&@QNya7M&hnP@y39gGLV+_b*P&=A7d zqQUfF3N4jETwWc`(4#R#W9j<(O}>pCddHo9ih$FlzP>Yv7E=J%G!#w8LX3rzalJnn z3sW?lf7_kDjh*d~#AZ_O(4dyW865XjicAh&_7GW-E^%8St~~L zO^4`}S}hMKq)Bmps9pO`#BfGw%?0gxYyXb*AIk+!Sx4|{lXjG6@#Wp zQ#cKf+|EcMIS`MPF*?c;k-;D-$lUN?N_dUhy~uj|;$Tx0O6fK3q&LzpXG?z=%FLN6 z=Cm;-ISU-E1QzK{_dz8X>%&Bs>_99U=X|iYHxb#% z9L^vS&ch*`rHD+aD=iflr2l1xc69Xa0n49}_&mYXBXK6?5tun4yvX8HZ=Ch{+`%*~ zMX_8O$-1p?21@TzSf9@)OgTQE>cvuDe1F3`+E!p>oAvn|!<66GK*UI7qcmw?=-mnH z^LdA9lN9FmvmDbcD5rNZ%=5X3X^+&W_LGdY|KuMoBT+s-F&!fiiR)*5?tctrq~AtD zNPNy>dfJ2yee9Zw^|Ss6F!Bk%@7gCTU}`;4P+g@d38qy4kkl{IQeotIf5g;y9|D}~ z=kpy??k}5Rp6Q>VptOEIComN(C=GDEY%gckm-`ET&uBR=7`|p*qJJs%`Tkq~ej`;X 
zjJ(x5ljZl+o6W#|NJ?`0qNb{bhZo&q0^YSgg}^_WrsRjEFow|i7mk>yNodj2r~HZml`?dS7iziLWY^{KAXGy#UNhEfoD)?aDC zS)b`7$~j*u`d!uJtah!k=jPBqBK7Yz%@lvF`gQ&YAP&A%raex4ChP4#9J#&?)Fj;d8;hN^jI-cS#q_hsG6ksYn3IQ z*N&nPGR;!|u>3Hf0&Bs=+Gr{&HfV(2ZQ2Rjmfk1@HK Gef@9X*{se0 literal 0 HcmV?d00001 diff --git a/test/kernels/spmm_gemm/gemm_template.c b/test/kernels/spmm_gemm/gemm_template.c new file mode 100644 index 000000000..4a4e5faeb --- /dev/null +++ b/test/kernels/spmm_gemm/gemm_template.c @@ -0,0 +1,183 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + 
else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +int assemble(taco_tensor_t *A2039, taco_tensor_t *A2035, taco_tensor_t *A1450) { + int A20391_dimension = (int)(A2039->dimensions[0]); + int A20392_dimension = (int)(A2039->dimensions[1]); + double* restrict A2039_vals = (double*)(A2039->vals); + + A2039_vals = 
(double*)malloc(sizeof(double) * (A20391_dimension * A20392_dimension)); + + A2039->vals = (uint8_t*)A2039_vals; + return 0; +} + +int compute(taco_tensor_t *A, taco_tensor_t *B, taco_tensor_t *C) { + int A1_dimension = (int)(A->dimensions[0]); + int A2_dimension = (int)(A->dimensions[1]); + double* restrict A_vals = (double*)(A->vals); + int B1_dimension = (int)(B->dimensions[0]); + int B2_dimension = (int)(B->dimensions[1]); + double* restrict B_vals = (double*)(B->vals); + int C1_dimension = (int)(C->dimensions[0]); + int C2_dimension = (int)(C->dimensions[1]); + double* restrict C_vals = (double*)(C->vals); + + #pragma omp parallel for schedule(static) + for (int32_t pA = 0; pA < (A1_dimension * A2_dimension); pA++) { + A_vals[pA] = 0.0; + } + + #pragma omp parallel for schedule(runtime) + for (int32_t i0 = 0; i0 < ((B1_dimension + 15) / 16); i0++) { + for (int32_t j0 = 0; j0 < ((C1_dimension + 15) / 16); j0++) { + for (int32_t k0 = 0; k0 < ((C2_dimension + 15) / 16); k0++) { + for (int32_t i1 = 0; i1 < 16; i1++) { + int32_t i = i0 * 16 + i1; + if (i >= B1_dimension) + continue; + + for (int32_t j1 = 0; j1 < 16; j1++) { + int32_t j = j0 * 16 + j1; + int32_t jB = i * B2_dimension + j; + int32_t jA = i * A2_dimension + j; + if (j >= C1_dimension) + continue; + + double tk1A_val = 0.0; + for (int32_t k1 = 0; k1 < 16; k1++) { + int32_t k = k0 * 16 + k1; + int32_t kC = j * C2_dimension + k; + if (k >= C2_dimension) + continue; + + tk1A_val += B_vals[jB] * C_vals[kC]; + } + A_vals[jA] = A_vals[jA] + tk1A_val; + } + } + } + } + } + return 0; +} +#include "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/spmm_gemm/gemm_template.h" +int _shim_assemble(void** parameterPack) { + return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2])); +} +int _shim_compute(void** parameterPack) { + return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), 
(taco_tensor_t*)(parameterPack[2])); +} diff --git a/test/kernels/spmm_gemm/gemm_template.h b/test/kernels/spmm_gemm/gemm_template.h new file mode 100644 index 000000000..769514531 --- /dev/null +++ b/test/kernels/spmm_gemm/gemm_template.h @@ -0,0 +1,125 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; 
// always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +#ifndef TACO_GENERATED_assemble +#define TACO_GENERATED_assemble +int assemble(taco_tensor_t *A2039, taco_tensor_t *A2035, taco_tensor_t *A1450); +#endif + +#ifndef TACO_GENERATED_compute +#define TACO_GENERATED_compute +int compute(taco_tensor_t *A2039, taco_tensor_t *A2035, taco_tensor_t *A1450); +#endif diff --git a/test/kernels/spmm_gemm/gemm_template.so b/test/kernels/spmm_gemm/gemm_template.so new file mode 100755 index 
0000000000000000000000000000000000000000..2cfcd7ad3adda27ba1b465d8278ca0b29860a341 GIT binary patch literal 14512 zcmeHOdvH`$nm;$)Krn50gNRWPu6T8VkS5WHiGZe)4%}FcW*|7IVAJXDJa&@qOg|(z z2ysGOdYWcfSJ&NIx0c!3nR08Vl(O7$#=#1_fQ4tmB{e9=2 z)46@qJFbIv{eWmlVjO@*Q`nQZJvMqI~v2GT49b#)>G(##s! zEZi5f%en51Dbcjt*hK~;DPt-JsEnDoPF==XP?cvS-BV%U`vsMD#FCwRp&@rKRCz|y zW1=aNRvK!|4ocS%A-7UZ=t)Ytlseu`hAHp3kh9b14g+SAO1p2uj_UcRCbv*rFUl*| zFO{M{Bo)o!ZJP%lHe73O*~sp`aoeh^oB#d(J7V)c)($>%>!Pu@U!x*O?!rZGX5l)Z zDwVVADykgz7G=*2RUz30xGu)k()Rj?2mj&Vqfh_-fgLGF*TC-XJyk#X&Y=TuerMBy z11G=R-SS}M)t_fSc6L<#)6qY?`jKW6@tI^tTHAt=x*ScR=Sd2OFGY0PL5d{D`<()x6y?5Wh`D?&o6R=O7JwidrM|__e8}gr^BKCLDCIcrV8f#R z8-z43@Z$p4rBxu%GZ2dg63JjZ83?dIU$igD0^J}i(B8Q&5RSwnJ$;E}B;L8MwLcb( zbOyWnBZjQBBoG=3l7nD>-#rnw_SSVBf&N%*Fc42glYIjbqo@*iG%}Pl!I5Y<5RGjM zQgz5C^zA^X_uGN)U|&Cr4Gac)BFR8BH4sSl#v{RSqJSAZiP%77AQ4Hj(7+&reJs$0 zO2xNthy>%IUT-&=G#zeq%D5sL?stUDeN%^D2Gp%>|A zfkbcLKp>b%Lf9fz-xAeC#sT4F1>^ zSHZzd_JG7R;_d7a!5^z3-p2kOJS7EX3zrJl znybhPNvvlQm+Nq)1!qEkNL9I+)R+A6MFvi55cP{pITLPvwI)rtLn?!wGT}}WUMmC4BIrf{{`{Xl@t3p z-$gui)xxrkXnb^wtn~A5PG10;K7UHRkCYm{a1M$=~6Lp+_;zE^m z(=y7F>#@z+>HGZJ>E>Z9I=yck`z z9)75+&fk(hK4r4y5cSd&5BsB}n7!^fkd6A1v#{64&)W3#q&}XzNmmZ*FP}|T!{Hp^ z&_NDm6|DY-S8s+2K4P)$+LvZOc z)VEA@L&M;`-}2t}G(A1sN=(W9r?zAC71zF%lt&x09SZ!7|IY4ud@|jsb+{h)X`N_% zKa7S@atD<9U-W57+LMmt+z@R=O?H#_-Mos*B>2B29u}|~^kw|oRxnBLvt;C3>gOJO z^g!ULnO-B%RrLBme>3ups10Ea^vZzN(W?Twj9wAYno5Ie^h$tN=~Vz-K(7F3HO-cw z^XTmdb<->f$|q6KUa(YOsIKlIaGaf{ZSh6-t4^D z>0f!9UUThJAz0b3IRI64yFMD*d66&M{IMSTa$RWbM2pL9KXk(Dy4`n%lzclch493b z(7}cNGwg(G(>N5Gxxy#q6;^SDPs=Mba)r;zD>QJ0ljRlaxWeb<6_#;@Q{@$ExWX6Z z6*R8!WqAdaD|}U6VF6b-U0$J@E1W5>Fpn#oEwA9_3i&b$zNRxf=j)+Y^w23hU$_5t zJi^-@U(s{)DR1CKZ_my<39jOVYfgneGOj2&edG(bYjpffK0ku-rTA=}njb?-_t`dU zoh$K1xkmRI!d?X$o88`#7u`_k1>`x>eI{0o_!(X>_Zh+cLV<(XD}QjdWW@ zw`RKe+)bD^T%%P+F@^Stfey#?VnoD(#Q1B8%>UfcP#$ENAJBa>zLV=KH?qEEoxJacwau%9y+$t@i5+b z%-hc`IWh93vgEW+%C7%{yTX#~OFy^cz4S3SHOiy)&702C>h^3i+C%;KAMNQk 
z(1nedpqKeEv$=CW{yd*=5B=JgZq(C<{Mls>QPXuy^40$IN4Y(ag6dGM)YGk%xorl} zo_-!|b*>kzj#;w0J^caz6@B_TImq1#0Vd34ZKHWIRw1lOC)}=myCC&u*XG;m^Lo}k zYUib45PxvR$SEat!Lzi3%*_#Q)tl4yd)h}{wELTOXw}Iztr`1`Iu?$~mbAU8CF5Jo zTGRGbI)in6T>r5LHOc(WI+-Y{R=EipGExe@u%+5N0+M#7CGre{_vj!!;?aS2oGK1h+ zD(heIWmu-!p7v|i^+)~bu;xzpYYTjMMh<$&xx=WbKLdMu;@Q=|M)iEiJL-?tzXDf& z>}D_mlNt)~Ybs0{$+p1aT(S z6qAjHZABQpvk5c_+7J2~=rE{;3GhMCPHe`GgFXrBfc-n5HK4Cy$CxA@bPwn}_@t!V zvz{qKZsk(Ptjhi52N<1=9r+}mKSl~p_ZnyQ&916#mBZ|&OIBWet+tq8viIQ{M7?R8 z%H&68Gp=6L|1sL)v^d>AsAzFk@3XCSs#$xBvu5uMue0v{nO#;=rgJUCBi<6Zk42kFhV0(b^Gp4Efw-xHYrcnQzor5+-aW?v< z)gFFoaM71lKc3N6t*ewiJDZOYofPF2_6s5-&wMvwEuo=vs8MLZ$UYIde_?P=GI_;r!kwI9asF1{mL`K$gOz6oobXgv@cf9_ULLb{Z&dYKq z1^<$02tHT2oIY9pKa%|2+*<-my+N(n*oEK1QtI`d>pXR}*QPjEzq7v4Q`fM>K-n_X(C9?t5y*jCZZmWT^o>rX zEhPr-Zrq-roF!FE0#G(s{V9*!2Hdujgm+PvC%b*tS;{DLlq;PVRN0)DIyYW4M{#4= zRCpAd^MPA??O9)p5-?l5tSI+a9 zf<(^o-{VAVT;|kculik5Yav+%O^@NPqbasVogzZ(P z*v?b#c&P%vdnt$qP+RNkjgC}n2k@(9Pi;7s!cTLxjdcK$sc2+%Pb3e_b9PA6RfuT?$ z&c3ES%Zr<{ocmvnM%P|n+cwbQtLqR9A0k4GDh5_GpbF8{P;E~%CHjfsH%|O4nn(ty zb66^x=YsWIWd|9HvHZw=M@u-IdCSE^HDKAi^+Md7_ThMf5rIp z-{VPc!z?@~-uq(xD%LD|vlw?2;){xLtR`X|DaM_JxSL|!#rBlccNf;5V*NQxt~_`r1}Y>0ij?yjH9~JsyJOw!&&-)+Su8CRUuHa;7I+VO1@b z*_d2?t+-rut@vfdfZ%Lw;WV6vv7LRKw+dVUDzHnCZ;G6r$#W_F8lgXE87Ef@{GbJ2 zE%3rZ51!HnT#6~~M80NS9H0K&)4-{oavubir-yJ?uv?M)7Xh;;kttPwIyWkXzY3i9 zgV>Ppvln3R3Y>|!lOFJt%Jt9ZkSWZ?o{<|9u8p$?&{E}+xD-AE9irf5%WZjIs>|TZ zI6i$mH*tLW`_clOPR?YU9C4H=UPmbrpA_Wy^!I2R$EVjbQwBc(yi|PE6TnqogW}t- z%IJT@aR-}3l_<%xm@~jj#aG#}7b#V)%JJ#**i~ipFUAIy;;XE2S}kStZ!3eR%HR(G zS9yzyZ$ISv)8o2c6u336>qlk$e}nhW^mwwLL%&oU*bBnHHO}m5HYlCEn`>;HCPflj|4v8FBr<7f*-T9 z-PpiQg_Q-81A!0)mL(7i6^;dZ`eR+e{y-Qzsze}|8e$Yj)gMVl!k+6_G&Po1;sI-c zU_2h&jv%UJd^@7Ff`~N>rv?VL!^Fh#2)82DSa)}S3?XEqYCVwwM3qHQXfPQ;RYe?^ zCxChcx}yluEj|jO3ka8iyMt&1MRiR<-Kl6uV&OzA&>M_~DWJ}G z`z_ve?XA$n{x#613y->QJs`y1N#ASzx3+lwfm_$C+0fP*==8Su+bks_UX4eg6>336 z3Xp==nww=r8O4$Pu_(Xk!Fv>>2YU+YqtG-X@J&X#iI6txCyLISio|PfE*Kb*g_eN6 
zsTwtipuH)C>Nr)8si;6xEo6bFa6nUu#Q}~K3}{666&GQS!z&r~H&utCKt@wF%ETC2 zf&-^%iI7A797;x1@{(n&p(u|61Or8pmX_x5I8NqCY#+dA08PdX+ADKWe71P-72)CE z9<_a8#<|{LqL+EX+oP~FXcAvDq<42Dp6H83in$aSE>gl z!DtUA62bOGyJIpR?COd~?v@94_1`SSV8{kEh#%*x1H?SE*om*3BlHcH3Bk8HPe`>)8*(Ulw@ z@;gcXy^6KH(q7W1U`yYK(q4X#SBg(esRtFFBroY9p!Dpdz5H&Me-CtwItNL%TiVHb zeGf8Hk@oWY{`exYVoc@tshhNy^p_Bs?F;7s!jQ(dIY~Q7Ux(0aFV72xRKrT@Ngr1J zHxLje>p!fjTy@wWh4+nvxKUZ=bQ}y}b!HHGX}`pROM6KtAeVWw>c6N4XZ35 +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, 
int arrayStart, int arrayEnd, int target) {
+  if (array[arrayEnd] <= target) {
+    return arrayEnd;
+  }
+  int lowerBound = arrayStart; // always <= target
+  int upperBound = arrayEnd; // always > target
+  while (upperBound - lowerBound > 1) {
+    int mid = (upperBound + lowerBound) / 2;
+    int midValue = array[mid];
+    if (midValue < target) {
+      lowerBound = mid;
+    }
+    else if (midValue > target) {
+      upperBound = mid;
+    }
+    else {
+      return mid;
+    }
+  }
+  return lowerBound;
+}
+taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize,
+                                  int32_t* dimensions, int32_t* mode_ordering,
+                                  taco_mode_t* mode_types) {
+  taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t));
+  t->order = order;
+  t->dimensions = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t));
+  t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t));
+  t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***));
+  t->csize = csize;
+  for (int32_t i = 0; i < order; i++) {
+    t->dimensions[i] = dimensions[i];
+    t->mode_ordering[i] = mode_ordering[i];
+    t->mode_types[i] = mode_types[i];
+    switch (t->mode_types[i]) {
+      case taco_mode_dense:
+        t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **));
+        break;
+      case taco_mode_sparse:
+        t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **));
+        break;
+    }
+  }
+  return t;
+}
+void deinit_taco_tensor_t(taco_tensor_t* t) {
+  for (int i = 0; i < t->order; i++) {
+    free(t->indices[i]);
+  }
+  free(t->indices);
+  free(t->dimensions);
+  free(t->mode_ordering);
+  free(t->mode_types);
+  free(t);
+}
+#endif
+
+// Allocates the values array of the result A (five doubles) and stores it in
+// A->vals. C, v and B are not read here.
+int assemble(taco_tensor_t *A, taco_tensor_t *C, taco_tensor_t *v, taco_tensor_t *B) {
+  double* restrict A_vals = (double*)(A->vals);
+
+  A_vals = (double*)malloc(sizeof(double) * 5);
+
+  A->vals = (uint8_t*)A_vals;
+  return 0;
+}
+
+// Fused kernel computed in two passes through a dense workspace tA:
+//   pass 1: tA[j] = sum over the stored entries of row j of C of C(j,k) * v[k]
+//   pass 2: A[i]  = sum over the stored entries of row i of B of B(i,j) * tA[j]
+// B and C are walked through the pos/crd arrays of their second mode.
+// Always returns 0.
+int compute(taco_tensor_t *A, taco_tensor_t *C, taco_tensor_t *v, taco_tensor_t *B) {
+  double* restrict A_vals = (double*)(A->vals);
+  int* restrict C2_pos = (int*)(C->indices[1][0]);
+  int* restrict C2_crd = (int*)(C->indices[1][1]);
+  double* restrict C_vals = (double*)(C->vals);
+  double* restrict v_vals = (double*)(v->vals);
+  int B1_dimension = (int)(B->dimensions[0]);
+  // Row count of C: it bounds the walk over C2_pos and sizes the workspace,
+  // so it has to come from C rather than from B.
+  int C1_dimension = (int)(C->dimensions[0]);
+  int* restrict B2_pos = (int*)(B->indices[1][0]);
+  int* restrict B2_crd = (int*)(B->indices[1][1]);
+  double* restrict B_vals = (double*)(B->vals);
+
+  double* restrict tA = 0;
+  tA = (double*)malloc(sizeof(double) * C1_dimension);
+  for (int32_t ptA = 0; ptA < C1_dimension; ptA++) {
+    tA[ptA] = 0.0;
+  }
+  // Pass 1: one workspace entry per row of C.
+  for (int32_t i1439 = 0; i1439 < C1_dimension; i1439++) {
+    double ti1440tA_val = 0.0;
+    for (int32_t i1440C = C2_pos[i1439]; i1440C < C2_pos[(i1439 + 1)]; i1440C++) {
+      int32_t i1440 = C2_crd[i1440C];
+      ti1440tA_val += C_vals[i1440C] * v_vals[i1440];
+    }
+    tA[i1439] = ti1440tA_val;
+  }
+  // Pass 2: the coordinates stored in B index into the workspace.
+  for (int32_t i1438 = 0; i1438 < B1_dimension; i1438++) {
+    double ti1439A_val = 0.0;
+    for (int32_t i1439B = B2_pos[i1438]; i1439B < B2_pos[(i1438 + 1)]; i1439B++) {
+      int32_t i1439 = B2_crd[i1439B];
+      ti1439A_val += B_vals[i1439B] * tA[i1439];
+    }
+    A_vals[i1438] = ti1439A_val;
+  }
+  free(tA);
+
+  A->vals = (uint8_t*)A_vals;
+  return 0;
+}
+#include "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/spmv_spmv/spmv_fused.h"
+int _shim_assemble(void** parameterPack) {
+  return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3]));
+}
+int _shim_compute(void** parameterPack) {
+  return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3]));
+}
diff --git a/test/kernels/spmv_spmv/spmv_fused.h 
b/test/kernels/spmv_spmv/spmv_fused.h new file mode 100644 index 000000000..bc78275ac --- /dev/null +++ b/test/kernels/spmv_spmv/spmv_fused.h @@ -0,0 +1,125 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - 
lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +#ifndef TACO_GENERATED_assemble +#define TACO_GENERATED_assemble +int assemble(taco_tensor_t *A, taco_tensor_t *C, taco_tensor_t *v, taco_tensor_t *B); +#endif + +#ifndef TACO_GENERATED_compute +#define TACO_GENERATED_compute +int compute(taco_tensor_t *A, taco_tensor_t *C, taco_tensor_t *v, taco_tensor_t *B); +#endif diff --git a/test/kernels/spmv_spmv/spmv_fused.so b/test/kernels/spmv_spmv/spmv_fused.so new file mode 100755 index 0000000000000000000000000000000000000000..5efd6a4d8a9832f423c2f248ec61fa541f7833f7 GIT binary patch literal 14152 
zcmeHOdvF}ZneWlAv22NzY&Z~@;K3|+azJYdKaj8uT0QJp*BTUCN}L3HS?xp8MY}8Z zL2Phr6JhYMEa%HPfvO}G*H!8Ql}EWqxa3k-XJH5197V+`f{&{sIp<1jT#H@QB^4j zgsKYDTFZ@agVJ?E$(_`z^q7(^rM|baQOY-YBwIm8Em$yRyN|(+;{0=yY9_Hsl{Ztn zoLBKPRjjPr)hNt`YyCX~V%~4R{mOXL==D~>!$s{#xA>s0 z)y`|JsjGK)Y5V5r8p*E2brr6z-q(M9=&OgG{oyaaurpUbn0Pe4ukKq996k8P16yu5 z_`$>Rt^={x{;BZ3r?2ksU;fo=?|HVr^2N4}6HmlpI%7Hn$-*kq8-VFm@FT#g=~KO{ z=^vOu|AiUw_8I(a0=@*7Gfe_$6xWD8T`v>;7~DcpFFNZi_!=d3A<#x~t=Qtw|FY6Q zJ>OEiU+ISxKB+n`KU-f__#wJMNVmc-rQhQs|IT<`22SxDq??IEeS^}^aS?vMinD_{ z0uuGVh3FZ5>baL7e%_FHy_j(Lq53UEPx-0BSwE)qWhX;^vl8r7`28w=L*XJ68cL*+ zp-eWM&W1uFG?E<2iclOxgaZ8=L(y0|HZ+pS#?t*8yGK*WSbunMG-k=FOG1(DVR8^2 z9eF4wB8f4P4M$R;K_KZJn`7a0WY`~vze>0(7Eh&PA`u=PO+`dBMzMsXaWh=PqL9zVl0=9iP4e4h%b}!trDSF zG@K2?!(b+3RT@w-D%J-BUEQJEean3-O8MJ;E73}HgRG05-5R0&iGKpJ8u2#tcs$8o zH-Z*-iCWnYg~(k`e2utT^&cK@d7M10GL1YQ!xtrfA2d1egG%4fb*aaGm_NPP!f7m2 zT}|p0Z8(paB$%|}_O5u|hI1LD-=xYVT~#H5g>J*uBPQ`?8&2)Xsnv$_Ge@#^8*U%Z zJ8k#^b^>aQ|GL)5&nYzN!q_{Jt(iJ$jQ^~*Np^VeEcx^nB( z>1mU#N2wOmJna8KiiMTuKn9GZmtb$ac*$j$lg5k1yAAD_@sms0COCXRIjkp#9D>u| zp0&?GP2`pt<7?MbZh+#cH}Y$r09X77s#rAeb8OBd;59Nw#f13HQMl}})VH3EL&M_p zb8o%k=>F+=&67stR7d^OJD0rMLpA9s)L+qSp76g|T->Xj@c-m|AaZhb{nNXy?JN&94FkcOr zr_k@(&8+vfpxNVXES~ysYAO)LQgB%>dD&I+lKUYZGgS7CPjB_#=ilnz z<`1FCaR2@N1Do@64fDd@iCs7Mww-DFZ>t{|`R?ZNm$VJ}HF^LKXfX@79k=c-8u^Et zi$k)Ehii-5tV~mJz{=>wAZ5(|q+-r?B9G?3aJZQQ8(TL)4mbdM7x^0HJ)7yBMp7^2`NHcsVpn5`bXfn(S5>J;7M8FnR5*b8jSJdDDQ|n&~YL6&7gU_xaf~`YtWaC z{C)JeZE2;RicVths2D*}w|D;~zMtsXE(}*{95eFwVyyfGtUrr8fA`rp|A3LqHhwwf z7a^DLAbRWgqr)TF;c(l0p>nsEa?ABwqPO8H0V_(m<@CPP7P(f_nx-Dt7Wmu;dIIzi z=o_Go=tgg#YA3)KLEi*5aAPMyp8<8F1Drr_?gMScgTE8h0DT5@AF6i-^jT0|$MF1A zduWr;wl`{v>gU$(hmIHiXl-!f{M6JKsPHtd^E7Q}sQYZ~9&z{et3SHjyM$n}$Kx&< z5gTxy51s!_Yr7qPl?vHlzw+M8zvQO(#dxpbT7^%jkNu3xO=rkB&IzTbwzEp*U7E7uKdA!8Ck|O3 zZpTT*A5?ODu8?w7J>~WPj_|&@v&6?##lNKJKPdW-iXKz+l%l-fv%b6gPQ7_xFqh2c z^gDbjeXT9abCPS@)ztwLRg^ho`Gu@XpQIwH9pPdxwCF{-Ctd| zKwCJsv+3!s>oi2?S|Kbva8(V|++I;vcNZ+-C4#F)eumPt3!W>5hD6Ts=kqPI{xa#v 
zGHeG9MIy^0sbqX)XxYf}_EoS=XXD{aHWAJa`y$p>*1Q_7gzGh-x#nx1`d55c)imty z>)mZ_RzvD73G9jvwM0`n>^ru!w*tuKlCiZzv1BYg5&=s)o@=+auWDJfLT?%3zk6@j zTlk6(=0--NBXDDdrnkhCsTS>ldP%;+Pu>&i9 zr@G&Vip~>%n?2y3-?`_euQB9Ok8BM@xEW=s)V};!mq{`zv@nE7z~w?0cvwyt4~kRnB3M+*Nuz*tH4gx1|&3x1`IpJdhi{~Wkh<+*r&27C^^Y;Zow**_Obyt4lg=y7Itv)XkZ`A@wWIA-w>++Ig&hmim?B&j98e)y`=hk^0V3g*XLVuY|>0 zQXlL; zmcePNXeu-`ni>p`hN76MWJ2NGc0q@)Mq}An)OW|Kj`r$G@}OBLoKA;#;22gmy#oil z!iiWYnoA^hz{JMMlWAqDsd#)e6^?RfnX$x!p?EHX0wXY8x5>Y;H`IGi51ncYDc#V6 zVMIpZfIh{7wR>XN26yGARLBc-|K_-cKL&$ zd)KYo+}j`O_jd(*9VOxnn>>?Nih_z%ARTDy?BwHTbmZ($9=j_$*2ag~2eOP%zSRsAzg5l)vW8%Ul}6u!)k z1R54Jo3`jM=aQH*`!Iv`$$4+fNK{B}IGh<4zUYo5EG?SFOq%o_jHNRpsbm=wf-D^y z4U>V&jg4l7PnPaO);E*_n~iP9ue=4mbV{Bs^u>l%&mE4!33HaI)rBn!&ZC5I0&{!v zNS+ZKol^ASwg?}dqXeD}V6nm65KO|!Av|=79ZAMhoDUBUrehB>hk1OA^Jo~(($Pv~ zt2|X1CjQqD&5USjis3~jo|iNADx8UPy-mm4Kfqd??RoyrG^z~wdxB#+ffMmU(6Har~^ubO18? zcELQaFPOGUYnr2!$FE>P?L#uQFReq=dSpT=a{1hTT>qr9?^1faUSYc5W{=rNMf+_B zMs32&pFXutVanfilp7f<=ZR);+3k7#Q=)cdx)-zJ@09)R3gPt`(~gp+mEr!!^vlYg@4xf^ zON=_~o#j90u;=w2(;sj_$`7|2yZtUQv_vfRFSX9&|3~3$FSciT7`C*gVS8RjG^wQ$ z>p?{(=9#_%ljy)(3|Y_OrtO*kB{HPL@$>B7UuNJb|nh{6&QriDf?|DMJwaft)Ibu?oF2dy35#KKZE^7Wj}No z`@dB7&U$`N+1EjCPs}oHLx%Hqy~9f^?ruAXyi<404EC3ly?z;c^2cL`T~)u?;)d0D vcoSF}i?~1W`-8?e?t7Mn^}CNgFd#keaPEwM0d%U_-`OI~#Ri9ggBAY+iVKWt literal 0 HcmV?d00001 diff --git a/test/kernels/spmv_spmv/spmv_spmv_default.c b/test/kernels/spmv_spmv/spmv_spmv_default.c new file mode 100644 index 000000000..dfaa1c4b0 --- /dev/null +++ b/test/kernels/spmv_spmv/spmv_spmv_default.c @@ -0,0 +1,157 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? 
(_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = 
(taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +int assemble(taco_tensor_t *ref, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *v) { + double* restrict ref_vals = (double*)(ref->vals); + + ref_vals = (double*)malloc(sizeof(double) * 5); + + ref->vals = (uint8_t*)ref_vals; + return 0; +} + +int compute(taco_tensor_t *ref, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *v) { + double* restrict ref_vals = (double*)(ref->vals); + int B1_dimension = (int)(B->dimensions[0]); + int* restrict B2_pos = (int*)(B->indices[1][0]); + int* restrict B2_crd = (int*)(B->indices[1][1]); + double* restrict B_vals = (double*)(B->vals); + int* restrict C2_pos = (int*)(C->indices[1][0]); + int* restrict C2_crd = (int*)(C->indices[1][1]); + double* restrict C_vals = (double*)(C->vals); + double* restrict v_vals = (double*)(v->vals); + + #pragma omp parallel for schedule(runtime) + for (int32_t i1438 = 0; i1438 < B1_dimension; i1438++) { + double ti1439ref_val = 0.0; + for (int32_t i1439B = B2_pos[i1438]; i1439B < B2_pos[(i1438 + 1)]; i1439B++) { + int32_t i1439 = 
B2_crd[i1439B]; + for (int32_t i1440C = C2_pos[i1439]; i1440C < C2_pos[(i1439 + 1)]; i1440C++) { + int32_t i1440 = C2_crd[i1440C]; + ti1439ref_val += (B_vals[i1439B] * C_vals[i1440C]) * v_vals[i1440]; + } + } + ref_vals[i1438] = ti1439ref_val; + } + return 0; +} +#include "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/spmv_spmv/spmv_spmv_default.h" +int _shim_assemble(void** parameterPack) { + return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3])); +} +int _shim_compute(void** parameterPack) { + return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3])); +} diff --git a/test/kernels/spmv_spmv/spmv_spmv_default.h b/test/kernels/spmv_spmv/spmv_spmv_default.h new file mode 100644 index 000000000..b53193484 --- /dev/null +++ b/test/kernels/spmv_spmv/spmv_spmv_default.h @@ -0,0 +1,125 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? 
(_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = 
(taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +#ifndef TACO_GENERATED_assemble +#define TACO_GENERATED_assemble +int assemble(taco_tensor_t *ref, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *v); +#endif + +#ifndef TACO_GENERATED_compute +#define TACO_GENERATED_compute +int compute(taco_tensor_t *ref, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *v); +#endif diff --git a/test/kernels/ttm_ttm/fused copy.c b/test/kernels/ttm_ttm/fused copy.c new file mode 100644 index 000000000..5d40c8aa9 --- /dev/null +++ b/test/kernels/ttm_ttm/fused copy.c @@ -0,0 +1,248 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? 
(_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = 
(taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +int assemble(taco_tensor_t *A1532, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D) { + int A15321_dimension = (int)(A1532->dimensions[0]); + int A15323_dimension = (int)(A1532->dimensions[2]); + int* restrict A15322_pos = (int*)(A1532->indices[1][0]); + int* restrict A15322_crd = (int*)(A1532->indices[1][1]); + double* restrict A1532_vals = (double*)(A1532->vals); + int B1_dimension = (int)(B->dimensions[0]); + int* restrict B2_pos = (int*)(B->indices[1][0]); + int* restrict B2_crd = (int*)(B->indices[1][1]); + + A15322_pos = (int32_t*)malloc(sizeof(int32_t) * (A15321_dimension + 1)); + A15322_pos[0] = 0; + for (int32_t pA15322 = 1; pA15322 < (A15321_dimension + 1); pA15322++) { + A15322_pos[pA15322] = 0; + } + int32_t A15322_crd_size = 1048576; + A15322_crd = (int32_t*)malloc(sizeof(int32_t) * A15322_crd_size); + int32_t i1543A1532 = 0; + + for (int32_t i1547 = 0; i1547 < ((B1_dimension + 15) / 16); i1547++) { + for (int32_t i1548 = 0; i1548 < 16; i1548++) { + int32_t i1542 = i1547 * 16 + 
i1548; + if (i1542 >= B1_dimension) + continue; + + int32_t pA15322_begin = i1543A1532; + + for (int32_t i1543B = B2_pos[i1542]; i1543B < B2_pos[(i1542 + 1)]; i1543B++) { + int32_t i1543 = B2_crd[i1543B]; + if (A15322_crd_size <= i1543A1532) { + A15322_crd = (int32_t*)realloc(A15322_crd, sizeof(int32_t) * (A15322_crd_size * 2)); + A15322_crd_size *= 2; + } + A15322_crd[i1543A1532] = i1543; + i1543A1532++; + } + + A15322_pos[i1542 + 1] = i1543A1532 - pA15322_begin; + } + } + + int32_t csA15322 = 0; + for (int32_t pA153220 = 1; pA153220 < (A15321_dimension + 1); pA153220++) { + csA15322 += A15322_pos[pA153220]; + A15322_pos[pA153220] = csA15322; + } + + A1532_vals = (double*)malloc(sizeof(double) * (i1543A1532 * A15323_dimension)); + + A1532->indices[1][0] = (uint8_t*)(A15322_pos); + A1532->indices[1][1] = (uint8_t*)(A15322_crd); + A1532->vals = (uint8_t*)A1532_vals; + return 0; +} + +int compute(taco_tensor_t *A1532, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D) { + int A15321_dimension = (int)(A1532->dimensions[0]); + int A15323_dimension = (int)(A1532->dimensions[2]); + int* restrict A15322_pos = (int*)(A1532->indices[1][0]); + double* restrict A1532_vals = (double*)(A1532->vals); + int B1_dimension = (int)(B->dimensions[0]); + int* restrict B2_pos = (int*)(B->indices[1][0]); + int* restrict B2_crd = (int*)(B->indices[1][1]); + int* restrict B3_pos = (int*)(B->indices[2][0]); + int* restrict B3_crd = (int*)(B->indices[2][1]); + double* restrict B_vals = (double*)(B->vals); + int C1_dimension = (int)(C->dimensions[0]); + int C2_dimension = (int)(C->dimensions[1]); + double* restrict C_vals = (double*)(C->vals); + int D1_dimension = (int)(D->dimensions[0]); + int D2_dimension = (int)(D->dimensions[1]); + double* restrict D_vals = (double*)(D->vals); + +// int32_t i1543A1532 = 0; + + #pragma omp parallel for schedule(static) + for (int32_t pA1532 = 0; pA1532 < (A15322_pos[A15321_dimension] * A15323_dimension); pA1532++) { + A1532_vals[pA1532] = 0.0; + } + + 
double* restrict tA1532_all = 0; // scratch slab: D1_dimension doubles per OpenMP thread (declaration was misspelled rA1532_all) + tA1532_all = (double*)malloc(sizeof(double) * D1_dimension * omp_get_max_threads()); + + #pragma omp parallel for schedule(runtime) + for (int32_t i1547 = 0; i1547 < ((B1_dimension + 15) / 16); i1547++) { + for (int32_t i1548 = 0; i1548 < 16; i1548++) { + int32_t i1542 = i1547 * 16 + i1548; + if (i1542 >= B1_dimension) + continue; + + double* restrict tA1532 = 0; + tA1532 = &tA1532_all[D1_dimension*omp_get_thread_num()]; + // tA1532 = (double*)malloc(sizeof(double) * D1_dimension); + + for (int32_t i1543B = B2_pos[i1542]; i1543B < B2_pos[(i1542 + 1)]; i1543B++) { + for (int32_t ptA1532 = 0; ptA1532 < D1_dimension; ptA1532++) { + tA1532[ptA1532] = 0.0; + } + for (int32_t i1544B = B3_pos[i1543B]; i1544B < B3_pos[(i1543B + 1)]; i1544B++) { + int32_t i1544 = B3_crd[i1544B]; + for (int32_t i1545 = 0; i1545 < D1_dimension; i1545++) { + int32_t i1545C = i1544 * C2_dimension + i1545; + tA1532[i1545] = tA1532[i1545] + B_vals[i1544B] * C_vals[i1545C]; + } + } + for (int32_t i1545 = 0; i1545 < D1_dimension; i1545++) { + for (int32_t i1546 = 0; i1546 < D2_dimension; i1546++) { + int32_t i1546A1532 = i1543B * A15323_dimension + i1546; + int32_t i1546D = i1545 * D2_dimension + i1546; + A1532_vals[i1546A1532] = A1532_vals[i1546A1532] + tA1532[i1545] * D_vals[i1546D]; + } + } + // i1543A1532++; + } + + + } + + } + free(tA1532_all); + + A1532->indices[1][0] = (uint8_t*)(A15322_pos); + A1532->vals = (uint8_t*)A1532_vals; + return 0; +} +#include "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/ttm_ttm/fused.h" +int _shim_assemble(void** parameterPack) { + return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3])); +} +int _shim_compute(void** parameterPack) { + return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3])); +} diff --git
a/test/kernels/ttm_ttm/fused.c b/test/kernels/ttm_ttm/fused.c new file mode 100644 index 000000000..f490913cb --- /dev/null +++ b/test/kernels/ttm_ttm/fused.c @@ -0,0 +1,242 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while 
(upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +int assemble(taco_tensor_t *A1532, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D) { + int A15321_dimension = (int)(A1532->dimensions[0]); + int A15323_dimension = (int)(A1532->dimensions[2]); + int* restrict A15322_pos = (int*)(A1532->indices[1][0]); + int* restrict A15322_crd = (int*)(A1532->indices[1][1]); + double* restrict A1532_vals = (double*)(A1532->vals); + int B1_dimension = (int)(B->dimensions[0]); + int* restrict B2_pos = (int*)(B->indices[1][0]); + int* restrict B2_crd = (int*)(B->indices[1][1]); + + A15322_pos = (int32_t*)malloc(sizeof(int32_t) * 
(A15321_dimension + 1)); + A15322_pos[0] = 0; + for (int32_t pA15322 = 1; pA15322 < (A15321_dimension + 1); pA15322++) { + A15322_pos[pA15322] = 0; + } + int32_t A15322_crd_size = 1048576; + A15322_crd = (int32_t*)malloc(sizeof(int32_t) * A15322_crd_size); + int32_t i1543A1532 = 0; + + for (int32_t i1547 = 0; i1547 < ((B1_dimension + 15) / 16); i1547++) { + for (int32_t i1548 = 0; i1548 < 16; i1548++) { + int32_t i1542 = i1547 * 16 + i1548; + if (i1542 >= B1_dimension) + continue; + + int32_t pA15322_begin = i1543A1532; + + for (int32_t i1543B = B2_pos[i1542]; i1543B < B2_pos[(i1542 + 1)]; i1543B++) { + int32_t i1543 = B2_crd[i1543B]; + if (A15322_crd_size <= i1543A1532) { + A15322_crd = (int32_t*)realloc(A15322_crd, sizeof(int32_t) * (A15322_crd_size * 2)); + A15322_crd_size *= 2; + } + A15322_crd[i1543A1532] = i1543; + i1543A1532++; + } + + A15322_pos[i1542 + 1] = i1543A1532 - pA15322_begin; + } + } + + int32_t csA15322 = 0; + for (int32_t pA153220 = 1; pA153220 < (A15321_dimension + 1); pA153220++) { + csA15322 += A15322_pos[pA153220]; + A15322_pos[pA153220] = csA15322; + } + + A1532_vals = (double*)malloc(sizeof(double) * (i1543A1532 * A15323_dimension)); + + A1532->indices[1][0] = (uint8_t*)(A15322_pos); + A1532->indices[1][1] = (uint8_t*)(A15322_crd); + A1532->vals = (uint8_t*)A1532_vals; + return 0; +} + +int compute(taco_tensor_t *A1532, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D) { + int A15321_dimension = (int)(A1532->dimensions[0]); + int A15323_dimension = (int)(A1532->dimensions[2]); + int* restrict A15322_pos = (int*)(A1532->indices[1][0]); + double* restrict A1532_vals = (double*)(A1532->vals); + int B1_dimension = (int)(B->dimensions[0]); + int* restrict B2_pos = (int*)(B->indices[1][0]); + int* restrict B2_crd = (int*)(B->indices[1][1]); + int* restrict B3_pos = (int*)(B->indices[2][0]); + int* restrict B3_crd = (int*)(B->indices[2][1]); + double* restrict B_vals = (double*)(B->vals); + int C1_dimension = (int)(C->dimensions[0]); + int 
C2_dimension = (int)(C->dimensions[1]); + double* restrict C_vals = (double*)(C->vals); + int D1_dimension = (int)(D->dimensions[0]); + int D2_dimension = (int)(D->dimensions[1]); + double* restrict D_vals = (double*)(D->vals); + +// int32_t i1543A1532 = 0; + + #pragma omp parallel for schedule(static) + for (int32_t pA1532 = 0; pA1532 < (A15322_pos[A15321_dimension] * A15323_dimension); pA1532++) { + A1532_vals[pA1532] = 0.0; + } + + #pragma omp parallel for schedule(runtime) + for (int32_t i1547 = 0; i1547 < ((B1_dimension + 15) / 16); i1547++) { + for (int32_t i1548 = 0; i1548 < 16; i1548++) { + int32_t i1542 = i1547 * 16 + i1548; + if (i1542 >= B1_dimension) + continue; + + double* restrict tA1532 = 0; + tA1532 = (double*)malloc(sizeof(double) * D1_dimension); + + for (int32_t i1543B = B2_pos[i1542]; i1543B < B2_pos[(i1542 + 1)]; i1543B++) { + for (int32_t ptA1532 = 0; ptA1532 < D1_dimension; ptA1532++) { + tA1532[ptA1532] = 0.0; + } + for (int32_t i1544B = B3_pos[i1543B]; i1544B < B3_pos[(i1543B + 1)]; i1544B++) { + int32_t i1544 = B3_crd[i1544B]; + for (int32_t i1545 = 0; i1545 < D1_dimension; i1545++) { + int32_t i1545C = i1544 * C2_dimension + i1545; + tA1532[i1545] = tA1532[i1545] + B_vals[i1544B] * C_vals[i1545C]; + } + } + for (int32_t i1545 = 0; i1545 < D1_dimension; i1545++) { + for (int32_t i1546 = 0; i1546 < D2_dimension; i1546++) { + int32_t i1546A1532 = i1543B * A15323_dimension + i1546; + int32_t i1546D = i1545 * D2_dimension + i1546; + A1532_vals[i1546A1532] = A1532_vals[i1546A1532] + tA1532[i1545] * D_vals[i1546D]; + } + } + // i1543A1532++; + } + + free(tA1532); + } + } + + A1532->indices[1][0] = (uint8_t*)(A15322_pos); + A1532->vals = (uint8_t*)A1532_vals; + return 0; +} +#include "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/ttm_ttm/fused.h" +int _shim_assemble(void** parameterPack) { + return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), 
(taco_tensor_t*)(parameterPack[3])); +} +int _shim_compute(void** parameterPack) { + return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3])); +} diff --git a/test/kernels/ttm_ttm/fused.h b/test/kernels/ttm_ttm/fused.h new file mode 100644 index 000000000..d613c8f07 --- /dev/null +++ b/test/kernels/ttm_ttm/fused.h @@ -0,0 +1,125 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; 
+} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +#ifndef TACO_GENERATED_assemble +#define TACO_GENERATED_assemble +int assemble(taco_tensor_t *A1532, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D); +#endif + +#ifndef TACO_GENERATED_compute +#define TACO_GENERATED_compute +int compute(taco_tensor_t *A1532, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D); +#endif diff 
--git a/test/kernels/ttm_ttm/fused.so b/test/kernels/ttm_ttm/fused.so new file mode 100755 index 0000000000000000000000000000000000000000..69c65a1dc2a6cdd101f40ffea8b439083d8e2761 GIT binary patch literal 14560 zcmeHOeQ;aVmA}tY5+yNuN-)6?nuze|+9h@@2*J)rk!;y{%E*bAIMlRB6-Ab8Pi@KA zG9)z3M-_)W9tz$qEVQN5blc7_UABZSEtKsHc0#^ryW@lc>9F0pTM9CvjT0b*65{=x z`|hgCQs*{-#zEtd(ZuR_g(L-4b~MX3X{RcKF^3dFv~)`B4Kldh=6!m zC7XfYMeKSmJ8e8^LhQhG79=TSZVpf$)A3yV3}->zJR)gdfrURNsMI5t^jr(9bk_nm zk4QQwsv@bw5@U8ydX9_qe5wgKNlBJM!=IAP6hC+kXD86@7R)4-dVdQ&D(8PUxP;&x zBEPBp;t>9jR8)u8tq3-3cvi37z|^-+8QreT-AbhRy?>qg@ec>a%nzrCk$OUItrzS&RSefYrXyPK{% zaN(=5>W8B*|2%!p*)aPbPrUu|S?AVYKH{l5{y+?>Q-=GIGIx?x>L%_<@CSiSCjYxB z@aLw$cTItRe+vA>6gc_YjVC{B129=RE1@tM9-c}bxEoJ?@J?ap4hSyF7Y6@z$h+88 zY@gemliJAb6taq|Ecn}^KRv?b=dr##`5T1%Aax3a&HR_K5zx7;kPV5-$vNOgA%EaX zOYkP}6bw)on>V36YlVEL7+)eA77_U1)s{Y$j{^12xojruo5|U+pG`u(FVD_AE`J64 zfxxB0dmJxhBclB4MZ#=hr$LlomkOa!OGhFe>Q08clA#a_wZ+?#EEEG_p+IAOs5#md zZE5RHM!Op8YuXd>Xk&Owd(=vsJS7y_8YTnb_O|q$mgd)t;4M9aGqfcT0;YVzKZZ0L`wIF7^*=A3!5cmc1Jl{JB&HE>XFm}9Uaa6y&f~q!C}# z)^-I$NuCr~&$Vsn(FN>hLSC+ea{cO?O%2=_;m)00jC7$Ze!pZY8lw{%5GBQ+T;c||lw8|`e zjuZygVgd5iouMj`T z`D)^+iHG|+znpk-{qRA~-%LEYeE0z8KSw;ddbp4Ci-;#z4)5apwZxMPhdVhxmw0mB z@Mg}>BA&X=a0BNHh$j~gdpZBttH6`%hATLKo_KQEu$%K+z{mQ)^Pv8CqjqU`P`l*a z#hCizM(xt!Sh>1zkCnKP>FHGiP}rcSr~MpMuezuke@b2h7Zwl;7hbVxbZA^<(_!-3 zI1l~ZB$&SSK5nM;GW7L<%QoE@(FZcO>&j96!+MtJ zsa20c0VP9u3UzbUT;Q3{po$qCKS!s{1h0@fD#qBmhhfrhiEkQ?LBisFU+~@eJhi58 zgRgPJdVTjhP2hC%HZ!C*>!$t25~S(I9!&@A={Hw<{pP*?V0w?{B{}06t)B$UgiB99 zsL^gDXzr{Cnk^;J8RSW;-NxhE6Nvhaq*kdLPih%3L8O;(6MM9SmRzxJJgB(^MjI{N zTCOe<-#}WbUp0QAJ&c$d81ak_T(;|x3z9|1g3~Ks0bzz;+pMSUTthdu zYi=@ioD0<2DNd75&={&WE(MHZ<}WO-n0vSDiYhEUQ$FhktT%10?r z2bbGp(PMtC6KSm8c(dO4m~uG{?#v?US$YbIik`ACyKaKjQx{$8o^5tG$+Bt#SY^3s z$G=hbRQb%)mqt;)BRYi0p6M{X_NoCjJ%Ygs8FHl_Iho2RrN?z;P<`q{)VH0qskz_~ z6m+}RL;zJ9Ub&x&_827^8T4fv9N^a~Q*SB8WmM@&B$QqZrkmeKdQz+K8OPwwm8qf2 z3Pp4(2YKCh@+|DO>S_NpG`3$W=Ka)c)GB*Vsr#3paJu=hb^wx|3%YSO^ZZAshDjxf 
z%rL)TuR5gex4is}_8f##Z!1A$=O{hohuRQgWIQkNBVuM1_U?R{O33?#3jH2)EHUcKwkd9|q)kvX+Y zhr=skYROBUQ!%yZL?FHK2*wJpM@)6uZK*>_OucI$=2M&OcAr{mx9h2Xm#^xzdxy-8 znqIY0TbT^Ncdh6cdV1q4EM26e`jskpQ;(eDS-v8(v>`{^j`@&Oj_Z%<=|-gZ)Vms~ zOigo{RrIRMYA?-yK@+|}MV~-LdoQT_e@{&pY1G<-@J>HKKFJ^)Fvn7FURK@>8iSAt>dI@7 zYWAHR4l1wdMx*8p7;guS3!c}}eMLLz#yLG*=L(jd)8Qr;`Dm8x4z;Rv>lAq1%$0+pnAnu4u&I=q=!_ zO08mAt>Q;}@%A&V_M<7H$qtZ@Xq2Z(p@F=@8mnsW_xL<;Hgo;O(b3^2F`8&z@$_5k z0UTl<^N5|h#BM`19l@%LJ^wk}j?8(#_Vf?0!(7`3>I8=Qh~_pvG0|M+tvBAo_$I90 zIE&OFw6P0w5)D=Lp*egEKVvNbC@D?)neTnT=kDI0{S|B28(?ULpk=37zr|XIdgki8 zUu**MrA@w#zD>R_`$C5^4?uE!y~Pq{d_1a`Ly`3^V?Z};roB&ns;20h zAADSO@@v!3E}m2BQ^%E)%JHg!uPw&B7dhd%fEkc_r2{3_%^C;okG`}C{r>py?XV^L zA4d=OnKN#hezN!`B_slN2;(OM7IYkMzN zHspq-;mi%Gi%QRxFVGB>`NR8`S@#;leqSJU%pR=Tt`#TiYE1j*bqq*HwPCNSHvOwu zjbUF&E2XR2v{&}_*BHQhe)L>=?7s^E{dzl|7x+(|Et9O_DhSeXO^C z#|^cfH#~o~#tnK(N$QBQ#$4$Rplf7ad~bBL>ZY9;-TZ1v=2S`l~V zaf%qfr)-A%5a;b-ZN*aWv|1aW?U**r3e)R=3pN5O@uhbtw3)&Z$Q_5Zw9cUqyj9>g zD-kWxA?wGYTBMuUSbF)~T#|X3XLXbu={<(3a~Ci0rH-jSHE?X;qP@2F0@7Bh`~QL= zhUMDYF72>m-AxzqM#!v!jTbPyStf!Q+AUxVsQX`syq>PHW8*?C!$2zqA1=8^1d7r|VRr7X~`Ko?jYc`j$oqw!V;Lc-4p%#dIpbxxRklFt(I%uJG<^&qq8ZT7d zV&k_r1+_w~QP_7kYVK+y_#=A&tsh7o1^w8i_O7Bi5njNAh^>JaMXE%^Qn97#z5p_w4|-~)Q&~a!gdSG(Q~LV!&HUwZY$bt6z#@@tW+KS z+W8>Ln)&T{>Rm}q52C2Ld8dOI`1od`4MAfK+P|&>{khfCpVghMe(O0gT#0g6ZTb)_ zp&)3YmA(DyL#3BTM?IrwzU3Jmz8f-D|EKxF>Yvz}?tbS9lq}~xf%b!*e(Jc``e|OX z$w7UxmiI~BPIjQq7(*&Vc98=>#trEYGr%h!i*N&-MUh-o{ zv=24qdAmuY1Y#S}S7E<_qbLpRAkSjLHX|71Ye&p_)R?L_qScS*ucimhEjW581_rk#K!`qP@`-JpgA)TKI zVA47aR`2OR-DjW_)f6#F#zerzD<6|Mrl-3aOh?F2mlx(9Rw z^f6F3R{J+VcY(T~??%7c4BC%Qbr5t2^hHn#MHXSoeRnWrt4o<*IK$BgT@5&$;cq@W zIyyu$PFJ0?c#S&yUdJwW`?V`RcZ;@&VAA*FDMndoewQJL$VNOal>I23HdH%Z4;559 zi}%}BJKbq}wX@{GX+CGg-swJPWopK1XP?b>VEQa)B`}|}q}u6*EOe@!4j&yZRN$dE zoIY#`5}?fIbnPwhIg3+dub_UGvlxia>1dekmr8*N&CHZmj%koSimBaaQ^#lpc0U@U+6#Qnb%z#K{*AQrY*~ zd`@@DUeL%(jH=>g0Lc$SwiUAcEPp&-_+*0L{LTZ4ZCwHURx=^z)#Nv_M}6Tr^aXzA z{>k>%IENHlLYdSaZwK<#QuG}MHlwod5T*kw%ELASbNyv>R2;oS>i@(4mIvf{vOGVQ 
z=f!Ax9_Xx_g5!Sefx{LCWHujdPSYI6$oU6F!3uckSJ0=hB}?_ait z>xjrtj>8fWk!QM1=qnWD+3X=9FVAF)tz>?>&!cWPCqj)?3VGQl<(cr+La#{_NaBx) zh%9HnkdtTRGCxe6e7Y$S@|Zd~FZ0E;$N9HKMKEkRPh05!iy*(7=jVM?RP0qj-xTzM zpworp=Lz~*L6-?yC+MAm#ss}b(7l4n?-8qOYL>f8Hf+JSuO9bp<+qksl-<(9InNGH zWqHNYQVV5^QBup9R#V6hJk_M}MaWyur74@S!aex;3B~e4!3Y3leZD>A@jHRrc2MG% zDawQ0KI1Y)lsU?c&MRlzob#O=m>B=9)Rm+!@~DN2{+_2Hmb?EZm*!2q#<1Bi3-8Ss*K6dzmS=S;`k*D!|t*1@{8nH6D-UNOxHWzEZZ7 zH79!T39hWN0zk4S9$nQEjYqrMB4Fv0->R*Z%gUB5b(gis|Lbmcm&qr#rKhdExeaEl zqPfdr@kAN_;#k(%l}JV-$wU{x&Nh6(+#2cbVxLlFUN z_+C%xHtc%kJ)B&5l~u@vB`d*(ms+_FWx=pBXKek+#pmK)kvkjWaT~jm$?v7vi8k8J zx+jyriphB`SAOFC2THdwx$0y~6E0Vkd|a-w`S>JG;P&MzoG&j|-+cTtxq#qoY{3Ma zy0M*|8_iEp0TkeVh#V<$dM3-0$(IQE{=9zjIe{~|u!Ez;`m zW?MKuvE2>eWJm6Y!16#RXv!r0=NaIWm7m_tCc}RToclqPpTFAx`<=i$#r+e$2>>>g zf6gLJVJdq(ugRt=55y-e&&S-(#Bt<;qm)U;m&Wml{n^9uiQ{ECclbp6^}^0Rx0{MK z7VhHs#Br41_(VIYDR6p^6|Nk!O_3h~cjq7$b94&%w>XaeAch!9#>4x-C%aE|X$m{V z*rUv46X&r792Z8j$Y*WVeX9I>S<9!8UpocfF$MkraCcUIGWty}Kk**dZv>uykLx>A z*r!i8bib?5otw+kQ^+3$KG}V+QzGB|`(|%VVP~fBTmC&Rx4`r7mHnBwS0T&4S4Q74 z={{Tjy|2~4b8BRIF3jbn7su%~;ETrDvDhPAev)-Fcf*srWkFhZvL_b9#%}z2St!{N ziqHkKZrp`xPJ~+86I;UVp=Ru~xj^A|45lzHm?Frl)6Ge;lbR(|_&lmT%%0tLM6pQ2XmblS1 zhE%|E-5tLA+EDFUKV6~=2~(ka!YDl5<{F1$J@JUdn!6LB)^NO;F4Xz&TI;J1)IbvZ z*-))6Eb9I{0O9VPw7ojGuG$w2t*fhBU)vaJ^i>CI^K!&hHGcbStQ3@`0O_Kg*DG(5 z(fzScy5%?V@*dsDgFapAqbq6Fg*SP_OZO?nM*ykI%E8?(wb#bv|1 zn}QRh#N9*QIFnwp)J7XeLnS+rHg;&`_y$E!v; ze?cj0Yi68l4R^P)^5$)EXj(LhS00kPC)(BBmWbytA*6Lh+ry+FVx8?tR?c%TN3^^p z0X9jOJIi^0D37*^q1oCDWyx78))=)UWgM2n9eDX7OJos+qDz$JxI@Xx(H%Q5_<=>Y z^t3=F9B;v_BG|TgEFt6JEnB*x_ec&eT2UE?Lu8b$sR~_TX>9iUpGRotN=GS}{&|q^ zO_FK?mn8ojr5p+PikXo5^8HHEW}%4bnFo2kBUt>U1LIpysW0EdBpnn9lrNT+v2!fB zbN2!x+frY?uSqJ;tsue!*~kt$7&=3e`trR_(v?D3mS4(Ax&i5QCM9|K{wHat(5Lc~ zOn&{zKQfI(<$I!}{RAS&{H4CEe;jF~Urj;?@_kZLc@9VVwAael-wKR;BG3Kgy>ChL z-<&9~?9dG+TVKAPN;;G+jJQ0fl$4+Q03XX=yr0PTSy^6bM)H#0kA&>}<@=DNc6w3D z3Nl{_r4h;2m*=dK=D%*|*Z&(rU$(z|f0lH+%t+9D{qN=J%lB+aJEdWvFY7JM{u3f} 
z#3lQOd>@y8-;!TnsW0iX&?Og1eR*!|5U-+A4kA2AUed!rsqLh`e7~1}KQu_rMUeHD zda_(!M;eJpeR&Qrw2-tIbMxQvAE__tuaK0jKXzUq6nBlQ@|e_&tV6U2aQJ z%30j__>uhUh>(mdzkL7N=e8vB^(n9HFa(CMifj;Zsb89hOMOX)kuKxeqEEXmF5hmx z?8Pbc{~+|gY?&GR&DXaB$j3L=f^!Fy& zL9&vTA|m72avviTUNHyZw10Tyg#qOTRi#A&+JM1#?@JGXMYp literal 0 HcmV?d00001 diff --git a/test/kernels/ttm_ttm/gemm.c b/test/kernels/ttm_ttm/gemm.c new file mode 100644 index 000000000..ee2b24e99 --- /dev/null +++ b/test/kernels/ttm_ttm/gemm.c @@ -0,0 +1,181 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + 
else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +int assemble(taco_tensor_t *A2886, taco_tensor_t *C, taco_tensor_t *D) { + int A28861_dimension = (int)(A2886->dimensions[0]); + int A28862_dimension = (int)(A2886->dimensions[1]); + double* restrict A2886_vals = (double*)(A2886->vals); + + A2886_vals = (double*)malloc(sizeof(double) 
* (A28861_dimension * A28862_dimension)); + + A2886->vals = (uint8_t*)A2886_vals; + return 0; +} + +int compute(taco_tensor_t *A2886, taco_tensor_t *C, taco_tensor_t *D) { + int A28861_dimension = (int)(A2886->dimensions[0]); + int A28862_dimension = (int)(A2886->dimensions[1]); + double* restrict A2886_vals = (double*)(A2886->vals); + int C1_dimension = (int)(C->dimensions[0]); + int C2_dimension = (int)(C->dimensions[1]); + double* restrict C_vals = (double*)(C->vals); + int D1_dimension = (int)(D->dimensions[0]); + int D2_dimension = (int)(D->dimensions[1]); + double* restrict D_vals = (double*)(D->vals); + + #pragma omp parallel for schedule(static) + for (int32_t pA2886 = 0; pA2886 < (A28861_dimension * A28862_dimension); pA2886++) { + A2886_vals[pA2886] = 0.0; + } + + #pragma omp parallel for schedule(runtime) + for (int32_t i1551 = 0; i1551 < ((C1_dimension + 31) / 32); i1551++) { + for (int32_t i1553 = 0; i1553 < ((D1_dimension + 31) / 32); i1553++) { + for (int32_t i1555 = 0; i1555 < ((D2_dimension + 31) / 32); i1555++) { + for (int32_t i1552 = 0; i1552 < 32; i1552++) { + int32_t i1544 = i1551 * 32 + i1552; + if (i1544 >= C1_dimension) + continue; + + for (int32_t i1554 = 0; i1554 < 32; i1554++) { + int32_t i1545 = i1553 * 32 + i1554; + int32_t i1545C = i1544 * C2_dimension + i1545; + if (i1545 >= D1_dimension) + continue; + + for (int32_t i1556 = 0; i1556 < 32; i1556++) { + int32_t i1546 = i1555 * 32 + i1556; + int32_t i1546D = i1545 * D2_dimension + i1546; + int32_t i1546A2886 = i1544 * A28862_dimension + i1546; + if (i1546 >= D2_dimension) + continue; + + A2886_vals[i1546A2886] = A2886_vals[i1546A2886] + C_vals[i1545C] * D_vals[i1546D]; + } + } + } + } + } + } + return 0; +} +#include "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/ttm_ttm/gemm.h" +int _shim_assemble(void** parameterPack) { + return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2])); +} +int _shim_compute(void** 
parameterPack) { + return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2])); +} diff --git a/test/kernels/ttm_ttm/gemm.h b/test/kernels/ttm_ttm/gemm.h new file mode 100644 index 000000000..20cd2db53 --- /dev/null +++ b/test/kernels/ttm_ttm/gemm.h @@ -0,0 +1,125 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if 
(array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +#ifndef TACO_GENERATED_assemble +#define TACO_GENERATED_assemble +int assemble(taco_tensor_t *A2886, taco_tensor_t *C, taco_tensor_t *D); +#endif + +#ifndef TACO_GENERATED_compute +#define TACO_GENERATED_compute +int compute(taco_tensor_t *A2886, taco_tensor_t *C, taco_tensor_t *D); +#endif diff --git a/test/kernels/ttm_ttm/ttm1_1.c b/test/kernels/ttm_ttm/ttm1_1.c new file mode 100644 index 000000000..e016491a2 --- 
/dev/null +++ b/test/kernels/ttm_ttm/ttm1_1.c @@ -0,0 +1,219 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue 
< target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +int assemble(taco_tensor_t *A2398, taco_tensor_t *B, taco_tensor_t *C) { + int A23981_dimension = (int)(A2398->dimensions[0]); + int A23983_dimension = (int)(A2398->dimensions[2]); + int* restrict A23982_pos = (int*)(A2398->indices[1][0]); + int* restrict A23982_crd = (int*)(A2398->indices[1][1]); + double* restrict A2398_vals = (double*)(A2398->vals); + int B1_dimension = (int)(B->dimensions[0]); + int* restrict B2_pos = (int*)(B->indices[1][0]); + int* restrict B2_crd = (int*)(B->indices[1][1]); + + A23982_pos = (int32_t*)malloc(sizeof(int32_t) * (A23981_dimension + 1)); + A23982_pos[0] = 0; + for (int32_t pA23982 = 1; pA23982 < (A23981_dimension + 1); pA23982++) { + A23982_pos[pA23982] = 0; + } 
+ int32_t A23982_crd_size = 1048576; + A23982_crd = (int32_t*)malloc(sizeof(int32_t) * A23982_crd_size); + int32_t i1543A2398 = 0; + + for (int32_t i1547 = 0; i1547 < ((B1_dimension + 15) / 16); i1547++) { + for (int32_t i1548 = 0; i1548 < 16; i1548++) { + int32_t i1542 = i1547 * 16 + i1548; + if (i1542 >= B1_dimension) + continue; + + int32_t pA23982_begin = i1543A2398; + + for (int32_t i1543B = B2_pos[i1542]; i1543B < B2_pos[(i1542 + 1)]; i1543B++) { + int32_t i1543 = B2_crd[i1543B]; + if (A23982_crd_size <= i1543A2398) { + A23982_crd = (int32_t*)realloc(A23982_crd, sizeof(int32_t) * (A23982_crd_size * 2)); + A23982_crd_size *= 2; + } + A23982_crd[i1543A2398] = i1543; + i1543A2398++; + } + + A23982_pos[i1542 + 1] = i1543A2398 - pA23982_begin; + } + } + + int32_t csA23982 = 0; + for (int32_t pA239820 = 1; pA239820 < (A23981_dimension + 1); pA239820++) { + csA23982 += A23982_pos[pA239820]; + A23982_pos[pA239820] = csA23982; + } + + A2398_vals = (double*)malloc(sizeof(double) * (i1543A2398 * A23983_dimension)); + + A2398->indices[1][0] = (uint8_t*)(A23982_pos); + A2398->indices[1][1] = (uint8_t*)(A23982_crd); + A2398->vals = (uint8_t*)A2398_vals; + return 0; +} + +int compute(taco_tensor_t *A2398, taco_tensor_t *B, taco_tensor_t *C) { + int A23981_dimension = (int)(A2398->dimensions[0]); + int A23983_dimension = (int)(A2398->dimensions[2]); + double* restrict A2398_vals = (double*)(A2398->vals); + int B1_dimension = (int)(B->dimensions[0]); + int* restrict B2_pos = (int*)(B->indices[1][0]); + int* restrict B2_crd = (int*)(B->indices[1][1]); + int* restrict B3_pos = (int*)(B->indices[2][0]); + int* restrict B3_crd = (int*)(B->indices[2][1]); + double* restrict B_vals = (double*)(B->vals); + int C1_dimension = (int)(C->dimensions[0]); + int C2_dimension = (int)(C->dimensions[1]); + double* restrict C_vals = (double*)(C->vals); + + // int32_t i1543A2398 = 0; + + #pragma omp parallel for schedule(runtime) + for (int32_t i1547 = 0; i1547 < ((B1_dimension + 15) / 16); 
i1547++) { + for (int32_t i1548 = 0; i1548 < 16; i1548++) { + int32_t i1542 = i1547 * 16 + i1548; + if (i1542 >= B1_dimension) + continue; + + for (int32_t i1543B = B2_pos[i1542]; i1543B < B2_pos[(i1542 + 1)]; i1543B++) { + for (int32_t i1545 = 0; i1545 < C2_dimension; i1545++) { + // int32_t i1545A2398 = i1543A2398 * A23983_dimension + i1545; + int32_t i1545A2398 = i1543B * A23983_dimension + i1545; + double ti1544A2398_val = 0.0; + for (int32_t i1544B = B3_pos[i1543B]; i1544B < B3_pos[(i1543B + 1)]; i1544B++) { + int32_t i1544 = B3_crd[i1544B]; + int32_t i1545C = i1544 * C2_dimension + i1545; + ti1544A2398_val += B_vals[i1544B] * C_vals[i1545C]; + } + A2398_vals[i1545A2398] = ti1544A2398_val; + } + // i1543A2398++; + } + } + } + return 0; +} +#include "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/ttm_ttm/ttm1_1.h" +int _shim_assemble(void** parameterPack) { + return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2])); +} +int _shim_compute(void** parameterPack) { + return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2])); +} diff --git a/test/kernels/ttm_ttm/ttm1_1.h b/test/kernels/ttm_ttm/ttm1_1.h new file mode 100644 index 000000000..4c631f227 --- /dev/null +++ b/test/kernels/ttm_ttm/ttm1_1.h @@ -0,0 +1,125 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? 
(_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = 
(taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +#ifndef TACO_GENERATED_assemble +#define TACO_GENERATED_assemble +int assemble(taco_tensor_t *A2398, taco_tensor_t *B, taco_tensor_t *C); +#endif + +#ifndef TACO_GENERATED_compute +#define TACO_GENERATED_compute +int compute(taco_tensor_t *A2398, taco_tensor_t *B, taco_tensor_t *C); +#endif diff --git a/test/kernels/ttm_ttm/ttm1_1.so b/test/kernels/ttm_ttm/ttm1_1.so new file mode 100755 index 0000000000000000000000000000000000000000..911c44fa1340a6a884b424eac6af27f998d5bbb9 GIT binary patch literal 14400 zcmeHOeRNw#m7ixRjhd*E(xj@0cMYzshEm=@qW|;1-vGB(fWj$g^&v%Vg?z={p z8K#q}D@@7^MIUmV|ZKK$grbw>R+{^HFIUp~L2;geJam3wfJnZ>vc z>6+`}V9ipuvsrs=k*-nMrMPNwHMjoo^`}4k^cSD~<=^edySoSX_B^)qD-TZ``tieC zuR3(*BR$Or)8GGA@eNPM(l5O5oA3YLGy0uRhMJClya%carbTEl*|(>w0O)oQ{B~UP z$^Xj&_>u+eA6P)1{HNoxr)dE5)w^r~`-uhQ8-eS%?CFXH?A!;z)plXEU#5|41%9+_|Y` zAe%{dCb|dGR@wX|@#JWN3?v5nKb#gD@7dH5AIN5h;=}n&u75CXRW%QuNss0#;B+Px z&txA>P;*Ep`Js5S??dsPME`(D4h{(**?2dq8Q#7*ofuB`MS9T4xo~s3Cp(-LgNcEG 
zY*M7s)WNt^&ZRRW+2MFj^bDsVkRBXK=frT@5|r&8N(_Tb4~Y0kU;kh{F*1@Kl)aa1 zGCMew&!xpce|Imkfst$=L`;%$>qI=AO5_q~eD}zR>U12Pvbwl75VSHO%Ea26TjI9_ z*3RT^3EV~lM3dNoJ*`IWS2c<*0@eP)Kho476P0sQh#>gM>u$i7yhw~FY_g7chu8t0 z63O4I%7|al-;bH%6#EpGe^)>5gf-#`CC~RKp01NiNDE2bhZyJkr^|*PQ1U~%F4b6` z`KeVFPWKMAl%-s$0;heO)2Rx4DTAPAD{wXSWofOdmu#pmNnEeM=^nwUt^((K8D`ilmUtST=^@E~jd*g+beH76 zL_E1%jEs)HvhRjG5@*nZXvv1?ew3U=m~gN?X?P538T2- zVJK`dii`R|ji$4P`C9G@xaRrGEZ1D}V0mg*<-rN^$}A845a|_fyHT2{KM#H5@Og(} zo-z)X?liQc#!KgOwJ^9x8FZ6DZi3z3?hXG9F_B+mjBR*;a$Bf9Zlkb)+Ecn3T`U>+ zIlAau;5AZ5$Aoxu0w$xD_=D3ukg)j31CjfmrO}FPiF9t+Z0vdci(rhxs#xLCx|C6H zUSCJNx!3;fd;R1tRj$haueRoJ+;ICKqi{;_edxBXP8eE7W6NS-NOf4*GaJMP{4?`WElMB9J= zvT=9{PK)|$BgKtn&>e-(Sgk1Rm90QsqefwyzYAp@ho_t%y74Ka=o}-hI-@`pkBvKt z+X`k!tk|cuhF-P&06>pwQ%z`!cmJEzDIj&tg%yXI3(-T)rsuuIp8;*%SN87z0WyUg zdWPB>jpU1CB`s#A{J~fu>;j%sq;LN7;}eiAib=<(LQ_&0&s`4uTR4@A6A z9YZ~zIvw%)oJUW4-|zg9G4{M>G@Ur@-FiI#%IOyGGWU1d#|~?on!cBR#u)pyGv?Ul zuQi$m{4Vdl?U)SI^kDng5v_gfoX@-OcX0n0+^FG4_rVh#26A;_xM$S+#4(f_aKMmJ z*x~Ox?OoRCcat0x~cnMt=3e^Uux_*LUZhc4@R~{9*jH` ziBFVHyv_5mu=u8nCJxN@&zv-pCz{+(?pS>?N-HZ`bYIkJ9*-O@t!mYdM_xMHo}9eh z{p8NeqvlaK?DW{5wOv1zligavQFDC9uPb=-6jnsV`ot$PZ6_BtLeEgdWj`v zey`m;f&H}6%=vGo_2Dbsfc~~8e-t$v4Raz^yjdPS471Z;8#8}ja-kHICMuRmi>vh3 zpC!<49syfhDuFc$5Nwk!6S^X~4`Qi4lW6oI94u5T~t)<|+gC?h|*>pBxHEa+q zrgJ^54_|Y^*|=}K#RQgr<~bKxd7reB^OKdGGgk7?;`V{g{rnZ!~GJ0u% zGUm^zn&~d&<#fVebgLqUH|&3ZsCHIxP3Dx>7<`b(9<#K8CJK7he+@JLZje+m9VS>ulY324(BL z`+tXBEMgvu6x-nbozBo1qqss2Lg}5eqrbJ4w~Ddu0{ z-TyflYc1S;uC>YI-QOiy|^KEQn`>_RW zGJkgRN}lgy6Yv~%SL`Xp6<0)>R_<(wH2u7_kZKUSe%Y$QoySbKzdiYuarjp?r905w zg4=%*+n8D})ZSY2KW~cp-Ppi!AM5n%&1URN&UTD```A&?cd#LCz#fGis8OSRC>TbQ z8jrs61Q*cU4-Mk%tyHoItBaTSJ~$OpYEu>~8Lt}Z-$Tm_Hg1|0UT z!~zbz5PF&BiVJhyFp4?9&$|*_6arCieJ9xZm|yqS!^NeEGvtMse=Bxww|of5FGkbQ z>xt64QKsgVQFs9N^+(Z&JEwK2GRqGUPtTE9z!UJecBSL?Ay9hWKJv$M`Ek&_koh|3 zLC_SQ5)Z;<-v#f-HgXk)X)S0F^dKmmTV4g-3c3RKmVXA#fzrFI6>z8y{U3m?2YnT^ z19Sy!QPMtqpU_5q+Li9bt^<(w1EY5;L9}BWRCs)Cp4z*-OCNRZ7I(hu_8V{YuO^uE 
zqqvSkj`lZBF=V#k8b`Z7j(SA1$M-}{v!`~yW1~keI-5Opk1vXNf*-vo;%OXPywNkI zIZ`!CJdMC2p1Nj_4q50ldt4DZm(V)@3NC?dfX@YljpI6pG34_)Vf6fA(B=b5iD(Je zUSRV5!)#kxXWG&>)0VqDLk>;zG)CvN1UBk$t%ZO3;O8#nsb7!RL_D=0bwoV+n6swS zQyY=Z#ydY0cR}_ukd^OMX6r@&=J+7$Iixx6As@8Nsd*#$pcdEDu=hH~RKEX+dWLGA zS|U9}{r)b>4!lz?Q=SshDPWJ=us;B+`$M^Gdj%qOF8o~Zzy%Ln@W2HRT=2jJ4_xrT zLLT7v*8HBD-%I1sLMD0^rNsQbD&w}`neR3HZkfNwT*2j6>oRwx;`y28U5e-T;`EG4 z$z>7o$MfYZWlyR4_*wBRo#~PI-SJj>CPm_Rz@Jv~{0^L+wJ6QJ%hq+tD>dO#^8EbF z@6<1sQm9`6j0aVQ+c~b}_}w(uha-?||Dy?}5&mv& zuki-ev2}{JDSE%6J&Hb}=tmX(2Sxu?(QhbvR8juUv9YD)y?Wi2ZhRZd>+cEN76>-n znwMN?XQ(j{Tw8CU;%2nWa-QXSp0Eo&{C)uPmNTs(cJBr7HGB8S=qdbYo9*_r&)g5( zv6BiPqO43#=i>8}(Uxh~dzLSCc&_wpxqO-C!*r<$Xb#VNm)^eg-B&ErE?pe1eX{uq z4Xtym6&BvUyar-UzgW8T4rszk5*I#Gkv;8g&n5WPXgcE30)aKw|B>P38v4MNOs+|0 zGd=yiYx-|(TnF9ZTu)*oH<-xv1(MeCytsxt;rPDL980thyjX+pWE$cC^p;S_a-`lc zh_9Hv4XJD%U-B9ng8*{*OnO6aI+GsmPlBb7i5o^6*EOtLt2gxW|9fuH8~Ez!&i4Zp$u`K3lnq0}*<3oA%MJq^>Q9Qn(d5Xm_$&QcQ{A04(*7EBy5T(yt%DuW zV25Jl6j8-!Vtn0NG$E58ZRpM9m7fGYW8yRENG?v!5&6tWe{Uw8()%+x@tT@`zg6^N z@rm3)=aTU5Yd^=(o2sY=MHq~##QtTOs=~j@I512cZ&E4jz$D;(t{Qg;-ruTmr{I0C z8eb%MKdHtq61)#p$BTmg0FZP;}x5sn61AIQeU5*35eDh}w z@YURz3i>`NKhOQH`az}oQ3GWoxqMF#9>`|H#^Slbc#;CgMi2^>%Eo&KvfYV+cnVKQ zBk@FjR8SbzKsuLB1>Uo+sd0WG8ITrF3=b!^BZewByd5!H3B;16@`Hohp;E!g2(~KG zY){WXHj(0{<#L0ecnED&@mqm78W8WvAP_4aZ@VwDsWskuca&n`;z~3ANCIu8*sWQp zC!a|&mKw>%`x2QH#l=P6e|Kb4dkZA-*cop%ltm+YA0UL#vF(kqdzvG$_&sfHn_D~M zoss5PtF1-^smWNgnO0Dh3Z#g&aG2x9C|c|<1?$a?!J}9^=u>PS1)y2sZ5+?WfooO} zpe^=pw$`;OWNsFrcFfi_8^BjtQ+2o@1@c*udDT`_6kau08A&&99N%o+Rk4N@)h`e< zXbbO~Bc);prH|(gdXzPDw4kbwLj2-Y!H~A@$WTiW7}-9Ekp|5TTeOdJ8Jr{nc(4!1 z<3K}yN=U9RG14ajsqGnPS~Q0P2FX2=9vW~!Jt%7)Ve2~tqGp@EzT$l3$Q270q# za}<3UkV76w_o)Tdmx3~LmWs8qEJ@D8a$*oi9kN6g(I|?13?T4P1Td=xv1EazyYszJ zNo0Dl7bvzr)05?VqPu%I{Rne7)}%QPhj5mHIF+ulRIzLQ-?Q|jNUt`qeagh=6sCTK zGqH}=%n1=d7M}>Stk35erYR+gZB-_If1+6YC4mt1&W82*+{5&cDxi9COPG02#81j$ zU}T&1`FzBb-wQ!RCbChP3^4SLhV}Uz#gy7diQCU|Ot+wn-uW=k=PjmPq>M!ECmDPH 
z$v<3%?9920&u8NVB60n!&;8G!jP#pH2#McYF?Fd9lRlotX6k4CQD7v(@6|XCo2mVz zLUmQ95iphdd>&*vUMY+`-`|)B4?h3$ z-;db)%lb^e30-<*Wqm#;yVTK- +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if 
(midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +int assemble(taco_tensor_t *A3056, taco_tensor_t *B, taco_tensor_t *A2886) { + int A30561_dimension = (int)(A3056->dimensions[0]); + int A30563_dimension = (int)(A3056->dimensions[2]); + int* restrict A30562_pos = (int*)(A3056->indices[1][0]); + int* restrict A30562_crd = (int*)(A3056->indices[1][1]); + double* restrict A3056_vals = (double*)(A3056->vals); + int B1_dimension = (int)(B->dimensions[0]); + int* restrict B2_pos = (int*)(B->indices[1][0]); + int* restrict B2_crd = (int*)(B->indices[1][1]); + + A30562_pos = (int32_t*)malloc(sizeof(int32_t) * (A30561_dimension + 1)); + A30562_pos[0] = 0; + for (int32_t pA30562 = 1; pA30562 < (A30561_dimension + 1); pA30562++) { + 
A30562_pos[pA30562] = 0; + } + int32_t A30562_crd_size = 1048576; + A30562_crd = (int32_t*)malloc(sizeof(int32_t) * A30562_crd_size); + int32_t i1543A3056 = 0; + + for (int32_t i1547 = 0; i1547 < ((B1_dimension + 15) / 16); i1547++) { + for (int32_t i1548 = 0; i1548 < 16; i1548++) { + int32_t i1542 = i1547 * 16 + i1548; + if (i1542 >= B1_dimension) + continue; + + int32_t pA30562_begin = i1543A3056; + + for (int32_t i1543B = B2_pos[i1542]; i1543B < B2_pos[(i1542 + 1)]; i1543B++) { + int32_t i1543 = B2_crd[i1543B]; + if (A30562_crd_size <= i1543A3056) { + A30562_crd = (int32_t*)realloc(A30562_crd, sizeof(int32_t) * (A30562_crd_size * 2)); + A30562_crd_size *= 2; + } + A30562_crd[i1543A3056] = i1543; + i1543A3056++; + } + + A30562_pos[i1542 + 1] = i1543A3056 - pA30562_begin; + } + } + + int32_t csA30562 = 0; + for (int32_t pA305620 = 1; pA305620 < (A30561_dimension + 1); pA305620++) { + csA30562 += A30562_pos[pA305620]; + A30562_pos[pA305620] = csA30562; + } + + A3056_vals = (double*)malloc(sizeof(double) * (i1543A3056 * A30563_dimension)); + + A3056->indices[1][0] = (uint8_t*)(A30562_pos); + A3056->indices[1][1] = (uint8_t*)(A30562_crd); + A3056->vals = (uint8_t*)A3056_vals; + return 0; +} + +int compute(taco_tensor_t *A3056, taco_tensor_t *B, taco_tensor_t *A2886) { + int A30561_dimension = (int)(A3056->dimensions[0]); + int A30563_dimension = (int)(A3056->dimensions[2]); + double* restrict A3056_vals = (double*)(A3056->vals); + int B1_dimension = (int)(B->dimensions[0]); + int* restrict B2_pos = (int*)(B->indices[1][0]); + int* restrict B2_crd = (int*)(B->indices[1][1]); + int* restrict B3_pos = (int*)(B->indices[2][0]); + int* restrict B3_crd = (int*)(B->indices[2][1]); + double* restrict B_vals = (double*)(B->vals); + int A28861_dimension = (int)(A2886->dimensions[0]); + int A28862_dimension = (int)(A2886->dimensions[1]); + double* restrict A2886_vals = (double*)(A2886->vals); + + // int32_t i1543A3056 = 0; + + #pragma omp parallel for schedule(runtime) + for 
(int32_t i1547 = 0; i1547 < ((B1_dimension + 15) / 16); i1547++) { + for (int32_t i1548 = 0; i1548 < 16; i1548++) { + int32_t i1542 = i1547 * 16 + i1548; + if (i1542 >= B1_dimension) + continue; + + for (int32_t i1543B = B2_pos[i1542]; i1543B < B2_pos[(i1542 + 1)]; i1543B++) { + for (int32_t i1546 = 0; i1546 < A28862_dimension; i1546++) { + // int32_t i1546A3056 = i1543A3056 * A30563_dimension + i1546; + int32_t i1546A3056 = i1543B * A30563_dimension + i1546; + double ti1544A3056_val = 0.0; + for (int32_t i1544B = B3_pos[i1543B]; i1544B < B3_pos[(i1543B + 1)]; i1544B++) { + int32_t i1544 = B3_crd[i1544B]; + int32_t i1546A2886 = i1544 * A28862_dimension + i1546; + ti1544A3056_val += B_vals[i1544B] * A2886_vals[i1546A2886]; + } + A3056_vals[i1546A3056] = ti1544A3056_val; + } + // i1543A3056++; + } + } + } + return 0; +} +#include "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/ttm_ttm/ttm1_2.h" +int _shim_assemble(void** parameterPack) { + return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2])); +} +int _shim_compute(void** parameterPack) { + return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2])); +} diff --git a/test/kernels/ttm_ttm/ttm1_2.h b/test/kernels/ttm_ttm/ttm1_2.h new file mode 100644 index 000000000..86ebdb633 --- /dev/null +++ b/test/kernels/ttm_ttm/ttm1_2.h @@ -0,0 +1,125 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? 
(_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = 
(taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +#ifndef TACO_GENERATED_assemble +#define TACO_GENERATED_assemble +int assemble(taco_tensor_t *A3056, taco_tensor_t *B, taco_tensor_t *A2886); +#endif + +#ifndef TACO_GENERATED_compute +#define TACO_GENERATED_compute +int compute(taco_tensor_t *A3056, taco_tensor_t *B, taco_tensor_t *A2886); +#endif diff --git a/test/kernels/ttm_ttm/ttm1_2.so b/test/kernels/ttm_ttm/ttm1_2.so new file mode 100755 index 0000000000000000000000000000000000000000..c698ec99101bea4a4b7945cab953850f3e96d6ca GIT binary patch literal 14400 zcmeHOeRNw#m7ixRjhd*E(xj@0cMYzshEm=@qW|;1-vGB(fWj$g^&v%Vg?z={p z8K#q}D@@7^MIUmV+VKJ?_jbw&I){^HFIUp~L2;geJam3wfJnZ>vc z>6+`}V9ipuvqgJsk*-nMrMPNwwY2^4^`}4k^cSD~<=^edyL*QA_CB`sD-TZ``tieC zuR3(*BfTvLQ{Vqq@eNPs(l5O5oA3YLGy0uRHZ~vscrR2JOpDN9vTsjU0nqIr`0cpn zlmC|m@Ffe_Kd^v2`A^4XPtySAt9RJ~_Tvl4Hv!ji+0zvZ*trjatL?(zKL~lBxJ-=d zdR1wIwBr`zUJL%TnjdK?zf$bB$)AGQrNS+`m3+6d_6>y(Ut$5grcWz8ND~9;CfKFa zqwL$;^KRf|Usvme=kDtsf zm`SI);yr^Yt8D&~SYk9z2I7MQA5Mvl_iXBn4Q4XKv5|Z_H!zg4s+tE+r$%!Xa4MaQ 
zr8AGlsX3&R_)sj-|DjlKd|*%{hK2=@OsoghjBMYWijO4v!@X$aT(~9Gn;A)oq4?ln zCLxk3>R?PN=ThlxW+avqy(1|Iq=vGooES-2g0kJi@ey#TK@rRL4-CcP*=%Y^_Fl4y z%+PQ?mlA^mJ$=XqvY9|5F$v186R}h>o{OXLJ=v`4bPS!cy0|tFv@(s9iFUNK#%>9$ zoypx2xQzygCb0v1T8-SVY7|`rs{Mt3q^UtBBIl+MLGY8;-GD85k;p1+vW|F%*a4mr z$=|EWh+i=HR1^+&-W*uu9Hhh3rXFF80Y(^%Z49N^253=)mWbS zsZ|zE_YSp`rCg~3r+u5#sS124gP>J-u0_S@hl?5ws zwUtPfrV5<*5Gn{&;Fb49qXK7NkbGwat{z1Q7F#QDc0J+U6*%n?ocb$pH-n(Y*y}E% zu;{gwx-j;Q=W5E6#@G?pgec#38c_NAKj7E9Duf*I{iHcPRYto0H^ghiPfGq5#M4ks zk4ye1#FGoBpO*Z~#FOi$4@v$-;>k7B2PFSI@#K=}-I9Nncp9GRVab1ucyi5jx8%P> zJh^1LQ}SORo?I~od1_YW!3pxpED!xg(ktF}qcl^09{R@N^A5v2 zWgITuX=q1{m(J&EVQ`N!=q7{Q1iQW68~z(&BEQBM+wcJ8worTAMqvZBr*t*CSTgW) zbkVoKYov~j3GwCxOhzp62d8@>Ve#Py!uLN*qZQr~?%J~1*z@`q!5D>A(ZZv3Nu%Jr zzK(cvum4k^jpIgPqaH2p^^*gQ;^Y3uWa&1)QHTX2grH@#X@R8isZp-H8^jdXw0#+#1)+_!b^Sf7?V` z;|oS1>h~41(J9ji}9Xk|r;?u**Yyp!iuO^pIC*7)6MR0?z;A5Gubb$^@Yt- zWIn8&lIBm{?*8V^TG%m9)E^x?ukCsT=4Q5ti23}EUzw9IYSl;UDQuQ$H9Z$XFR`S| z?{%0bu%9-WIseVHK76Gc(BF>4k0NH1VNOJgH_M}kVRrdzqvr2RE|h}OM8z^`b(P-w zvjjTKBVcPwC9p;T!X4(f0q90?{SQ%P4oA!;jn!xhUJm(@S?~_(?ezLN2#!CKK+&<9RA6JaJoY%DJ6c!ui+xy0M zUWe7#_*-i|8im%nv2Sa471ryRmafvlKb6bPH|;7Jg^$#gJ_ZIES81n}sV(KLj9wa~ zjQMk_X1W`BIUd&5HuieIwejrmcSZ`qLdXRdYy(smF7DJK7@e&~LORYc3hgfA@ULqk z1?*Mr*gHb5h`GbR!YE*)4#~Z%t`wJ59VJJGj-l()#n*(#j(Nl0j$?<qrOVPciIqF~G z-TyflYb)G+uC3YQ-QOYi*qq_RW zGJkgRN}lgy6Yv~%SL`Xp6<36tSMF>GH~+k?kZcgUe%YqMoySbKza#OMarjp?r905w zg4=%*+n8D})ZSY2KW~cq-Ppi!AM5h#EoSsf&JK)z$JkNOcd#LCz#fGis7a%JC>TPM znvn7Nh}A>4=iueXHONF@&J7z|;m3&gsX_m?j>J0{NyomIcdmxkR+`8mx~quBmubB_ zR_rlX?7@9OYd-onr=w_V$$X1uSI%!l(Nv>wzl#|7_!gp_QS&a0e|r$~xxaC|Vme#% zw(*7OCbYvE(>q}a4Z#qt-#6}kq8^)KWBKGiHI}CzhK!v5*cbXMZnL;G$$PBpP4CVJ zg7|3<77R3?-4!Vi!BiXt4f&#l#VAFV*cKOUE3U$@6ZxQbCAMG#)76ECkE`Hu+<-&g zl~}-yFEqYPbH#*3wkT;IzE5bQKJ7~PV%Gu4`+?Crl_1(N4k|ppc2DhH-ldPac8fdTb^DFC`d1T7 z`Vm~mAxHZgrzkSpaE+tgA4ff+#p8RTro~gc-?7o77o9Dhy2lrVJ;9G&6!tWYE#Byv z(j3W}C7vc=VNYF)M~5tQT0E{Wol9t)e+8GoHo)fs!p3o(!x-{;oiKX-Flh4ur9`xZ 
zYcDYQ{$aK)Z8L3YpJ~foo?(Zkd72_~S^^t&xYoiy{qS=)^3<=#Yr>w|k2=C0eau%$R9wtrGJ6p!hW=Xy^m{-7$yBdsh~-BVWo{|J9K zx7T=s>exC(+ZDZE(OyL#QS_sV{)3|bs^~WqJ*p^w=h)cV`d+PLdDH!ndLmo^*mu0diebShN6Y*>d?Z&4=ky6VM!<_b$DC>ASC3rd_%?RQqJh z6&hOSSSu{NV|fk4oPM!%=^fC7l>{z)rXqXV+n!7CtI>4Cr3C_OtpC}O#2Wg*mPo8g zWYWC@eQO48ZCVH2kz8*)n;VMf`U44Td0t$@op5|#XpSY?2VShfcQOre0D5aLGz9_V^6AuuzEnCjGLQgE9}_o>Hmz$|w^ncHEqBro>+mjy{ zOb)<|)ik}KH=SvaA1ND#M>4roB9|EfI6ROLL!*i8i1;i0SySDeHPZeXbh_a^4Q)f6 zkzl7{#-+$ne; zti~4!-cPFWiv;gO)%aq;`&l*an%VcN@wwmUsoa6x*QdNzEx%Mu&WpQe;)<%}=YG$( z?0aV7ZK~zHVtk(U@4$(fzt2|7FB5#fsm3ow9FblL$-F}>7yKQzvd}?qOVoX&8ox~N zzE+K|oD;7>Q>0pWgS)hkqBi^n+@@e0K(HAwT>V$~55>M=nFAA}v%e ziOg5;>%cWcc6GAjl`R^m`S9g*vq3zOectL4pSv#VrDNx=M~dqqJN7srL*jGS%R>^M z+s=^%@Q(pkAw#N17WqfudKF?Z&nzH+LgH>=k0*K+_TJn_^cBO*VVk`bHXdq&xljA#f!bBjt%FlDZtA0?ae$+r&Hka@1#RJ)F*jOw#6iZOxSQepB$xN(oFw+wsj3x1e zl#Rvnqk_Vy22;6IGVq>t%}w(Q$$+$2d}JiP9Whk7k?n}tiX)aRnI9V34wVW{MzB?h zW_o)EGw~!hEteZ=M6gvtR*d+qKnx9t^`;Ss6^pgs7v9tsYr8u_v2Zb^8G9s-wo>fY zEYzD%Cm2g+GqL`7I!SSHk@w#n-qg_wNj!GO+6-mUh};JVA#`keWAvVua5Q#Ld;8|L zu2@&NCE8}I5kYD))@-H~RHXtbVl5QnxG{ z6$EIDy_>CdtqPf&MW`LKbZ1_9SXD5jtvfQ*QUtQwhcMEhxe<%@b1sdOL;w%= z0eKu~7)T1q^~baQB9PpkhNeYxIAD<6BdL+>Kqg(q#85Vp8jO>I$_)?ZL_pRaKsL~q z0h^=f%YYp6K&oFYsQx6BnX^=^m1RkC9+u-nIO>okvWP}e;ImmCq;U-f$dW!KBqAC zE1ZdSv}R6-0J8W*m|=ZB&oE6YQEaO+@%t0S;x7q=pm#Q`&*vVdhg1R8i(A6Xdm?^P z4gn+Etk35oru<$AA~KPU%4C3{cQmZe=P0JsK1$qvmSegFW%SO6c|LD3?IvX;YCp-? 
z`%nJiGGu4YWqdvxClHD2XMOH}8fB#4LPALV-ioP9b(r+=G&WN|>yH8>5q_`6ao9}l zCl#uzGG)P3>hpP!>3F3u@_c`1D&MmJ=lc2F$dvobW|(LCF%(qR&*u-O-L`Wm*UR>b zHhq4N$<(GUea*T=|6b|y_}l+ZBv&bnyxo7Fx9N|o=OCv1cOGnp`_259k)c;gJU{sS z%YQ#&?=R~!{U&tjk(KrNoa|CZKbC`tOw2Q#07_%W`h1?|zt5SZ!9n7FvmUqWZ&5}f ztk37}sa2#Ugf4$)e^{R>owq9WXU^wJad=i$=2(yE4{iE<9vW8qEXQW-{7+Fp7`LC# zL8A-kD|fItF@+pq!5PHLuztM_XMLvADCc~o=;w8dv)i@Ho`sklBh8m|`W50Ki_HAm zWt{-*7zwYr)>3Jo(X=vl*-IDD&nx}DzoLJQ(zo~X6{Wuv<&}w9rgX05e5Ks4S6kej zl_2tVS&AjGW2AD!b(W%Tk(pn+Eb+W|%m^W4DE%qAIV0(~nsM>` +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + 
} + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +int assemble(taco_tensor_t *A2593, taco_tensor_t *A2398, taco_tensor_t *D) { + int A25931_dimension = (int)(A2593->dimensions[0]); + int A25933_dimension = (int)(A2593->dimensions[2]); + int* restrict A25932_pos = (int*)(A2593->indices[1][0]); + int* restrict A25932_crd = (int*)(A2593->indices[1][1]); + 
double* restrict A2593_vals = (double*)(A2593->vals); + int A23981_dimension = (int)(A2398->dimensions[0]); + int* restrict A23982_pos = (int*)(A2398->indices[1][0]); + int* restrict A23982_crd = (int*)(A2398->indices[1][1]); + + A25932_pos = (int32_t*)malloc(sizeof(int32_t) * (A25931_dimension + 1)); + A25932_pos[0] = 0; + for (int32_t pA25932 = 1; pA25932 < (A25931_dimension + 1); pA25932++) { + A25932_pos[pA25932] = 0; + } + int32_t A25932_crd_size = 1048576; + A25932_crd = (int32_t*)malloc(sizeof(int32_t) * A25932_crd_size); + int32_t i1543A2593 = 0; + + for (int32_t i1547 = 0; i1547 < ((A23981_dimension + 15) / 16); i1547++) { + for (int32_t i1548 = 0; i1548 < 16; i1548++) { + int32_t i1542 = i1547 * 16 + i1548; + if (i1542 >= A23981_dimension) + continue; + + int32_t pA25932_begin = i1543A2593; + + for (int32_t i1543A2398 = A23982_pos[i1542]; i1543A2398 < A23982_pos[(i1542 + 1)]; i1543A2398++) { + int32_t i1543 = A23982_crd[i1543A2398]; + if (A25932_crd_size <= i1543A2593) { + A25932_crd = (int32_t*)realloc(A25932_crd, sizeof(int32_t) * (A25932_crd_size * 2)); + A25932_crd_size *= 2; + } + A25932_crd[i1543A2593] = i1543; + i1543A2593++; + } + + A25932_pos[i1542 + 1] = i1543A2593 - pA25932_begin; + } + } + + int32_t csA25932 = 0; + for (int32_t pA259320 = 1; pA259320 < (A25931_dimension + 1); pA259320++) { + csA25932 += A25932_pos[pA259320]; + A25932_pos[pA259320] = csA25932; + } + + A2593_vals = (double*)malloc(sizeof(double) * (i1543A2593 * A25933_dimension)); + + A2593->indices[1][0] = (uint8_t*)(A25932_pos); + A2593->indices[1][1] = (uint8_t*)(A25932_crd); + A2593->vals = (uint8_t*)A2593_vals; + return 0; +} + +int compute(taco_tensor_t *A2593, taco_tensor_t *A2398, taco_tensor_t *D) { + int A25931_dimension = (int)(A2593->dimensions[0]); + int A25933_dimension = (int)(A2593->dimensions[2]); + double* restrict A2593_vals = (double*)(A2593->vals); + int A23981_dimension = (int)(A2398->dimensions[0]); + int A23983_dimension = (int)(A2398->dimensions[2]); + 
int* restrict A23982_pos = (int*)(A2398->indices[1][0]); + int* restrict A23982_crd = (int*)(A2398->indices[1][1]); + double* restrict A2398_vals = (double*)(A2398->vals); + int D1_dimension = (int)(D->dimensions[0]); + int D2_dimension = (int)(D->dimensions[1]); + double* restrict D_vals = (double*)(D->vals); + +// int32_t i1543A2593 = 0; + + #pragma omp parallel for schedule(runtime) + for (int32_t i1547 = 0; i1547 < ((A23981_dimension + 15) / 16); i1547++) { + for (int32_t i1548 = 0; i1548 < 16; i1548++) { + int32_t i1542 = i1547 * 16 + i1548; + if (i1542 >= A23981_dimension) + continue; + + for (int32_t i1543A2398 = A23982_pos[i1542]; i1543A2398 < A23982_pos[(i1542 + 1)]; i1543A2398++) { + for (int32_t i1546 = 0; i1546 < D2_dimension; i1546++) { + // int32_t i1546A2593 = i1543A2593 * A25933_dimension + i1546; + int32_t i1546A2593 = i1543A2398 * A25933_dimension + i1546; + double ti1545A2593_val = 0.0; + for (int32_t i1545 = 0; i1545 < D1_dimension; i1545++) { + int32_t i1545A2398 = i1543A2398 * A23983_dimension + i1545; + int32_t i1546D = i1545 * D2_dimension + i1546; + ti1545A2593_val += A2398_vals[i1545A2398] * D_vals[i1546D]; + } + A2593_vals[i1546A2593] = ti1545A2593_val; + } + // i1543A2593++; + } + } + } + return 0; +} +#include "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/ttm_ttm/ttm2.h" +int _shim_assemble(void** parameterPack) { + return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2])); +} +int _shim_compute(void** parameterPack) { + return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2])); +} diff --git a/test/kernels/ttm_ttm/ttm2.h b/test/kernels/ttm_ttm/ttm2.h new file mode 100644 index 000000000..40f1400d1 --- /dev/null +++ b/test/kernels/ttm_ttm/ttm2.h @@ -0,0 +1,125 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if 
_OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, 
int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +#ifndef TACO_GENERATED_assemble +#define TACO_GENERATED_assemble +int assemble(taco_tensor_t *A2593, taco_tensor_t *A2398, taco_tensor_t *D); +#endif + +#ifndef TACO_GENERATED_compute +#define TACO_GENERATED_compute +int compute(taco_tensor_t *A2593, taco_tensor_t *A2398, taco_tensor_t *D); +#endif diff --git a/test/kernels/ttm_ttm/ttm2.so b/test/kernels/ttm_ttm/ttm2.so new file mode 100755 index 0000000000000000000000000000000000000000..16a3d2542141b1b755548f437f4ba19dc7f1e9d0 GIT binary patch literal 14400 zcmeHOeQ;aVmA}tYh=_%r0!c6>5D^}&16Yy4q;`{lWlQ!GBNIa$8rp!O$dc`?Eg4&e zgnIL_ido(&3fWy+nC|fmE>D=?X=bZcT?)N+16%03(DGHOp#%^T9ojTt{>P5ovY7qgcXSHl0 zepj+9xa@+Nq*<}Ci%m#U#ylLLJkG;&_A<_bdU!6Bq+6*b%sGNUka0|iPM1J$N zi$k=Bq@p^!ZcAXphNtJ>c>S02!8a~^e8&@yHJ^AfMOjd~3lEuDi04s{ z;yB+|R_?SnC=V>~D3o>~o(en-!57{-^e>0L`@}2%FgW1c*0VS9K>1^Lk3M?x?#;^| 
zJ@e&6!$a{CKh6EY)mr||<8PjL$F==GzEWK`_IC-W&Knk>z+~SVmILruLGU~9%q9Pw zdGJN^*gr6jJhh((k2OpHm@D7K^VpBhBVP;LgU1>!na9p;5L{^$2LDCKyV+8<-Qy`q zt>Si^Y<#f^e@*m{B$r>tc3b2pA$B2ivR)y-RapD6z;~-AAjkBS!0SZ^y#{tEY!mja z<@pS7vhNY&MfU4;0`H|xig3^*s1wPb&(6w$D)2Qz-eIE*u*%KUhvIC<0 z8%4si0-qGPE)^n?j-FI9(w~m@r6Um*=}LB`StJ3%BB8dHNPE05-qF>cj`y{-GNT$1b;$~KJ;K}&*bP*g+wnvhw zd!tkw(usX366^d@BoXcEX0e`L1|${PhHUzFY>Y?yVx9g33OO5Yh$m8gan=*m5;mYZFvOm=qNwY*>90Kv4{&<@8#Z5t8?%rr0xOg{<^mlgkM56uu@g81#&c;$b zy#whu>+agtfvC4X<*glfH~CGj>k2%ZAT z-zw6GU((fe0j(560vq^L7goYD_MniL>yzxR3 zwZg<{?GQtm%jJu3nzv<`EW*nr2zs^%M}?+Cg~*p|h$`_kPZ3UQgbbBMxLn&P%~ynr zp~Y2di*PxIP(pnXUc4^qMYwDWl5Z`-#a4u1wz&wGO;7mNBAn(389Iw_rvyRu%v%mU zyWq8D9;Ocsr^^cCdgh2@lof860#vx_J^ZRG>JcNplQbtz6%ej^lX!*ranApaczRV6 z!<>JScxu9lL!5t(cxt+dM>+pA@zgXE2RMJ2cxsY~-JE}dczQh(y`296@zgXETRHzd z;;Bg{S~>qs;;AVn>N&p>eBuCj9@Ot|(LUM});_A=&6xWAHtnO)gjZd$*Gyc&^xXQp zp|DBME$9N(>(1)NYw1hSG>4a(O>@DWg;O&scaBo4%<#~!CcWGZS93F~&Ou)vIcL+2 zNqr=Lldc@qpFNkZfWbY&ppy*BB3SM1UjJK&v4PckX8q?Wwu#E))U)fUJo(E|#k`K6 zqYHitULkcsk9% zmBbr+wV!}i59`@xMmWHkNAv z<7e6fBmnpkP~F(4(JL?lS~%OR<$5nitT!{Ps-em1!pJ$h9y^o2;#{GycUax~Ep(Pp z>{vLvTr(#0kyEyW`p!!_6gKA?3m`r{_Egw-J7gRQ=W3pURhSQGt)Y=u?SA7IVdGSb zG1X$c95P0;`^++D_wwSR5c^0P@NQ5jrAJQLLE804^qhSUnd$WBf(1P@tPtY}8z)I`wQ?tIw`GlnorR*Bvtk zH8QtV%{4<{=p}XkO`s?hwUM4hi>e2YsRswOFfYi6ZrC;BWB&g1*b5}k8ZcfD&*QUf@Jf#B%2+5DcnZ@3`Wb{+=CBgV9Care)Ma*eSQ$(B)jOId>wc9`H;>)>-w8D} zl5NwL*R^TxgxY`7LGYBi`Du0UAJGg*sxeLma)V2mdhl>dtT5H6y6qnZay7n|*cmW) z1m0(XT%QuGdki$t^DJT8P?~xwJtvin^ybF;sQ~Qp<$81{otMdulW(n`?DYELD zwrZa(7~q1L!wWi!-F025(QMsXcrA6Kq1Qhypp&B8K>)vc=B!&Cx*H&{ zw1S^@H%17O(thLD0pk^t?X0ic%lm(e@r?T5HDn?CLvwUyf5t~AO0h+AW9Wp9HtPFM zEq~en(3qs57{&{7;4MQ!9s1Tsn0EX~)6t1(qG;EzH)oE4YJJa_1G`#oDUDKJRP9%7RWiz z3mRkok^G9FGUk8wY$!Iq*7@+xivz|{O96UgDFVX%7Qr@Qotgm_%`k=4;H# zv8g)en44ZQ0>*Wnkv|Dm$LpvF8BYQ5=(%+-kb(S@NI-|VHaJw^ z10RW%M!#F#zYD4UTyr57K#|Zp=>5v5$Es2TQ_U@M$X!ULuZh-PTl`|EVzE-m_OHq=HF?rKBMQB@RyK(`%IzmBzA)=Z-X_w 
zw~#TS?tcmLdaluq6^Gso1Eq5_QNCyML&eQkJ)b%Lk&P#8jG1y%Z2{n7X^oLjy>U>t8Tetr5<+21pxXgRG?&pc(%ExE*Bw`^yXzwS4|Y*8V^Uv*mdyYBJ*sSiU%4YFpcjmZtO8 zk#`FP+I;PW%nv~y0&T~R;~_NBGvI5mm@LO@34r=QzYR*~*7ra+gKAiDz74t;l>6MI(D<0 zKDGAh>$Q~xlRjPx8-pCpXEKBl*@9;n<$lDDNQ2A$U|EB!V!y4~<;mF_T$TG4_+7sH z&-1%#GYgwtyUT1F?TcKs!2GVt2A2o2&}ncvf^;sWas3!ktOW90NZ2qQCop+mL)i0p zHUrZ^DG<%$c?cN4f0`*vaJno_(`C8Y)oW7}S8ZTcNnoQAPYv2gTo7DyZ*Q1JULmAqjaaPXF)CLuJ4#D0> zc&GgSE8yB%_MJs8hrgV+&of9nfPD(J0R^I`fbFwj?*XfvDiki_QlI>M(gL5fz$Y#6 zNeg_^0)Kf6$a`yfPc84Iv9$0&dl3qfzg0w>COrMUM&2#U?=hFi^vgUvcB$ayn)WHd z%X@L!qfl^|guQ>RkfP|M$WQJS&e9$PLEas2#@>g5yaWE4ke7GhwANCfi&!2!9!`WB za|n64f0B3V7n{;-QUDV7iHI!cu#l5?(=tD7d3pKAMM1Gm#Mm7BKV+t`5T zv)&uLzN+g7I9I*1y4LHfS!JT^T9nLeJhSO#hn;Wp^99J8jcLAO=UxCmuvU+ZPU1(| zXtk&O^X!Na->r&UIix(?ybeA%(Vsm}2d~Nw>E?KNx zxUjzB;f6~Tl+ISeOgwZ^8N}=wD=)thny?bXgU?iCPx;Vw0e%(gj(8NWceVMyzc03$ zKCs1Nt7EBTqN`(d*Y&k)pxc*DMElb{(R8ObW)9Eu%VZ^NCzxVeq95#^b@9Id#+dW;$ zGzA zZewzOE5+?h&V!}+0w(8^Qv5t7=b=)3A(Qi2Dejn__e$~E-{&dahS}FGTCG&RoQ={(T<#=Q-|VR)3=30iVmSYJ$tpo|oR4 z$Nr1hqmbW|)eozj?icx5{mAH^Y%V)L7xGsBtmg&p5&c%46JF%)3_mq=Aqodg z!d?;bR)4M!fqR5ZF{1AFfdCsYpk6YFo5B(vBTcei2p2W(X*OL7%2I&juU20#y~fB_>`%Gs&Gx_} zPdn(7XCAqqna*v}n@zf{nJ$19&%2piYlOqx3_|6Y$!o@auQ;btXG3!1GyU>Ptq32y zQn1*MZcZ=0nY>Fp4U4j$$7Rss+&4=~co6b7p3~)#=PZ2%MSkSq7b$Utv{Z*XS~73{ zjvl;e&~%?kJ7p}1bA%V$dM`f=RCTp8&UHrnJDInAM-rMQP2+e$a`(jh`nyue5+;JQ zzIb<(6hy4IJI%a2cQ2yejuhB5`CfYYEBD4b#c=9uhqB~M6?0sfk}?j<(HU z(j0Tw{{^C*B3*4@`s6{LQzX>{E{S=xmKw%GNsV~n*lFEA_i10u*ih~Y@?r5aGJV!}N<)a|W zFXbfNgfzPIk-R)_NxGGk5vcqmW34~6k4!^!`dlW@XTt;{$o!?gtbY<|q~Ab72=d-a zQu7kT)TgD?s=pl=wTZk}lYZEeTDGM;FNjZve#DCP<#|xj;b}=TBG-3G`8^AgWd8Ep zD5)&3G$VOQ{}u_w`OEW%q+8{MpfDiwmG*KLeR+>5sr493ZCxDx5e(%o-@n`+OPVeg zM%=3Z4U7J;*at}}|IS02k@c4Rzac_bO0s{*^RN8-5o>*=zNC*sm$sBrU!Id4;^-&k zAi{&>B^?Dy?@sE=^R)c?oN;O#1X*vXC(HFWNFx!cFVEelR*)8B9{xM?BlRVH21&*G z)8})c*gK=jV^UAj7cBbn9N#PSrJOWlRC!^~Oq*yQvsN+$u}H1XhexK7FOB=rPIk zuT_?KId@D8A)*WYQ?xiEc<|KWk^N83|1>|zzK7^^sJzCEZm=X;^QS)pvWRfSdud@u KXs{?)SoXht2qxzM literal 0 
HcmV?d00001 diff --git a/test/kernels/ttm_ttm/ttm_original copy 2.c b/test/kernels/ttm_ttm/ttm_original copy 2.c new file mode 100644 index 000000000..cb21b209f --- /dev/null +++ b/test/kernels/ttm_ttm/ttm_original copy 2.c @@ -0,0 +1,242 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= 
target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +int assemble(taco_tensor_t *A1537, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D) { + int A15371_dimension = (int)(A1537->dimensions[0]); + int A15373_dimension = (int)(A1537->dimensions[2]); + int* restrict A15372_pos = (int*)(A1537->indices[1][0]); + int* restrict A15372_crd = (int*)(A1537->indices[1][1]); + double* restrict A1537_vals = (double*)(A1537->vals); + int B1_dimension = (int)(B->dimensions[0]); + int* restrict B2_pos = (int*)(B->indices[1][0]); + int* restrict B2_crd = (int*)(B->indices[1][1]); + + 
A15372_pos = (int32_t*)malloc(sizeof(int32_t) * (A15371_dimension + 1)); + A15372_pos[0] = 0; + for (int32_t pA15372 = 1; pA15372 < (A15371_dimension + 1); pA15372++) { + A15372_pos[pA15372] = 0; + } + int32_t A15372_crd_size = 1048576; + A15372_crd = (int32_t*)malloc(sizeof(int32_t) * A15372_crd_size); + int32_t i1543A1537 = 0; + + for (int32_t i1547 = 0; i1547 < ((B1_dimension + 15) / 16); i1547++) { + for (int32_t i1548 = 0; i1548 < 16; i1548++) { + int32_t i1542 = i1547 * 16 + i1548; + if (i1542 >= B1_dimension) + continue; + + int32_t pA15372_begin = i1543A1537; + + for (int32_t i1543B = B2_pos[i1542]; i1543B < B2_pos[(i1542 + 1)]; i1543B++) { + int32_t i1543 = B2_crd[i1543B]; + if (A15372_crd_size <= i1543A1537) { + A15372_crd = (int32_t*)realloc(A15372_crd, sizeof(int32_t) * (A15372_crd_size * 2)); + A15372_crd_size *= 2; + } + A15372_crd[i1543A1537] = i1543; + i1543A1537++; + } + + A15372_pos[i1542 + 1] = i1543A1537 - pA15372_begin; + } + } + + int32_t csA15372 = 0; + for (int32_t pA153720 = 1; pA153720 < (A15371_dimension + 1); pA153720++) { + csA15372 += A15372_pos[pA153720]; + A15372_pos[pA153720] = csA15372; + } + + A1537_vals = (double*)malloc(sizeof(double) * (i1543A1537 * A15373_dimension)); + + A1537->indices[1][0] = (uint8_t*)(A15372_pos); + A1537->indices[1][1] = (uint8_t*)(A15372_crd); + A1537->vals = (uint8_t*)A1537_vals; + return 0; +} + +int compute(taco_tensor_t *A1537, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D) { + int A15371_dimension = (int)(A1537->dimensions[0]); + int A15373_dimension = (int)(A1537->dimensions[2]); + int* restrict A15372_pos = (int*)(A1537->indices[1][0]); + double* restrict A1537_vals = (double*)(A1537->vals); + int B1_dimension = (int)(B->dimensions[0]); + int* restrict B2_pos = (int*)(B->indices[1][0]); + int* restrict B2_crd = (int*)(B->indices[1][1]); + int* restrict B3_pos = (int*)(B->indices[2][0]); + int* restrict B3_crd = (int*)(B->indices[2][1]); + double* restrict B_vals = (double*)(B->vals); + int 
C1_dimension = (int)(C->dimensions[0]); + int C2_dimension = (int)(C->dimensions[1]); + double* restrict C_vals = (double*)(C->vals); + int D1_dimension = (int)(D->dimensions[0]); + int D2_dimension = (int)(D->dimensions[1]); + double* restrict D_vals = (double*)(D->vals); + + // int32_t i1543A1537 = 0; + + #pragma omp parallel for schedule(static) + for (int32_t pA1537 = 0; pA1537 < (A15372_pos[A15371_dimension] * A15373_dimension); pA1537++) { + A1537_vals[pA1537] = 0.0; + } + + #pragma omp parallel for schedule(runtime) + for (int32_t i1547 = 0; i1547 < ((B1_dimension + 15) / 16); i1547++) { + for (int32_t i1548 = 0; i1548 < 16; i1548++) { + int32_t i1542 = i1547 * 16 + i1548; + if (i1542 >= B1_dimension) + continue; + + for (int32_t i1543B = B2_pos[i1542]; i1543B < B2_pos[(i1542 + 1)]; i1543B++) { + for (int32_t i1544B = B3_pos[i1543B]; i1544B < B3_pos[(i1543B + 1)]; i1544B++) { + int32_t i1544 = B3_crd[i1544B]; + for (int32_t i1553 = 0; i1553 < ((D1_dimension + 31) / 32); i1553++) { + for (int32_t i1555 = 0; i1555 < ((D2_dimension + 31) / 32); i1555++) { + for (int32_t i1554 = 0; i1554 < 32; i1554++) { + int32_t i1545 = i1553 * 32 + i1554; + int32_t i1545C = i1544 * C2_dimension + i1545; + if (i1545 >= D1_dimension) + continue; + + for (int32_t i1556 = 0; i1556 < 32; i1556++) { + int32_t i1546 = i1555 * 32 + i1556; + // int32_t i1546A1537 = i1543A1537 * A15373_dimension + i1546; + int32_t i1546A1537 = i1544B * A15373_dimension + i1546; + int32_t i1546D = i1545 * D2_dimension + i1546; + if (i1546 >= D2_dimension) + continue; + + A1537_vals[i1546A1537] = A1537_vals[i1546A1537] + (B_vals[i1544B] * C_vals[i1545C]) * D_vals[i1546D]; + } + } + } + } + } + + // i1543A1537++; + } + } + } + return 0; +} +#include "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/ttm_ttm/ttm_original.h" +int _shim_assemble(void** parameterPack) { + return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), 
(taco_tensor_t*)(parameterPack[3])); +} +int _shim_compute(void** parameterPack) { + return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3])); +} diff --git a/test/kernels/ttm_ttm/ttm_original copy.c b/test/kernels/ttm_ttm/ttm_original copy.c new file mode 100644 index 000000000..2db396c0a --- /dev/null +++ b/test/kernels/ttm_ttm/ttm_original copy.c @@ -0,0 +1,225 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + 
return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +int assemble(taco_tensor_t *A1537, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D) { + int A15371_dimension = (int)(A1537->dimensions[0]); + int A15373_dimension = (int)(A1537->dimensions[2]); + int* restrict A15372_pos = (int*)(A1537->indices[1][0]); + int* restrict A15372_crd = 
(int*)(A1537->indices[1][1]); + double* restrict A1537_vals = (double*)(A1537->vals); + int B1_dimension = (int)(B->dimensions[0]); + int* restrict B2_pos = (int*)(B->indices[1][0]); + int* restrict B2_crd = (int*)(B->indices[1][1]); + + A15372_pos = (int32_t*)malloc(sizeof(int32_t) * (A15371_dimension + 1)); + A15372_pos[0] = 0; + for (int32_t pA15372 = 1; pA15372 < (A15371_dimension + 1); pA15372++) { + A15372_pos[pA15372] = 0; + } + int32_t A15372_crd_size = 1048576; + A15372_crd = (int32_t*)malloc(sizeof(int32_t) * A15372_crd_size); + int32_t i1543A1537 = 0; + + for (int32_t i1547 = 0; i1547 < ((B1_dimension + 15) / 16); i1547++) { + for (int32_t i1548 = 0; i1548 < 16; i1548++) { + int32_t i1542 = i1547 * 16 + i1548; + if (i1542 >= B1_dimension) + continue; + + int32_t pA15372_begin = i1543A1537; + + for (int32_t i1543B = B2_pos[i1542]; i1543B < B2_pos[(i1542 + 1)]; i1543B++) { + int32_t i1543 = B2_crd[i1543B]; + if (A15372_crd_size <= i1543A1537) { + A15372_crd = (int32_t*)realloc(A15372_crd, sizeof(int32_t) * (A15372_crd_size * 2)); + A15372_crd_size *= 2; + } + A15372_crd[i1543A1537] = i1543; + i1543A1537++; + } + + A15372_pos[i1542 + 1] = i1543A1537 - pA15372_begin; + } + } + + int32_t csA15372 = 0; + for (int32_t pA153720 = 1; pA153720 < (A15371_dimension + 1); pA153720++) { + csA15372 += A15372_pos[pA153720]; + A15372_pos[pA153720] = csA15372; + } + + A1537_vals = (double*)malloc(sizeof(double) * (i1543A1537 * A15373_dimension)); + + A1537->indices[1][0] = (uint8_t*)(A15372_pos); + A1537->indices[1][1] = (uint8_t*)(A15372_crd); + A1537->vals = (uint8_t*)A1537_vals; + return 0; +} + +int compute(taco_tensor_t *A1537, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D) { + int A15371_dimension = (int)(A1537->dimensions[0]); + int A15373_dimension = (int)(A1537->dimensions[2]); + double* restrict A1537_vals = (double*)(A1537->vals); + int B1_dimension = (int)(B->dimensions[0]); + int* restrict B2_pos = (int*)(B->indices[1][0]); + int* restrict B2_crd = 
(int*)(B->indices[1][1]); + int* restrict B3_pos = (int*)(B->indices[2][0]); + int* restrict B3_crd = (int*)(B->indices[2][1]); + double* restrict B_vals = (double*)(B->vals); + int C1_dimension = (int)(C->dimensions[0]); + int C2_dimension = (int)(C->dimensions[1]); + double* restrict C_vals = (double*)(C->vals); + int D1_dimension = (int)(D->dimensions[0]); + int D2_dimension = (int)(D->dimensions[1]); + double* restrict D_vals = (double*)(D->vals); + + // int32_t i1543A1537 = 0; + + #pragma omp parallel for schedule(runtime) + for (int32_t i1547 = 0; i1547 < ((B1_dimension + 15) / 16); i1547++) { + for (int32_t i1548 = 0; i1548 < 16; i1548++) { + int32_t i1542 = i1547 * 16 + i1548; + if (i1542 >= B1_dimension) + continue; + + for (int32_t i1543B = B2_pos[i1542]; i1543B < B2_pos[(i1542 + 1)]; i1543B++) { + for (int32_t i1546 = 0; i1546 < D2_dimension; i1546++) { + // int32_t i1546A1537 = i1543A1537 * A15373_dimension + i1546; + int32_t i1546A1537 = i1543B * A15373_dimension + i1546; + double ti1544A1537_val = 0.0; + for (int32_t i1544B = B3_pos[i1543B]; i1544B < B3_pos[(i1543B + 1)]; i1544B++) { + int32_t i1544 = B3_crd[i1544B]; + for (int32_t i1545 = 0; i1545 < D1_dimension; i1545++) { + int32_t i1545C = i1544 * C2_dimension + i1545; + int32_t i1546D = i1545 * D2_dimension + i1546; + ti1544A1537_val += (B_vals[i1544B] * C_vals[i1545C]) * D_vals[i1546D]; + } + } + A1537_vals[i1546A1537] = ti1544A1537_val; + } + // i1543A1537++; + } + } + } + return 0; +} +#include "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/ttm_ttm/ttm_original.h" +int _shim_assemble(void** parameterPack) { + return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3])); +} +int _shim_compute(void** parameterPack) { + return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3])); +} diff 
--git a/test/kernels/ttm_ttm/ttm_original.c b/test/kernels/ttm_ttm/ttm_original.c new file mode 100644 index 000000000..ac2674239 --- /dev/null +++ b/test/kernels/ttm_ttm/ttm_original.c @@ -0,0 +1,226 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // 
always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +int assemble(taco_tensor_t *A1537, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D) { + int A15371_dimension = (int)(A1537->dimensions[0]); + int A15373_dimension = (int)(A1537->dimensions[2]); + int* restrict A15372_pos = (int*)(A1537->indices[1][0]); + int* restrict A15372_crd = (int*)(A1537->indices[1][1]); + double* restrict A1537_vals = (double*)(A1537->vals); + int B1_dimension = (int)(B->dimensions[0]); + int* restrict B2_pos = (int*)(B->indices[1][0]); + int* restrict B2_crd = (int*)(B->indices[1][1]); + + A15372_pos = 
(int32_t*)malloc(sizeof(int32_t) * (A15371_dimension + 1)); + A15372_pos[0] = 0; + for (int32_t pA15372 = 1; pA15372 < (A15371_dimension + 1); pA15372++) { + A15372_pos[pA15372] = 0; + } + int32_t A15372_crd_size = 1048576; + A15372_crd = (int32_t*)malloc(sizeof(int32_t) * A15372_crd_size); + int32_t i1543A1537 = 0; + + for (int32_t i1547 = 0; i1547 < ((B1_dimension + 15) / 16); i1547++) { + for (int32_t i1548 = 0; i1548 < 16; i1548++) { + int32_t i1542 = i1547 * 16 + i1548; + if (i1542 >= B1_dimension) + continue; + + int32_t pA15372_begin = i1543A1537; + + for (int32_t i1543B = B2_pos[i1542]; i1543B < B2_pos[(i1542 + 1)]; i1543B++) { + int32_t i1543 = B2_crd[i1543B]; + if (A15372_crd_size <= i1543A1537) { + A15372_crd = (int32_t*)realloc(A15372_crd, sizeof(int32_t) * (A15372_crd_size * 2)); + A15372_crd_size *= 2; + } + A15372_crd[i1543A1537] = i1543; + i1543A1537++; + } + + A15372_pos[i1542 + 1] = i1543A1537 - pA15372_begin; + } + } + + int32_t csA15372 = 0; + for (int32_t pA153720 = 1; pA153720 < (A15371_dimension + 1); pA153720++) { + csA15372 += A15372_pos[pA153720]; + A15372_pos[pA153720] = csA15372; + } + + A1537_vals = (double*)malloc(sizeof(double) * (i1543A1537 * A15373_dimension)); + + A1537->indices[1][0] = (uint8_t*)(A15372_pos); + A1537->indices[1][1] = (uint8_t*)(A15372_crd); + A1537->vals = (uint8_t*)A1537_vals; + return 0; +} + +int compute(taco_tensor_t *A1537, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D) { + int A15371_dimension = (int)(A1537->dimensions[0]); + int A15373_dimension = (int)(A1537->dimensions[2]); + double* restrict A1537_vals = (double*)(A1537->vals); + int B1_dimension = (int)(B->dimensions[0]); + int* restrict B2_pos = (int*)(B->indices[1][0]); + int* restrict B2_crd = (int*)(B->indices[1][1]); + int* restrict B3_pos = (int*)(B->indices[2][0]); + int* restrict B3_crd = (int*)(B->indices[2][1]); + double* restrict B_vals = (double*)(B->vals); + int C1_dimension = (int)(C->dimensions[0]); + int C2_dimension = 
(int)(C->dimensions[1]); + double* restrict C_vals = (double*)(C->vals); + int D1_dimension = (int)(D->dimensions[0]); + int D2_dimension = (int)(D->dimensions[1]); + double* restrict D_vals = (double*)(D->vals); + + // int32_t i1543A1537 = 0; + + #pragma omp parallel for schedule(runtime) + for (int32_t i1547 = 0; i1547 < ((B1_dimension + 15) / 16); i1547++) { + for (int32_t i1548 = 0; i1548 < 16; i1548++) { + int32_t i1542 = i1547 * 16 + i1548; + if (i1542 >= B1_dimension) + continue; + + for (int32_t i1543B = B2_pos[i1542]; i1543B < B2_pos[(i1542 + 1)]; i1543B++) { + for (int32_t i1546 = 0; i1546 < D2_dimension; i1546++) { + // int32_t i1546A1537 = i1543A1537 * A15373_dimension + i1546; + int32_t i1546A1537 = i1543B * A15373_dimension + i1546; + double ti1544A1537_val = 0.0; + for (int32_t i1544B = B3_pos[i1543B]; i1544B < B3_pos[(i1543B + 1)]; i1544B++) { + int32_t i1544 = B3_crd[i1544B]; + for (int32_t i1545 = 0; i1545 < D1_dimension; i1545++) { + int32_t i1545C = i1544 * C2_dimension + i1545; + int32_t i1546D = i1545 * D2_dimension + i1546; + ti1544A1537_val += (B_vals[i1544B] * C_vals[i1545C]) * D_vals[i1546D]; + } + } + A1537_vals[i1546A1537] = ti1544A1537_val; + } + // i1543A1537++; + } + } + } + return 0; +} + +#include "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/ttm_ttm/ttm_original.h" +int _shim_assemble(void** parameterPack) { + return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3])); +} +int _shim_compute(void** parameterPack) { + return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3])); +} diff --git a/test/kernels/ttm_ttm/ttm_original.h b/test/kernels/ttm_ttm/ttm_original.h new file mode 100644 index 000000000..a27841047 --- /dev/null +++ b/test/kernels/ttm_ttm/ttm_original.h @@ -0,0 +1,125 @@ +#ifndef TACO_C_HEADERS +#define 
TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + 
else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +#ifndef TACO_GENERATED_assemble +#define TACO_GENERATED_assemble +int assemble(taco_tensor_t *A1537, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D); +#endif + +#ifndef TACO_GENERATED_compute +#define TACO_GENERATED_compute +int compute(taco_tensor_t *A1537, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D); +#endif diff --git a/test/kernels/ttm_ttm/ttm_original.so b/test/kernels/ttm_ttm/ttm_original.so new file mode 100755 index 0000000000000000000000000000000000000000..fa04aed35a00dd5622e9b53f76c56020c2b923bb GIT binary patch literal 14408 zcmeHOeQ;aVmA}tYh!YzEg4$| zho(uWVwSfLg>>64%$82mZg-dIc1o6^rDeOtjzc~u%$o4gY+>7MTR?L+e)pVnzrWr)-q{##Dp3?BgM(eih&wUQLi{4($Vw3b@v~Ys z7r)EcC0usSRMNDVztVyvWlZA$<#7(4iHkT3s_}@VyGktlenF)kv83m|*h+U_tnrAX 
zqoOL3mRe%W2};iqk#46RAtx!xQfPTS-A3{K=W%u#OQluU$pdJ-hKQl$rNQl=`K8EW-gw6 zno>HirlhRgS+CqbM^h+mAs*_s`o@>v-v7=0k3RL=zuDej-rBu8aevwO?;P28{Lalw z_MQ1+qW){~7k`p{$JJ8youhBP_y^a(&%f@gJMz~FsLmSZpul9`9+m*m>>&7UcxICS zhgtCXv)JD|i#)ZThQ}Vp0L+x{f?4cGXOXW3uHmtV#k1JC1%k`$!r(s#c{e+c4QN_X zYBjf0K3Ujri1Cr+^5?T%Hu-UAFJ$Fx-vZ0bR$=W&0^hG%fSl8(1>PeD=#{Wbp;g$o zm*)!LWWPtu7aEfk)(U)|01jF(>O}JM*o2&@0$(lU_uA~7&*jf$-xatJXHRjwoDGTc zZx9JD2wW58*QG)v(%zj)M*7mx-gG3wBAv<3G>araSR}M@eWWek8*lIIOUHXRu5ai{ zCF2{TtzB^|ZRV6nY#>SoqFtT$#98xA>sumSsZ>v-w?CQg?2cPm&44H41L*=do@|RG zQ+G$HIHVK%LL}Dlg-9aW*~MbrJq$=H(u!<)w{3_=dt)7e1PVDFu8${Dy>ZqZ?dnR! zSX-Pb7~#t4c(O0m8%eW7ZyW;g?!I`M^~NnhUhbY~FSvLYi}ZDLc1NOpeerHyd(OsE z-97#3IP2sZ!lC+x$W`7| zld-G3*H8!1Aa-C)E8*)^iJ)^pmA~+hG!@7M`PgKv2K?xymt#qu!}*##aLJ<3q#O3~3YQy&m`5sNmUwEKvAvu>Ks+_c*e=dLMLc!SSP$nPC!U&SY%Ax# zM?5vjSPSRBLp(Lbn4j~@z$f;C=Ry6!7SD(GhCLtpcQK~^V58^5k%U)Wy4y-z%Jl5I zJE5>i&(7%t)$1m7X~)7QEU^Hr(8GJQF(F~p^7;j zKZoZ01iV7(s2F4Kj=*Hl62D_C0SSu_+!na?DeA4jrohHc8}xhM&VkX*Yt4w>rkl>o zs)#jqdya$phRkMv(7d}goZamih`1@I`zgh2E~Pm2li9pB+jB9PePA*}su~*i z<%iEY_1Kx*M<3<$yNA@>FJeH2Vu$qM6KL3=rz6{#2dUCyBjN0-XQ0PI#$ig)Lc?!5 z!^Ze}$eI5vwxhZeEFm`iC>dBc2~aNk$Czw>ywrXO&*v^K;Au9U1L{lx}YEw3rnS zo56>jb))K@$B<@*Ju3!}s(b#P%RRyy19tV;v&%#;AXY|aF0C=I!V-+wUqT%vj=T>7(lI_U6L`C$D8|1db3(~%_f$F6`@V_tW2L#^bU<}Ge?>Og zY4!GlcuG#In~w*Kw~PaUfHyvw)5%sYH3AN&st~?ufAs5P)zvMfnE5}1w z=OI25)v9@!1B#v*b|=)^pFgd-S)yJ&-&qw_CIWS%z=jj*=Fz)BRBt*8VF>~`lX*;|#mm#M(0 zKa`X|qqlgP+xb!;Tj3yu0^t%%ct`T%1RX-Kkpwe`&~wM?-m{89`4^d`>~}0t&QjKA zg356!ndguC!R=f_OfSF#3g#H-b~mOT1P1hsHSGPE z3^cja2Y>bztv+q~)z)&`JX#1N5yfKaywP1b6DjqvsSN`PAOHS62 z{p>1tz!)d<0cD(<%F55;~FG-`DNE8L2&LlTpJqo*=#{2_M)sh~7_ zv4}R5=AN{GkZ}-fW$sb1x(Q*deE>A95HFK~+=ECkkxAoVo|H8tRvCS6bZBvyF*&x&^bQ2rBA11Ij91e>;=UKaJBP%e{yg`YmJ(t9xFByq;}vVq>D7 zVW72w@-tolDsGGyOnT}DKgG1LaRehfY<78S8hvNN#+miT??c9W;{1RWz+Im$Us)IS zRH%Et1#6AwjUP7Fxzs%&$}N`ml!szR!$y|}yZQV2@^|#?iXT#@I!07gVELbn55n0c zdxFN%^~QgmyiksJEH^;|8+GOxXSQN-pziz~)q%QSH=1qLZ0Bzq6*T8zqudjUy{`|y zQIb3R9tLQ+=j2gzvNd0*yp_g(Tp9M1Af#$@_n;E| 
zOo8=#2|rHu&y$rO$-VY2PND_twQk;ovfSp@&AU;E>&Kq^nDlb8PInbd@)7ika-m~Q z8kFvITcJy*Hw`kh;naW*;9OOMvWi+ z?$TZC`cJL-^wpka1e1Oc50#nLK^ek`Y{9b`Wq+R5-Flb%!IFActpP&v7iXb3iE&HiYLT zVDh~XVK3p)XafeNKs1ME3o!nkYpN`blVxd|EXxh99*3g1YJ<~C0vlC$sLlOo>#c}W zy&fnDxGL{+1YBCiS+dbp8Q{gnX&T8}A)AIQe}6ZXFX}g~4T7$HisPmdv_ZqPoSUf) zDDVBScM$!05OIc#?xVajj-YFoQu4%nS7n3C9iS$oc6bf49XQv>J|=9O%45Tv7z_K~ z$>$f5%}>H7E$~STe9{7+w7~z%7Lf0)<$G%RUK-0j5A+V5g5+-&k#Y;3{9Ys9Ez9pQ zi)H#n8joElc-m7ae99tBz89yn1BFtHu=md9`FBd%e<;X3e1gsf2=d+WW^pc&?|{D{ zwl`a`N4@%nwH>Uj9*0P#mE+ zFY_H2{MSS}jv72&R8Ls_za#Ryxjo0rMa5PN+9c?$f+hsLOVIlS{aZo*QP3X=dPq?D zouj#-;WJv*rdE6#>({RJUgNE)zPg`tz8$_=Z_TRZ7RpwjWLD!@O)m%RY=@A~Lf&dj ztBcEb19;qCJu*6uA7z8xp7O}8z#Tg%@$(er!RefPmLkdm!1lMF+BK8MfQ{rT<79f zq2Y)}@p@NU|NDAlE9nDUEVeS1N+vqnS9V@qyBfN^=|r?I-5pJLcw^S|oL3?%;dqfL zj`_-GpDn?6G6jABT7%DLHKbPEjjx#P)orPMe95b>tpSkkPsZ1^$CL5i&KOwwn7D4B zc6IgYRa$ks{D0F`TD5!&+rBl5CdpcvRdmz@= z%RZ()D~qeMlG|U2N>^W7-Pqj{tZ5MpA0i?dMU1Rog(4*T2ddkX{i2;1K4aoDXJV;Kj! z>4;cw3WXgQ1ah4##vM$qZ^gKi$#t+8pTp#OQjDL&EGum z-NEKFw`jFu`7$;-BVIn~S5z#2?zHdumVMWxzfG~c%Bl)K@`*$TZhG?jY_a@;$^E7n zUkE=Utq|gI2V2DCci6&22i;<|ndHx7a$PHyKYyCP2BkZg++_--375Nw9hbYR9hbYQ z9hbYV9hbYU9lxj;5S)WuJPoH|>}2oc?FlM?68L?{nIe~GvOJSK-BC#%L#QZxTHt;g zzE0rsoFbJrAfm9Tc9zqumE+U<-4J%<`A6EJH_XZmdUe599^I|1$*y;A` zg`ELS%Gx=4$ z!{w*1OYhHO|2H_JkbPNL>4&w5UlZ{{auM*E%5x3yWrczwelwS!VZSRrs1zSHkk*&(Pb6?4n{peAq`Mz`@E(H8pK$+!)yys1G;Va)hfI_c)s@1!XBf@>}!!rS};5jQu5- zz3HBKnh86Zz}I1kHdoOXK@|0Ir&YK5}t&-jc0Uy9L6ILh`bCOdB5l>- z4wuZ^x2+pJ4Vvz?Xorj?ai8$wc<<$xg6hsT#<`AYUkCHHZA(JaqG?<+Nbat9Z(nCB zS;Rz;)*J7Nl7fi!bfuY>=k7(++nxfOCLc^M?{aUvLrklVHYiKZQnBWjB`M>u9PP%1 zhb)mr6pH*Gy>NeIUW}@4Ok7~`*8X;=M3e1U5d_UuaWOU*2mZZ4+`>UU`u3Q3Q*> zC}51<-AH|T|B-Z`NT7VNHB7!QlE)@LFtRQ6G;2tC9NhJ|!uY zkAf_}l#_H5(&*ifWE!HA_cM7<8zK-v<}dYS{gX%|{dy8Y zkoP!At(PH|K5eOX{Q+RqCi4B7^v9Oeeq*7$3PT^5LVbBJlys<27;*VNP*VQB1-Q&# z-XA5E<&|b6FX>+)p)h}W50Uhsd@(2t$b6-}tW96O&y>`Dowe8ht3qG)zdVOanwA*} zYS;g^O<&$`C9RZ(g}$t}H2cqp(5odmKIA=E{vC_)Hw*U-cnDN>&r+Z5vecl-zS!m7GoO!o%)gbl0Ju|LjB2m 
zx=`$yQsptJCn-4y73$0Te2>tVa?*^Qe-#OY$@0tl(7-JElviOm0fw-eLJ)DOzubmP zeM!fVF5`uw-_|V7Zr3h50Wmv9nlEVd%h=~FGWlzlbpo(sB)swxOQmU2(~8(-7tW&J zFZA0#M*m`=Z?ETTLca{@g+a2CE=NSh3+3KiW^vaSf{5E?>07@YBbCc9wG=grO#a$s ziMLI0E~^Xu6XN@W2DA>39Dj2Cr}asWJwzu%)s +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while 
(upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +int assemble(taco_tensor_t *A1542, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D) { + int A15421_dimension = (int)(A1542->dimensions[0]); + int A15423_dimension = (int)(A1542->dimensions[2]); + int* restrict A15422_pos = (int*)(A1542->indices[1][0]); + int* restrict A15422_crd = (int*)(A1542->indices[1][1]); + double* restrict A1542_vals = (double*)(A1542->vals); + int B1_dimension = (int)(B->dimensions[0]); + int* restrict B2_pos = (int*)(B->indices[1][0]); + int* restrict B2_crd = (int*)(B->indices[1][1]); + + A15422_pos = (int32_t*)malloc(sizeof(int32_t) * 
(A15421_dimension + 1)); + A15422_pos[0] = 0; + for (int32_t pA15422 = 1; pA15422 < (A15421_dimension + 1); pA15422++) { + A15422_pos[pA15422] = 0; + } + int32_t A15422_crd_size = 1048576; + A15422_crd = (int32_t*)malloc(sizeof(int32_t) * A15422_crd_size); + int32_t i1548A1542 = 0; + + for (int32_t i1552 = 0; i1552 < ((B1_dimension + 15) / 16); i1552++) { + for (int32_t i1553 = 0; i1553 < 16; i1553++) { + int32_t i1547 = i1552 * 16 + i1553; + if (i1547 >= B1_dimension) + continue; + + int32_t pA15422_begin = i1548A1542; + + for (int32_t i1548B = B2_pos[i1547]; i1548B < B2_pos[(i1547 + 1)]; i1548B++) { + int32_t i1548 = B2_crd[i1548B]; + if (A15422_crd_size <= i1548A1542) { + A15422_crd = (int32_t*)realloc(A15422_crd, sizeof(int32_t) * (A15422_crd_size * 2)); + A15422_crd_size *= 2; + } + A15422_crd[i1548A1542] = i1548; + i1548A1542++; + } + + A15422_pos[i1547 + 1] = i1548A1542 - pA15422_begin; + } + } + + int32_t csA15422 = 0; + for (int32_t pA154220 = 1; pA154220 < (A15421_dimension + 1); pA154220++) { + csA15422 += A15422_pos[pA154220]; + A15422_pos[pA154220] = csA15422; + } + + A1542_vals = (double*)malloc(sizeof(double) * (i1548A1542 * A15423_dimension)); + + A1542->indices[1][0] = (uint8_t*)(A15422_pos); + A1542->indices[1][1] = (uint8_t*)(A15422_crd); + A1542->vals = (uint8_t*)A1542_vals; + return 0; +} + +int compute(taco_tensor_t *A1542, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D) { + int A15421_dimension = (int)(A1542->dimensions[0]); + int A15423_dimension = (int)(A1542->dimensions[2]); + int* restrict A15422_pos = (int*)(A1542->indices[1][0]); + double* restrict A1542_vals = (double*)(A1542->vals); + int B1_dimension = (int)(B->dimensions[0]); + int* restrict B2_pos = (int*)(B->indices[1][0]); + int* restrict B2_crd = (int*)(B->indices[1][1]); + int* restrict B3_pos = (int*)(B->indices[2][0]); + int* restrict B3_crd = (int*)(B->indices[2][1]); + double* restrict B_vals = (double*)(B->vals); + int C1_dimension = (int)(C->dimensions[0]); + int 
C2_dimension = (int)(C->dimensions[1]); + double* restrict C_vals = (double*)(C->vals); + int D1_dimension = (int)(D->dimensions[0]); + int D2_dimension = (int)(D->dimensions[1]); + double* restrict D_vals = (double*)(D->vals); + +// int32_t i1548A1542 = 0; + + #pragma omp parallel for schedule(static) + for (int32_t pA1542 = 0; pA1542 < (A15422_pos[A15421_dimension] * A15423_dimension); pA1542++) { + A1542_vals[pA1542] = 0.0; + } + + #pragma omp parallel for schedule(runtime) + for (int32_t i1552 = 0; i1552 < ((B1_dimension + 15) / 16); i1552++) { + for (int32_t i1553 = 0; i1553 < 16; i1553++) { + int32_t i1547 = i1552 * 16 + i1553; + if (i1547 >= B1_dimension) + continue; + + for (int32_t i1548B = B2_pos[i1547]; i1548B < B2_pos[(i1547 + 1)]; i1548B++) { + for (int32_t i1549B = B3_pos[i1548B]; i1549B < B3_pos[(i1548B + 1)]; i1549B++) { + int32_t i1549 = B3_crd[i1549B]; + for (int32_t i1550 = 0; i1550 < D1_dimension; i1550++) { + int32_t i1550C = i1549 * C2_dimension + i1550; + for (int32_t i1551 = 0; i1551 < D2_dimension; i1551++) { + // int32_t i1551A1542 = i1548A1542 * A15423_dimension + i1551; + int32_t i1551A1542 = i1548B * A15423_dimension + i1551; + int32_t i1551D = i1550 * D2_dimension + i1551; + A1542_vals[i1551A1542] = A1542_vals[i1551A1542] + (B_vals[i1549B] * C_vals[i1550C]) * D_vals[i1551D]; + } + } + } + // i1548A1542++; + } + } + } + return 0; +} +#include "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/ttm_ttm/ttm_original2.h" +int _shim_assemble(void** parameterPack) { + return assemble((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3])); +} +int _shim_compute(void** parameterPack) { + return compute((taco_tensor_t*)(parameterPack[0]), (taco_tensor_t*)(parameterPack[1]), (taco_tensor_t*)(parameterPack[2]), (taco_tensor_t*)(parameterPack[3])); +} diff --git a/test/kernels/ttm_ttm/ttm_original2.h b/test/kernels/ttm_ttm/ttm_original2.h new file mode 
100644 index 000000000..8a08b4548 --- /dev/null +++ b/test/kernels/ttm_ttm/ttm_original2.h @@ -0,0 +1,125 @@ +#ifndef TACO_C_HEADERS +#define TACO_C_HEADERS +#include +#include +#include +#include +#include +#include +#include +#include +#if _OPENMP +#include +#endif +#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b)) +#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b)) +#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a) +#ifndef TACO_TENSOR_T_DEFINED +#define TACO_TENSOR_T_DEFINED +typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; +typedef struct { + int32_t order; // tensor order (number of modes) + int32_t* dimensions; // tensor dimensions + int32_t csize; // component size + int32_t* mode_ordering; // mode storage ordering + taco_mode_t* mode_types; // mode storage types + uint8_t*** indices; // tensor index data (per mode) + uint8_t* vals; // tensor values + int32_t vals_size; // values array size +} taco_tensor_t; +#endif +#if !_OPENMP +int omp_get_thread_num() { return 0; } +int omp_get_max_threads() { return 1; } +#endif +int cmp(const void *a, const void *b) { + return *((const int*)a) - *((const int*)b); +} +int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayStart] >= target) { + return arrayStart; + } + int lowerBound = arrayStart; // always < target + int upperBound = arrayEnd; // always >= target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return upperBound; +} +int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) { + if (array[arrayEnd] <= target) { + return arrayEnd; + } + int lowerBound = arrayStart; // always <= target + int upperBound = arrayEnd; // always > target + while (upperBound - lowerBound > 1) { + int mid = (upperBound + lowerBound) / 
2; + int midValue = array[mid]; + if (midValue < target) { + lowerBound = mid; + } + else if (midValue > target) { + upperBound = mid; + } + else { + return mid; + } + } + return lowerBound; +} +taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize, + int32_t* dimensions, int32_t* mode_ordering, + taco_mode_t* mode_types) { + taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t)); + t->order = order; + t->dimensions = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t)); + t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t)); + t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***)); + t->csize = csize; + for (int32_t i = 0; i < order; i++) { + t->dimensions[i] = dimensions[i]; + t->mode_ordering[i] = mode_ordering[i]; + t->mode_types[i] = mode_types[i]; + switch (t->mode_types[i]) { + case taco_mode_dense: + t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **)); + break; + case taco_mode_sparse: + t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **)); + break; + } + } + return t; +} +void deinit_taco_tensor_t(taco_tensor_t* t) { + for (int i = 0; i < t->order; i++) { + free(t->indices[i]); + } + free(t->indices); + free(t->dimensions); + free(t->mode_ordering); + free(t->mode_types); + free(t); +} +#endif + +#ifndef TACO_GENERATED_assemble +#define TACO_GENERATED_assemble +int assemble(taco_tensor_t *A1542, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D); +#endif + +#ifndef TACO_GENERATED_compute +#define TACO_GENERATED_compute +int compute(taco_tensor_t *A1542, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D); +#endif diff --git a/test/kernels/ttm_ttm/ttm_original2.so b/test/kernels/ttm_ttm/ttm_original2.so new file mode 100755 index 0000000000000000000000000000000000000000..6466a2af25f27b4b4cc11283e998dc09686efe76 GIT binary patch literal 14568 zcmeHOYjjiBl|EMzSb(D|h)bM=28G^87LwKc+Ws=o2)3j61YMRP{HV&Bpd7=6C 
zx#!4|Y&SE1X8!cR)js<>d+)Q)KCk;Y-d7u_vndLb$tp}t)jlU z;^GwXkW@5>x2*s+?6~Uh-OhfT`oZ@$>}tRAWWz&u|9Isqk9_je6JMt)sGRB|GkLg< zc@*b@GTWkjN40Wbo=2fFvU3xz>e^piJpQHQ-}vG0J~!B(-`qJIJ+SDTpE!B!g-`5Q zcI@&eqt%Z@o_jL!nyYcqSI@oj+^epEpFCP#b@p$fP@OZ~0f(2&Qo0P7XBK=9u-W9F znFD`(4*WNB;D_hHzc&X?@%G@#O*hQZuF5&^5Dd)Lu1esZT;t%qbJ*F6vK6z)e;V>` z_C9vNeKkVwKHa5)2Q7$Uvy0{hx&671@JFhuT+b9cwnk#XuKvzotgSZ^YUxX37EeX2 zGt$`?iLr2J4}*Spuo+JE?%5Uz^@iJgQ8Z~LTpfva_eNM}sH3Ah%vvH8jv!Z#MY{UB zdxJ3+?Tthj>f{o}zZ942JspBAq-)oDIX<{#b-{v^TdRTiVxMT24%u za%)*I(h`b=(8lJzJ`tNB+HOUnqO{D)lv5_qP+b#TQ(BSEttq_|{YXRIj+JsAazqKC z9PKDJru>8D-`w89wVB>OFbDlMP8<&44isKrm_rNt}#?rnSqNX zh^u%r@XY;J&%ntinHn>286zs&k%5a_A+EAJ1D9Pw1#KBPR+V(>$-pz0pn(kBm6m2q zk6(1^=Df{IJxm`NjoDJ;di-hU$+Sf3&aEg)-QI*>b-5Qg;@hZvaw3Iv`zGQQ;>S5( zO+0n+r;ts0INuCDO4~9|>bG`k@9Ym~?|6q8 zQ@_=uy>l{JsxBY43YRlIv2hFv+x5h}Cqeb9tGe;~*o_FGjaY>6x?QP>X_Z|kDQeR^ z^!Jls;?9S-nc{2E*T=5ebz@2&OWva^r}gvKVudg`R$v*-Cxg<3Tzf+s%VCH0uh!!m zAAi7{^A>9@ppO-3PM@xBlH?)xG2rf<8i zY5O*P|3$i^(9Pumb6;7DZaQu+LZNO9Yem4zN6mVFATg|YNGNeoD{)C$dDrQdu| z4?vLSBghfbGwX|tquNn!;-GewQ@_?`)_WmAod5|BSwC*&3n~8tEAOWKsFin8K8d_p zzai0c3z#V|@ljQ6m?}?=U32K+%gL#CQ>o!mb$A4$ts#6`ADcjw{hG&@s7rxX=n2Ou z8b}20_8U{1jcY#RjCs^*jXBI)19$(6*LjrJhnmiU9h-0<_fsl30yA-PXPk=(7vrN2 z!kiQjSmMozk8-ysAP0!z5l<>4HH55wW>`B0N@=t41_a2u-2$tu_Nj-qVj__}tFEJ5 z!z?&r`j0rOPN<1Hc=1syO5tZZF+f)k6@;o2YF*g&Z&n+z}AkW)I)>FkY(O?;b(O7 zM^+3H6+eSfYB)S2x8R)_(9q92wMGEz^u*vgglSOQK{^xULBer{^4uGARgWGX=AH)< z`KQR3o+!v`Fiz`<4d?GoI2P!}1}$Km*pwJt;MT`pL7xw5MaD1Ve^BBn#dr%`<&Bqh zWjub`7DyC~!Tc}CiV@Sw{KmE7YYoa6jR>#Lcpcb=_{-OnE57p=8kBQj$yh+eI$#cJ zjR?kJXsS=-5!*&Xq2i2=J{tlWSnsDc3~B@F=TGQW%QR|7z<8BxyVWm@mj7|Vi33fV zYRx;{cr%cQIs?U%dR2v{e&NLCstNV;qYF;>;;B6KvC$QNO&va=4qv1Jm8f?&RE=`8 zXc}xeQSvNG%qFd(s!1zTKjWZ&@oSY>kd|Ij-FcVP`{(1bU-GFt7=9Krpo?^SK=YQ5 zmOqOkS3sGJss-mPf1;|}p~O!pQT6_@D0Cb)pIYotFc93ns^=b@K%ePV0c}Gpf?5Kw zW%nft=JT$ks`T))dLsXPqG=u`rQ)kP0|2$&@Banr-Jx7k7b|;LN-ZUc$k8B3YZjd6 z!ly{s24S0SAJja4<=w#g7R?+`;NnHQxUpljCT9 zTE2XAG6+5ygB!FHVM#aMrcFQjRUpQxzOm%;TIHr 
z|NXs%uwy)5d^&zj+4pOhOKkKbNJ$k1CMv;-$~P_)78!kR_0T@p_a*96wdE?xyMHtTr4~)uw+Vt1%oKbj)LSwduf$QDXq> z|L#*xvSM;8F^#P3OzAvze1eIaAas&qd=Hw|VU#@NW{-rH)+Un>8jwVvo^O;P-m zvdP@wX~57(e&>x;s%quFq;7t)DEW0T$T*Y#U}XxEk6IZ|@*ri5-%>S`L&)>~ux?G_ z7u#^R!1szqzgcE_o#>05bc^9j>{a~eogI3Z2Ts?`I;TGN2bL3 zo`PGH{0gt?G%?;YXgUw^Exz~})u%R`8N2GJ9l4CM4eFskVY1>*f$k&p#9l}FWy}ZO z3CR-JcnWI*OKw9B{nlWNsfS*Kyq>6WoI!}mE@-8obR)`qP;v7fETeAl^3DzzXECw^ zW`|Z*TYfoUT*lJVV7wvj->|Q^s}uRFu?82YhrSFWwdTF=)KF#@m6!vO|93+-Bo97jBm0J$@3A(~9)?DMzBBbW7|~^Dv&h^m+sa2XhO;$p%g;?#!VjxY_rVezK^JWp8C5@9 zjD@v4b>WNUsmV`3#v1>$UdUVLBXBEV_GaHVA0S^x`N&P6Z~Z1Sv2$U7nhc{05vwZ- zm^u)zJ7BtLcy2a}X+&VC)|CaYsNhBzQPRz7CvL)!#d!8PO;{Rm@FmF%mFdQxbo0X~ zs4v44N8Vd-^qg*PMN1vp>*`(`g0Gb`>77*SkCkN42Zy;5?tJ?BV%7S7kh}-Zf>jN9QecRRZ(5 zimF{6$U>*u<=jLM&tYx(QtG zK!;qQL>S#IT?HoJff7bfAbz?PL7o!PWx$4j@prn@{@14cuS@&C$<8%4Q%e=R=$ z@_9l<^j(pXd|yp#DkV&Hp5C}-EoW1rKDka6iHv-wy90BHl6*INLdescq*Q1T{&t^d zJswVk8l&esN^;K1cfvPVFxw#0Me%F?omVhd%f;Hec%t1ILHo@-J42;{Bc(w4?p z{xE*NO}U(8n*yM0%eAK*y$`s3FBN`_vOGB)dDkeTELLuJUBAfgy4kh;hQ*2-!^c*t z*j;xmTEFOnH!fC|*98>HI zm5)4Q!`CeZ=K?%6<>gi&Jtdv^O4(Y{(%p|wa3z&x0Al@Jk&Ug9u1Ift7%YAM+c;3U zwq$LEr=(T>-?GM2BClw3e|twuJIq+Fc}k*P-6i}ZWJynNcPtW)b@u}7X%Dl`fpA|h zdyn?4&Th_XZhtizU9zsEwzJV+)+iW0L_{%M46dz!3tjyKC9PfkB2En7H1QdP*T_SUXQi>JLS#vT{L?_UI6!2UM&h-0C5=+-Yw=-)(<4MpgTtR&+rWwHw2ld+?l z&WiOXQ`n9{AlJQY+|J~BmyJ7^To<$Pc}%V^+4y`W*QIPckID5d8+WGH!EAiy_jf9{ zvxV%qSRb)K@w}Z~&*b;g%tAZu9-i6c-^b*-mMuT?`~#KSncQ_Ur3sh2N-jQ2W4Rr< zyXMNvT{st)yKgT3{%k;Sc6Q4QoQAQ3y_U)?PyyI*K19wGxjd8Q+2o6a{AkWN`H;Yw z+}OcW+K7wn%lj4%;%6H!CF7iloMz1&pV{vQaIz!shrsegZw!@L#?N~SA4FZOkpY;%<0Ox{DJta{>(!_lv(Cc;T-rXj?Wy= zRUDr=U#fxA(|hhb-7f4L@OVfdokAR+IgcLX_)I(I9QZNd;>b?gHf6pI+>?b^%+KeL zf0g6;EcYDIyTE5VPetE#W~CU=diO_#7%Yr&3{To0&f&c zC`K4hZ;1ZQJy*6)$nVZM_q7GMN9bmfb}pZZSo!_HSImUjm$>{a`)2kHPxdVf%KBpc z(I{^0rjM5eW1Yb;9Wd*|S*VuoU~5NrbEqTOf;+9gV5on9(K)J)NG#G)x^8V%bBK31)F@;f!drJac+&D zLQ8u=RSJ*}v3b4nL>ZkU`*WxLW**|B!+OxC!+mr#%{urdPr8XCZ4@Uur8j*d&+APq 
zSSJi~4)#rZR3Q%UO(W#vv>(%_12epk6=oa@%&0N@pd%d$v`+bDS0N7eWrLZA^JYC0 zIPF;0DaQ=o<~V?ub7F9YlsJ#bJ7?D8mb_+p08!LOrv`&r$1ro6!w-3~(!M>N7$Tst zUW>NLTo;~kO7X~1${$op+FKas+CqJ8th8lM7c?yz!=n$$JrL>bYwzyLVuC2^jdX-a zLF9TmVyu+cUW#mKYd6>!o%<~1l9YK^4t3%oj4Y8wI7Npk zOL3Nxm0~z{Vi5oqY3^@@N~o(9tBYXUyQ1APA8KyyjXWSZJa9#19s!YAIuI&!g{4{6 z!~ZQrw>|Vq1=|2m^7%(eMB5vecV2TUv{Eyg_jclt-_OZrn3W$LHj8wkar zX;q$+dXmxsluUj3eqqRCDM~qun;w6Xe-Rmyk^al)umc`TB3Ga4%1jer2rJ73k(c_# zIk?o9bQ0whivXT}fBl8(z?8*v@yC)MwK3BG44*j*aS&E+b&?kF2 zQ=Dt;5c)-80Fj-$aLMr`*Z&2;sNpeKE;%%!fee1o0m8aia G{r?8d(), {3}); @@ -84,4 +87,193 @@ TEST(indexstmt, spmm) { } +TEST(indexstmt, sddmm) { + Type t(type(), {3,3}); + TensorVar A("A", t, {Sparse, Dense}); + TensorVar B("B", t, {Sparse, Dense}); + TensorVar C("C", t, {Dense, Dense}); + TensorVar w("w", Type(type(),{3}), Dense); + + // the below expression is the concrete index notation + // where (consumer, producer) + IndexStmt spmm = forall(i, + forall(k, + where(forall(j, A(i,j) = w(j)), + forall(j, w(j) += B(i,k)*C(k,j)) + ) + ) + ); + + // after adding scheduling transformations to this concrete-topologically sorted index stmt + // + + std::cout << spmm << std::endl; + spmm = reorderLoopsTopologically(spmm); + std::cout << "topologically reordered loops statement: " << spmm << std::endl; + + Kernel kernel = compile(spmm); + kernel.compute(); +} + +TEST(indexstmt, sddmmPlusSpmm) { + + // Y(i,l) = B(i,j)*C(i,k)*D(k,j) * F(j,l); + // indexstmt order i, j, k, l + //topologically reordered loops statement: forall(i, forall(k, forall(j, forall(l, Y(i,l) += B(i,j) * C(i,k) * D(k,j) * F(j,l), NotParallel, IgnoreRaces), NotParallel, IgnoreRaces), NotParallel, IgnoreRaces), NotParallel, IgnoreRaces) + + Type t(type(), {3,3}); + TensorVar Y("Y", t, {Dense, Dense}); + TensorVar B("B", t, {Dense, Sparse}); + TensorVar C("C", t, {Dense, Dense}); + TensorVar D("D", t, {Dense, Dense}); + TensorVar E("E", t, {Dense, Dense}); + + // TensorVar A("A", Type(type(),{3}), ); + TensorVar A("A", Type()); 
+ + IndexStmt fused1 = + forall(i, + forall(j, + forall(k, + forall(l, Y(i,l) += B(i,j) * C(i,k) * D(j,k) * E(j,l)) + ) + ) + ); + + std::cout << "before topological sort" << fused1 << std::endl; + fused1 = reorderLoopsTopologically(fused1); + std::cout << "after topological sort" << fused1 << std::endl; + + Kernel kernel = compile(fused1); + + + IndexStmt fused2 = + forall(i, + forall(j, + where( + forall(l, Y(i,l) += A * E(j,l)), // consumer + forall(k, A += B(i,j)*C(i,k)*D(j,k)) // producer + ) + ) + ); + + Kernel kernel2 = compile(fused2); + +} + + + +TEST(indexstmt, mttkrpPlusSpmm) { + + // ./bin/taco "A(i,m)=B(i,k,l)*C(k,j)*D(l,j)*E(j,m)" -f=A:dd:0,1 -f=B:sss:0,1,2 -f=C:dd:0,1 -f=D:dd:0,1 -f=E:dd:0,1 + + // i = 11, k = 5, l = 7, j = 8; + long unsigned int idim = 11, kdim = 5, ldim = 7, jdim = 8, mdim = 6; + + Type atype(type(), {idim, mdim}); + Type btype(type(), {idim, kdim, ldim}); + Type ctype(type(), {kdim, jdim}); + Type dtype(type(), {ldim, jdim}); + Type etype(type(), {jdim, mdim}); + + TensorVar A("A", atype, {Dense, Dense}); + TensorVar B("B", btype, {Sparse, Sparse, Sparse}); + TensorVar C("C", ctype, {Dense, Dense}); + TensorVar D("D", dtype, {Dense, Dense}); + TensorVar E("E", etype, {Dense, Dense}); + + TensorVar ws("ws", Type(type(), {jdim}) ); + + IndexStmt fused1 = + forall(i, + forall(k, + forall(l, + forall(j, + forall(m, A(i,m) += B(i,k,l) * C(k,j) * D(l,j) * E(j,m)) + ) + ) + ) + ); + + std::cout << "before topological sort" << fused1 << std::endl; + fused1 = reorderLoopsTopologically(fused1); + std::cout << "after topological sort" << fused1 << std::endl; + + Kernel kernel = compile(fused1); + + IndexStmt fused2 = + forall(i, + where( + forall(j, + forall(m, + A(i,m) += ws(j) * E(j,m) + ) + ) + , + forall(k, + forall(l, + forall(j, + ws(j) += B(i,k,l) * C(k,j) * D(l,j) + ) + ) + ) + ) + ); + + Kernel kernel2 = compile(fused2); + +} + +// ./bin/taco "y(i)=A(i,j)*B(j,k)*v(k)" -f=y:d:0 -f=A:dd:0,1 -f=B:dd:0,1 -f=v:d:0 +TEST(indexstmt, 
mmPlusSpmv) { + + // + + long unsigned int idim = 11, jdim = 8, kdim = 5; + + Type ytype(type(), {idim}); + Type atype(type(), {idim, jdim}); + Type btype(type(), {jdim, kdim}); + Type vtype(type(), {kdim}); + + TensorVar y("y", ytype, {Dense}); + TensorVar A("A", atype, {Dense, Dense}); + TensorVar B("B", btype, {Dense, Dense}); + TensorVar v("v", vtype, {Dense}); + + TensorVar ws("ws", Type(type(), {jdim}) ); + + IndexStmt fused1 = + forall(i, + forall(j, + forall(k, + forall(m, y(i) += A(i,j) * B(j,k) * v(k)) + ) + ) + ); + + std::cout << "before topological sort" << fused1 << std::endl; + fused1 = reorderLoopsTopologically(fused1); + std::cout << "after topological sort" << fused1 << std::endl; + + Kernel kernel = compile(fused1); + + IndexStmt fused2 = + where( + forall(i, + forall(j, + y(i) += A(i,j) * ws(j) + ) + ) + , + forall(j, + forall(k, + ws(j) += B(j,k) * v(k) + ) + ) + ); + + Kernel kernel2 = compile(fused2); +} + diff --git a/test/tests-scheduling-eval.cpp b/test/tests-scheduling-eval.cpp index 59debc88e..29a7e512e 100644 --- a/test/tests-scheduling-eval.cpp +++ b/test/tests-scheduling-eval.cpp @@ -1,88 +1,8 @@ -#include -#include -#include -#include -#include -#include -#include -#include "taco/cuda.h" -#include "test.h" -#include "test_tensors.h" -#include "taco/tensor.h" -#include "taco/index_notation/index_notation.h" -#include "taco/index_notation/transformations.h" -#include "codegen/codegen.h" -#include "taco/lower/lower.h" -#include "taco/util/timers.h" - - -#define TOOL_BENCHMARK_TIMER(CODE,NAME,TIMER) { \ - if (time) { \ - taco::util::Timer timer; \ - timer.start(); \ - CODE; \ - timer.stop(); \ - taco::util::TimeResults result = timer.getResult(); \ - cout << NAME << " " << result << " ms" << endl; \ - TIMER=result; \ - } \ - else { \ - CODE; \ - } \ -} - -using namespace taco; +#include "util.h" + const IndexVar i("i"), j("j"), k("k"), l("l"), m("m"), n("n"); int WARP_SIZE = 32; -void printToCout(IndexStmt stmt) { - std::shared_ptr 
codegen = ir::CodeGen::init_default(cout, ir::CodeGen::ImplementationGen); - ir::Stmt compute = lower(stmt, "compute", false, true); - codegen->compile(compute, true); -} - -void printToFile(string filename, IndexStmt stmt) { - stringstream source; - - string file_path = "eval_generated/"; - mkdir(file_path.c_str(), 0777); - - std::shared_ptr codegen = ir::CodeGen::init_default(source, ir::CodeGen::ImplementationGen); - ir::Stmt compute = lower(stmt, "compute", false, true); - codegen->compile(compute, true); - - ofstream source_file; - string file_ending = should_use_CUDA_codegen() ? ".cu" : ".c"; - source_file.open(file_path + filename + file_ending); - source_file << source.str(); - source_file.close(); -} - -void printToFile(string filename, string additional_filename, IndexStmt stmt) { - stringstream source1; - stringstream source2; - - string file_path = "eval_generated/"; - mkdir(file_path.c_str(), 0777); - - std::shared_ptr codegen = ir::CodeGen::init_default(source1, source2, ir::CodeGen::ImplementationGen); - ir::Stmt compute = lower(stmt, "compute", false, true); - codegen->compile(compute, true); - - ofstream source_file; - string file_ending = should_use_CUDA_codegen() ? 
".cu" : ".c"; - source_file.open(file_path+filename+file_ending); - source_file << source1.str(); - source_file.close(); - - ofstream additional_source_file; - string additional_file_ending = ".ispc"; - additional_source_file.open(file_path+additional_filename+additional_file_ending); - additional_source_file << source2.str(); - additional_source_file.close(); - -} - IndexStmt scheduleSpMVCPU(IndexStmt stmt, int CHUNK_SIZE=16) { IndexVar i0("i0"), i1("i1"), kpos("kpos"), kpos0("kpos0"), kpos1("kpos1"); return stmt.split(i, i0, i1, CHUNK_SIZE) @@ -909,7 +829,7 @@ TEST(scheduling_eval, spmmISPC) { expected.compute(); ASSERT_TENSOR_EQ(expected, C); - float ERROR_MARGIN = 0.01; + // float ERROR_MARGIN = 0.01; // ASSERT_TENSOR_VAL(expected, y); for (int i = 0; i < NUM_I; i++) { for (int k = 0; k < NUM_K; k++) { @@ -1172,6 +1092,67 @@ TEST(scheduling_eval, sddmmCPU) { ASSERT_TENSOR_EQ(expected, A); } +TEST(scheduling_eval, sddmmSPMMFusedCPU) { + if (should_use_CUDA_codegen() || should_use_ISPC_codegen()) { + return; + } + + int NUM_I = 1021/10; + int NUM_J = 1039/10; + int NUM_K = 1057/10; + float SPARSITY = .3; + Tensor A("A", {NUM_I, NUM_K}, {Dense, Dense}); + Tensor B("B", {NUM_I, NUM_K}, CSR); + Tensor C("C", {NUM_I, NUM_J}, {Dense, Dense}); + Tensor D("D", {NUM_J, NUM_K}, {Dense, Dense}); + + srand(268238); + for (int i = 0; i < NUM_I; i++) { + for (int j = 0; j < NUM_J; j++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + C.insert({i, j}, (double) ((int) (rand_float*3/SPARSITY))); + } + } + + for (int i = 0; i < NUM_I; i++) { + for (int k = 0; k < NUM_K; k++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + if (rand_float < SPARSITY) { + B.insert({i, k}, (double) ((int) (rand_float*3/SPARSITY))); + } + } + } + + for (int j = 0; j < NUM_J; j++) { + for (int k = 0; k < NUM_K; k++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + D.insert({j, k}, (double) ((int) (rand_float*3/SPARSITY))); + } + } + + B.pack(); + C.pack(); + D.pack(); + + 
A(i,k) = B(i,k) * C(i,j) * D(j,k); + + IndexStmt stmt = A.getAssignment().concretize(); + stmt = scheduleSDDMMCPU(stmt, B); + + printToFile("sddmm_cpu_ryan2", stmt); + + A.compile(stmt); + A.assemble(); + A.compute(); + + Tensor expected("expected", {NUM_I, NUM_K}, {Dense, Dense}); + expected(i,k) = B(i,k) * C(i,j) * D(j,k); + expected.compile(); + expected.assemble(); + expected.compute(); + ASSERT_TENSOR_EQ(expected, A); +} + TEST(scheduling_eval, sddmmcsrCPU) { if (should_use_CUDA_codegen()) { @@ -1246,8 +1227,8 @@ TEST(scheduling_eval, sddmm2CPU) { int NUM_J = 1021/10; int NUM_K = 18; float SPARSITY = .3; - Tensor Y("Y", {NUM_I, NUM_J}, CSR); - Tensor A("A", {NUM_I, NUM_J}, CSR); + Tensor Y("Y", {NUM_I, NUM_J}, {Dense, Compressed(ModeFormat::UNIQUE)}); + Tensor A("A", {NUM_I, NUM_J}, {Dense, Compressed(ModeFormat::UNIQUE)}); Tensor X("X", {NUM_I, NUM_K}, {Dense, Dense}); srand(268238); @@ -1271,23 +1252,23 @@ TEST(scheduling_eval, sddmm2CPU) { A.pack(); X.pack(); - Y(i,j) = A(i,j) * X(i,k) * X(j,k); + Y(i,j) = A(i,j) * X(i,k) * X(k,j); - IndexStmt stmt = A.getAssignment().concretize(); - // stmt = scheduleSDDMMCPU(stmt, B); + // IndexStmt stmt = A.getAssignment().concretize(); + // // stmt = scheduleSDDMMCPU(stmt, A); - //printToFile("sddmm_cpu", stmt); + // printToFile("sddmm2_cpu", stmt); - A.compile(stmt); - A.assemble(); - A.compute(); + // A.compile(stmt); + // A.assemble(); + // A.compute(); - Tensor expected("expected", {NUM_I, NUM_J}, {Dense, Dense}); - expected(i,j) = A(i,j) * X(i,k) * X(j,k); - expected.compile(); - expected.assemble(); - expected.compute(); - ASSERT_TENSOR_EQ(expected, A); + // Tensor expected("expected", {NUM_I, NUM_J}, {Dense, Dense}); + // expected(i,j) = A(i,j) * X(i,k) * X(j,k); + // expected.compile(); + // expected.assemble(); + // expected.compute(); + // ASSERT_TENSOR_EQ(expected, A); } @@ -1365,7 +1346,7 @@ TEST(scheduling_eval, sddmmISPC) { ASSERT_TENSOR_EQ(expected, A); - float ERROR_MARGIN = 0.01; + // float ERROR_MARGIN 
= 0.01; // ASSERT_TENSOR_VAL(expected, y); for (int i = 0; i < NUM_I; i++) { for (int k = 0; k < NUM_K; k++) { @@ -1447,7 +1428,7 @@ TEST(scheduling_eval, sddmm2ISPC) { ASSERT_TENSOR_EQ(expected, A); - float ERROR_MARGIN = 0.01; + // float ERROR_MARGIN = 0.01; // ASSERT_TENSOR_VAL(expected, y); for (int i = 0; i < NUM_I; i++) { for (int j = 0; j < NUM_J; j++) { @@ -1585,7 +1566,7 @@ TEST(scheduling_eval, spmvISPC) { ASSERT_TENSOR_EQ(expected, y); - float ERROR_MARGIN = 0.01; + // float ERROR_MARGIN = 0.01; // ASSERT_TENSOR_VAL(expected, y); for (int j = 0; j < NUM_J; j++) { if (expected(j) <= y(j) + ERROR_MARGIN && expected(j) >= y(j) - ERROR_MARGIN) { @@ -2015,6 +1996,64 @@ TEST(scheduling_eval, mttkrpCPU) { ASSERT_TENSOR_EQ(expected, A); } +TEST(scheduling_eval, temp) { + if (should_use_CUDA_codegen() || should_use_ISPC_codegen()) { + return; + } + std::default_random_engine gen(0); + std::uniform_real_distribution unif(0.0, 1.0); + // Predeclare the storage formats that the inputs and output will be stored as. + // To define a format, you must specify whether each dimension is dense or sparse + // and (optionally) the order in which dimensions should be stored. The formats + // declared below correspond to doubly compressed sparse row (dcsr), row-major + // dense (rm), and column-major dense (dm). + Format dcsr({Sparse,Sparse}); + Format rm({Dense,Dense}); + Format cm({Dense,Dense}, {1,0}); + + // Load a sparse matrix from file (stored in the Matrix Market format) and + // store it as a doubly compressed sparse row matrix. Matrices correspond to + // order-2 tensors in taco. The matrix in this example can be download from: + // https://www.cise.ufl.edu/research/sparse/MM/Williams/webbase-1M.tar.gz + Tensor B = read("/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx", dcsr); + // Generate a random dense matrix and store it in row-major (dense) format. 
+ Tensor C({B.getDimension(0), 1000}, rm); + for (int i = 0; i < C.getDimension(0); ++i) { + for (int j = 0; j < C.getDimension(1); ++j) { + C.insert({i,j}, unif(gen)); + } + } + C.pack(); + + // Generate another random dense matrix and store it in column-major format. + Tensor D({1000, B.getDimension(1)}, cm); + for (int i = 0; i < D.getDimension(0); ++i) { + for (int j = 0; j < D.getDimension(1); ++j) { + D.insert({i,j}, unif(gen)); + } + } + D.pack(); + + // Declare the output matrix to be a sparse matrix with the same dimensions as + // input matrix B, to be also stored as a doubly compressed sparse row matrix. + Tensor A(B.getDimensions(), dcsr); + + // Define the SDDMM computation using index notation. + IndexVar i, j, k; + A(i,j) = B(i,j) * C(i,k) * D(k,j); + + // At this point, we have defined how entries in the output matrix should be + // computed from entries in the input matrices but have not actually performed + // the computation yet. To do so, we must first tell taco to generate code that + // can be executed to compute the SDDMM operation. + A.compile(); + // We can now call the functions taco generated to assemble the indices of the + // output matrix and then actually compute the SDDMM. + A.assemble(); + A.compute(); + // Write the output of the computation to file (stored in the Matrix Market format). 
+ write("A.mtx", A); +} TEST(scheduling_eval, mttkrpISPC) { if (should_use_CUDA_codegen()) { diff --git a/test/tests-scheduling-fuse.cpp b/test/tests-scheduling-fuse.cpp new file mode 100644 index 000000000..bd77f1d64 --- /dev/null +++ b/test/tests-scheduling-fuse.cpp @@ -0,0 +1,2872 @@ +#include "taco/cuda.h" +#include "taco/tensor.h" +#include "test.h" +#include "util.h" +#include +#include "gtest/gtest.h" +#include +#include + +// #define NUM_THREADS_TO_USE 64 +#define NUM_THREADS_TO_USE 32 + +void handle_error (int retval) +{ + printf("PAPI error %d: %s\n", retval, PAPI_strerror(retval)); + exit(1); +} + +TEST(scheduling_eval, spmvFusedWithSyntheticData) { + if (should_use_CUDA_codegen() || should_use_ISPC_codegen()) { + return; + } + taco_set_num_threads(NUM_THREADS_TO_USE); + + std::default_random_engine gen(0); + std::uniform_real_distribution unif(0.0, 1.0); + + Format csr({dense, sparse}); + Format rm({dense}); + + // uncomment this for reading the csr matrix saved in mtx file + std::cout << "reading B mat mtx\n"; + + int NUM_I = 5; // 1021/10; + int NUM_J = 5; // 1039/10; + int NUM_K = 8; + float SPARSITY = .3; + Tensor B("B", {NUM_I, NUM_J}, csr); + srand(75883); + for (int i = 0; i < NUM_I; i++) { + for (int j = 0; j < NUM_J; j++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + if (rand_float < SPARSITY) { + B.insert({i, j}, (double) ((int) (rand_float*3/SPARSITY))); + } + } + } + B.pack(); + + + std::cout << "B dim0: " << B.getDimension(0) << ", dim1: " << B.getDimension(1) << std::endl; + std::cout << "adding c mat\n"; + Tensor C("C", {NUM_J, NUM_K}, csr); + for (int i = 0; i < C.getDimension(0); ++i) { + for (int j = 0; j < C.getDimension(1); ++j) { + C.insert({i,j}, unif(gen)); + } + } + std::cout << "packing C mat\n"; + C.pack(); + + Tensor v("v", {NUM_K}, rm); + for (int i = 0; i < v.getDimension(0); ++i) { + v.insert({i}, unif(gen)); + } + std::cout << "packing D mat\n"; + v.pack(); + + Tensor A("A", {NUM_I}, rm); + Tensor ref("ref", 
{NUM_I}, rm); + IndexVar i, j, k, l, m; + A(i) = B(i,j) * C(j,k) * v(k); + + // IndexStmt stmt = A.getAssignment().concretize(); + IndexStmt stmt = makeReductionNotation(A.getAssignment()); + stmt = makeConcreteNotation(stmt); + printToFile("SpMVfused", stmt); + stmt = reorderLoopsTopologically(stmt); + stmt = loopFusionOverFission(stmt, A.getAssignment(), "f", 1); + stmt = insertTemporaries(stmt); + stmt = parallelizeOuterLoop(stmt); + + A.compile(stmt); + // We can now call the functions taco generated to assemble the indices of the + // output matrix and then actually compute the MTTKRP. + A.assemble(); + + + // ref(i) = B(i,j) * C(j,k) * v(k); + // IndexStmt refStmt = makeReductionNotation(ref.getAssignment()); + // refStmt = makeConcreteNotation(refStmt); + // refStmt = insertTemporaries(refStmt); + // refStmt = parallelizeOuterLoop(refStmt); + // ref.compile(refStmt); + // ref.assemble(); + + // Tensor ref1({NUM_J}, rm); + // Tensor ref2({NUM_I}, rm); + // ref1(j) = C(j,k) * v(k); + // ref2(i) = B(i,j) * ref1(j); + + // IndexStmt ref1Stmt = makeReductionNotation(ref1.getAssignment()); + // ref1Stmt = makeConcreteNotation(ref1Stmt); + // ref1Stmt = insertTemporaries(ref1Stmt); + // ref1Stmt = parallelizeOuterLoop(ref1Stmt); + // ref1.compile(ref1Stmt); + // ref1.assemble(); + + // IndexStmt ref2Stmt = makeReductionNotation(ref2.getAssignment()); + // ref2Stmt = makeConcreteNotation(ref2Stmt); + // ref2Stmt = insertTemporaries(ref2Stmt); + // ref2Stmt = parallelizeOuterLoop(ref2Stmt); + // ref2.compile(ref2Stmt); + // ref2.assemble(); + + std::cout << "compute start\n"; + taco::util::TimeResults timevalue; + bool time = true; + // TOOL_BENCHMARK_TIMER(ref.compute(), "\n\nReference Kernel: ", timevalue); + TOOL_BENCHMARK_TIMER(A.compute(), "\n\nFused Kernel: ", timevalue); + // ASSERT_TENSOR_EQ(ref, A); + + // // check results + // for (int q = 0; q < A.getDimension(0); ++q) { + // if ( abs(A(q) - ref(q))/abs(ref(q)) > ERROR_MARGIN) { + // std::cout << "error: 
results don't match A("<< q << "): " + // << A(q) << ", ref: " << ref(q) << std::endl; + // ASSERT_TRUE(false); + // } + // } + // // ASSERT_TENSOR_EQ(A, ref); + // TOOL_BENCHMARK_TIMER(ref1.compute(), "\n\nSDDMM Kernel: ", timevalue); + // TOOL_BENCHMARK_TIMER(ref2.compute(), "\n\nSpMM Kernel: ", timevalue); + // ASSERT_TENSOR_EQ(ref, ref2); + + // for (int q = 0; q < ref2.getDimension(0); ++q) { + // for (int w = 0; w < ref2.getDimension(1); ++w) { + // if ( abs(ref2(q,w) - ref(q,w))/abs(ref(q,w)) > ERROR_MARGIN) { + // std::cout << "error: results don't match A("<< q << "," << w << "): " + // << ref2(q,w) << ", ref: " << ref(q,w) << std::endl; + // ASSERT_TRUE(false); + // } + // } + // } + +} + +TEST(scheduling_eval, spmvFused) { + if (should_use_CUDA_codegen() || should_use_ISPC_codegen()) { + return; + } + + ofstream statfile; + statfile.open( + "/home/min/a/kadhitha/workspace/my_taco/taco/test/stats/spmv-spmv.txt", std::ios::app); + if (statfile.is_open()) { + statfile << "\nspmv-spmv execution\n"; + statfile << "\n-----------------------------------------\n"; + } + taco_set_num_threads(NUM_THREADS_TO_USE); + + std::default_random_engine gen(0); + std::uniform_real_distribution unif(0.0, 1.0); + + Format csr({dense, sparse}); + Format rm({dense}); + + + + int filenum = 1; + + std::vector matfiles = { + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/synthetic/synthetic.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rma10/rma10.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/cant/cant.mtx", // 5 + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/consph/consph.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx", + 
"/home/min/a/kadhitha/ispc-examples/data/suitesparse/shipsec1/shipsec1.mtx", // 8 + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/scircuit/scircuit.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx", // 10 + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/wtk/pwtk.mtx", + "/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx", // 12 + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/wiki-Talk/wiki-Talk.mtx", // 13 + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/com-Orkut/com-Orkut.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx", // 15 + "/home/min/a/kadhitha/workspace/my_taco/FusedMM/dataset/harvard.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/twitter7/twitter7.mtx" + }; + std::vector matfilesrw = { + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/synthetic.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/cage3.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/bcsstk17.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/pdb1HYS.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/rma10.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/cant.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/consph.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/cop20k_A.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/shipsec1.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/scircuit.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/wtk/pwtk.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/webbase-1M.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/wiki-Talk.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/com-Orkut.mtx", + 
"/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/circuit5M.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/harvard.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/twitter7.mtx" + }; + + // uncomment this for reading the csr matrix saved in mtx file + std::cout << "reading B mat mtx\n"; + + + int kDim = 8; + float SPARSITY = .3; + std::string matfile = matfiles[filenum]; + std::cout << "reading B mat mtx\n"; + Tensor B = read(matfile, csr, true); + B.setName("B"); + B.pack(); + + std::cout << "B dim0: " << B.getDimension(0) << ", dim1: " << B.getDimension(1) << std::endl; + std::cout << "adding c mat\n"; + + std::cout << "reading B mat mtx\n"; + Tensor C = read(matfile, csr, true); + C.setName("C"); + C.pack(); + + + Tensor v("v", {C.getDimension(1)}, rm); + for (int i = 0; i < v.getDimension(0); ++i) { + v.insert({i}, unif(gen)); + } + std::cout << "packing D mat\n"; + v.pack(); + + if (statfile.is_open()) { + statfile + << "A(i) = B(i,j) * C(j,k) * v(k);" << std::endl + << "B1_dimension: " << B.getDimension(0) << ", B2_dimension: " << B.getDimension(1) << ", vals: " << B.getStorage().getValues().getSize() << std::endl + << "C1_dimension: " << C.getDimension(0) << ", C2_dimension: " << C.getDimension(1) << ", vals: " << C.getStorage().getValues().getSize() << std::endl + << "D1_dimension: " << v.getDimension(0) << ", vals: " << v.getStorage().getValues().getSize() << std::endl + << std::endl; + } + + Tensor A("A", {B.getDimension(0)}, rm); + Tensor ref("ref", {B.getDimension(0)}, rm); + IndexVar i, j, k, l, m; + A(i) = B(i,j) * C(j,k) * v(k); + + ref(i) = B(i,j) * C(j,k) * v(k); + IndexStmt refStmt = makeReductionNotation(ref.getAssignment()); + refStmt = makeConcreteNotation(refStmt); + refStmt = insertTemporaries(refStmt); + refStmt = parallelizeOuterLoop(refStmt); + ref.compile(refStmt); + ref.assemble(); + + // IndexStmt stmt = A.getAssignment().concretize(); + IndexStmt stmt = makeReductionNotation(A.getAssignment()); + 
stmt = makeConcreteNotation(stmt); + printToFile("SpMVfused", stmt); + stmt = reorderLoopsTopologically(stmt); + stmt = loopFusionOverFission(stmt, A.getAssignment(), "f", 1); + stmt = insertTemporaries(stmt); + stmt = parallelizeOuterLoop(stmt); + A.compile(stmt); + A.assemble(); + + + // Tensor ref1({NUM_J}, rm); + // Tensor ref2({NUM_I}, rm); + // ref1(j) = C(j,k) * v(k); + // ref2(i) = B(i,j) * ref1(j); + + // IndexStmt ref1Stmt = makeReductionNotation(ref1.getAssignment()); + // ref1Stmt = makeConcreteNotation(ref1Stmt); + // ref1Stmt = insertTemporaries(ref1Stmt); + // ref1Stmt = parallelizeOuterLoop(ref1Stmt); + // ref1.compile(ref1Stmt); + // ref1.assemble(); + + // IndexStmt ref2Stmt = makeReductionNotation(ref2.getAssignment()); + // ref2Stmt = makeConcreteNotation(ref2Stmt); + // ref2Stmt = insertTemporaries(ref2Stmt); + // ref2Stmt = parallelizeOuterLoop(ref2Stmt); + // ref2.compile(ref2Stmt); + // ref2.assemble(); + + std::cout << "compute start\n"; + taco::util::TimeResults timevalue; + bool time = true; + std::string sofused = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/spmv_spmv/spmv_fused.so"; + + TOOL_BENCHMARK_TIMER(ref.compute(statfile, sofused), "\n\nReference Kernel: ", timevalue); + + + std::cout << "b1 dim: " << B.getTacoTensorT()->dimensions[1] << std::endl; + // TOOL_BENCHMARK_TIMER(ref.compute(statfile, sofused), "\n\nFused Kernel: ", timevalue); + // ASSERT_TENSOR_EQ(ref, A); + + // // check results + // for (int q = 0; q < A.getDimension(0); ++q) { + // if ( abs(A(q) - ref(q))/abs(ref(q)) > ERROR_MARGIN) { + // std::cout << "error: results don't match A("<< q << "): " + // << A(q) << ", ref: " << ref(q) << std::endl; + // ASSERT_TRUE(false); + // } + // } + // // ASSERT_TENSOR_EQ(A, ref); + // TOOL_BENCHMARK_TIMER(ref1.compute(), "\n\nSDDMM Kernel: ", timevalue); + // TOOL_BENCHMARK_TIMER(ref2.compute(), "\n\nSpMM Kernel: ", timevalue); + // ASSERT_TENSOR_EQ(ref, ref2); + + // for (int q = 0; q < ref2.getDimension(0); ++q) 
{ + // for (int w = 0; w < ref2.getDimension(1); ++w) { + // if ( abs(ref2(q,w) - ref(q,w))/abs(ref(q,w)) > ERROR_MARGIN) { + // std::cout << "error: results don't match A("<< q << "," << w << "): " + // << ref2(q,w) << ", ref: " << ref(q,w) << std::endl; + // ASSERT_TRUE(false); + // } + // } + // } + + if (statfile.is_open()) { + statfile.close(); + } + +} + +TEST(scheduling_eval, sddmmFusedWithSyntheticData) { + if (should_use_CUDA_codegen() || should_use_ISPC_codegen()) { + return; + } + + taco_set_num_threads(NUM_THREADS_TO_USE); + + std::default_random_engine gen(0); + std::uniform_real_distribution unif(0.0, 1.0); + + Format csr({dense, sparse}); + Format rm({dense, dense}); + int ldim = 4; + int kdim = 8; + + // uncomment this for reading the csr matrix saved in mtx file + std::cout << "reading B mat mtx\n"; + + int NUM_I = 1021/10; + int NUM_J = 1039/10; + float SPARSITY = .3; + Tensor B("B", {NUM_I, NUM_J}, csr); + srand(75883); + for (int i = 0; i < NUM_I; i++) { + for (int j = 0; j < NUM_J; j++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + if (rand_float < SPARSITY) { + B.insert({i, j}, (double) ((int) (rand_float*3/SPARSITY))); + } + } + } + B.pack(); + write("/home/min/a/kadhitha/ispc-examples/data/suitesparse/synthetic/synthetic.mtx", B); + + std::cout << "B dim0: " << B.getDimension(0) << ", dim1: " << B.getDimension(1) << std::endl; + std::cout << "adding c mat\n"; + Tensor C({B.getDimension(0), kdim}, rm); + for (int i = 0; i < C.getDimension(0); ++i) { + for (int j = 0; j < C.getDimension(1); ++j) { + C.insert({i,j}, unif(gen)); + } + } + std::cout << "packing C mat\n"; + C.pack(); + + Tensor D({B.getDimension(1), kdim}, rm); + for (int i = 0; i < D.getDimension(0); ++i) { + for (int j = 0; j < D.getDimension(1); ++j) { + D.insert({i,j}, unif(gen)); + } + } + std::cout << "packing D mat\n"; + D.pack(); + + Tensor F({B.getDimension(1), ldim}, rm); + for (int i = 0; i < F.getDimension(0); ++i) { + for (int j = 0; j < F.getDimension(1); 
++j) { + F.insert({i,j}, unif(gen)); + } + } + std::cout << "packing F mat\n"; + F.pack(); + + Tensor A({B.getDimension(0), ldim}, rm); + Tensor ref({B.getDimension(0), ldim}, rm); + IndexVar i, j, k, l; + A(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); + + // IndexStmt stmt = A.getAssignment().concretize(); + IndexStmt stmt = makeReductionNotation(A.getAssignment()); + stmt = makeConcreteNotation(stmt); + printToFile("fusedMMConcrete", stmt); + + stmt = reorderLoopsTopologically(stmt); + printToFile("fusedMMOrdered", stmt); + + stmt = loopFusionOverFission(stmt, A.getAssignment(), "b", 1); + printToFile("fusedMMFused", stmt); + + stmt = insertTemporaries(stmt); + printToFile("fusedMMWithTemps", stmt); + stmt = parallelizeOuterLoop(stmt); + printToFile("fusedMMFusedPar", stmt); + + A.compile(stmt); + // We can now call the functions taco generated to assemble the indices of the + // output matrix and then actually compute the MTTKRP. + A.assemble(); + + + ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); + IndexStmt refStmt = makeReductionNotation(ref.getAssignment()); + refStmt = makeConcreteNotation(refStmt); + refStmt = insertTemporaries(refStmt); + refStmt = parallelizeOuterLoop(refStmt); + ref.compile(refStmt); + ref.assemble(); + + Tensor ref1({B.getDimension(0), B.getDimension(1)}, csr); + Tensor ref2({B.getDimension(0), ldim}, rm); + ref1(i,j)=B(i,j)*C(i,k)*D(j,k); + ref2(i,l)=ref1(i,j)*F(j,l); + + IndexStmt ref1Stmt = makeReductionNotation(ref1.getAssignment()); + ref1Stmt = makeConcreteNotation(ref1Stmt); + ref1Stmt = insertTemporaries(ref1Stmt); + ref1Stmt = parallelizeOuterLoop(ref1Stmt); + ref1.compile(ref1Stmt); + ref1.assemble(); + + IndexStmt ref2Stmt = makeReductionNotation(ref2.getAssignment()); + ref2Stmt = makeConcreteNotation(ref2Stmt); + ref2Stmt = insertTemporaries(ref2Stmt); + ref2Stmt = parallelizeOuterLoop(ref2Stmt); + ref2.compile(ref2Stmt); + ref2.assemble(); + + std::cout << "compute start\n"; + taco::util::TimeResults timevalue; + bool time = true; + 
TOOL_BENCHMARK_TIMER(ref.compute(), "\n\nReference Kernel: ", timevalue); + TOOL_BENCHMARK_TIMER(A.compute(), "\n\nFused Kernel: ", timevalue); + + // check results + for (int q = 0; q < A.getDimension(0); ++q) { + for (int w = 0; w < A.getDimension(1); ++w) { + if ( abs(A(q,w) - ref(q,w))/abs(ref(q,w)) > ERROR_MARGIN) { + std::cout << "error: results don't match A("<< q << "," << w << "): " + << A(q,w) << ", ref: " << ref(q,w) << std::endl; + ASSERT_TRUE(false); + } + } + } + // ASSERT_TENSOR_EQ(A, ref); + TOOL_BENCHMARK_TIMER(ref1.compute(), "\n\nSDDMM Kernel: ", timevalue); + TOOL_BENCHMARK_TIMER(ref2.compute(), "\n\nSpMM Kernel: ", timevalue); + + for (int q = 0; q < ref2.getDimension(0); ++q) { + for (int w = 0; w < ref2.getDimension(1); ++w) { + if ( abs(ref2(q,w) - ref(q,w))/abs(ref(q,w)) > ERROR_MARGIN) { + std::cout << "error: results don't match A("<< q << "," << w << "): " + << ref2(q,w) << ", ref: " << ref(q,w) << std::endl; + ASSERT_TRUE(false); + } + } + } + +} + + +IndexStmt scheduleSDDMMCPU_forfuse(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { + IndexVar i, j, k, l, m; + IndexVar i0("i0"), i1("i1"), kpos("kpos"), kpos0("kpos0"), kpos1("kpos1"); + return stmt.split(i, i0, i1, CHUNK_SIZE) + .pos(k, kpos, B(i,k)) + .split(kpos, kpos0, kpos1, UNROLL_FACTOR) + .reorder({i0, i1, kpos0, j, kpos1}) + .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces) + .parallelize(kpos1, ParallelUnit::CPUVector, OutputRaceStrategy::ParallelReduction); +} + +TEST(scheduling_eval, sddmmFused) { + if (should_use_CUDA_codegen() || should_use_ISPC_codegen()) { + return; + } + + taco_set_num_threads(NUM_THREADS_TO_USE); + + ofstream statfile; + statfile.open( + "/home/min/a/kadhitha/workspace/my_taco/taco/test/stats/sddmm-spmm.txt", std::ios::app); + if (statfile.is_open()) { + statfile << "\nsddmm-spmm execution\n"; + statfile << "\n-----------------------------------------\n"; + } + + std::default_random_engine gen(0); + 
std::uniform_real_distribution unif(0.0, 1.0); + + Format csr({dense, sparse}); + Format rm({dense, dense}); + int ldim = 128; + int kdim = 128; + + // vector filenums = {2,3,4,5,6,7,8,9,10,12,15}; + + vector filenums = {1}; + + for (auto filenum : filenums) { + + // int filenum = 5; + + std::vector matfiles = { + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/synthetic/synthetic.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rma10/rma10.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/cant/cant.mtx", // 5 + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/consph/consph.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/shipsec1/shipsec1.mtx", // 8 + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/scircuit/scircuit.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx", // 10 + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/wtk/pwtk.mtx", + "/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx", // 12 + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/wiki-Talk/wiki-Talk.mtx", // 13 + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/com-Orkut/com-Orkut.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx", // 15 + "/home/min/a/kadhitha/workspace/my_taco/FusedMM/dataset/harvard.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/twitter7/twitter7.mtx" + }; + std::vector matfilesrw = { + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/synthetic.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/cage3.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/bcsstk17.mtx", + 
"/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/pdb1HYS.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/rma10.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/cant.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/consph.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/cop20k_A.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/shipsec1.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/scircuit.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/wtk/pwtk.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/webbase-1M.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/wiki-Talk.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/com-Orkut.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/circuit5M.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/harvard.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/twitter7.mtx" + }; + + std::string matfile = matfiles[filenum]; + std::cout << "reading B mat mtx\n"; + Tensor B = read(matfile, csr, true); + B.setName("B"); + B.pack(); + // write(matfilesrw[filenum], B); + + if (statfile.is_open()) { + statfile << matfile << std::endl; + } + + std::cout << "B dim0: " << B.getDimension(0) << ", dim1: " << B.getDimension(1) << std::endl; + std::cout << "adding c mat\n"; + Tensor C({B.getDimension(0), kdim}, rm); + for (int i = 0; i < C.getDimension(0); ++i) { + for (int j = 0; j < C.getDimension(1); ++j) { + C.insert({i,j}, unif(gen)); + } + } + std::cout << "packing C mat\n"; + C.pack(); + + Tensor D({B.getDimension(1), kdim}, rm); + for (int i = 0; i < D.getDimension(0); ++i) { + for (int j = 0; j < D.getDimension(1); ++j) { + D.insert({i,j}, unif(gen)); + } + } + std::cout << "packing D mat\n"; + D.pack(); + + Tensor F({B.getDimension(1), ldim}, rm); + for 
(int i = 0; i < F.getDimension(0); ++i) { + for (int j = 0; j < F.getDimension(1); ++j) { + F.insert({i,j}, unif(gen)); + } + } + std::cout << "packing F mat\n"; + F.pack(); + + Tensor A({B.getDimension(0), ldim}, rm); + Tensor ref({B.getDimension(0), ldim}, rm); + IndexVar i, j, k, l, m; + IndexVar i0("i0"), i1("i1"), jpos("jpos"), jpos0("jpos0"), jpos1("jpos1"), k0("k0"), k1("k1"); + A(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); + if (statfile.is_open()) { + statfile + << "ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l);" << std::endl + << "B1_dimension: " << B.getDimension(0) << ", B2_dimension: " << B.getDimension(1) << ", vals: " << B.getStorage().getValues().getSize() << std::endl + << "C1_dimension: " << C.getDimension(0) << ", C2_dimension: " << C.getDimension(1) << ", vals: " << C.getStorage().getValues().getSize() << std::endl + << "D1_dimension: " << D.getDimension(0) << ", D2_dimension: " << D.getDimension(1) << ", vals: " << D.getStorage().getValues().getSize() << std::endl + << "E1_dimension: " << F.getDimension(0) << ", E2_dimension: " << F.getDimension(1) << ", vals: " << F.getStorage().getValues().getSize() << std::endl + << std::endl; + } + + // IndexStmt stmt = A.getAssignment().concretize(); + IndexStmt stmt = makeReductionNotation(A.getAssignment()); + stmt = makeConcreteNotation(stmt); + stmt = reorderLoopsTopologically(stmt); + stmt = loopFusionOverFission(stmt, A.getAssignment(), "b", 1); + stmt = stmt + .split(i, i0, i1, 16); + stmt = insertTemporaries(stmt); + stmt = parallelizeOuterLoop(stmt); + + A.compile(stmt); + A.assemble(); + + + ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); + IndexStmt refStmt = makeReductionNotation(ref.getAssignment()); + refStmt = makeConcreteNotation(refStmt); + refStmt = insertTemporaries(refStmt); + refStmt = refStmt + .split(i, i0, i1, 16) + .reorder({i0, i1, j, k, l}); + stmt = insertTemporaries(stmt); + refStmt = parallelizeOuterLoop(refStmt); + ref.compile(refStmt); + ref.assemble(); + + Tensor ref1({B.getDimension(0), 
B.getDimension(1)}, csr); + Tensor ref2({B.getDimension(0), ldim}, rm); + ref1(i,j)=B(i,j)*C(i,k)*D(j,k); + ref2(i,l)=ref1(i,j)*F(j,l); + + IndexStmt ref1Stmt = ref1.getAssignment().concretize(); // anyway Ryan's kernel is used here + + ref1Stmt = ref1Stmt.split(i, i0, i1, 16); + // .pos(j, jpos, B(i,j)); + // .split(k, k0, k1, 8); + // .reorder({i0, i1, jpos0, k, jpos1}); + // .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces) + // .parallelize(jpos1, ParallelUnit::CPUVector, OutputRaceStrategy::ParallelReduction); + // ref1Stmt.split(i, ); + // stmt = scheduleSDDMMCPU_forfuse(ref1Stmt, B); + // IndexStmt ref1Stmt = makeReductionNotation(ref1.getAssignment()); + // ref1Stmt = makeConcreteNotation(ref1Stmt); + ref1Stmt = insertTemporaries(ref1Stmt); + ref1Stmt = parallelizeOuterLoop(ref1Stmt); + ref1.compile(ref1Stmt); + ref1.assemble(); + + IndexStmt ref2Stmt = makeReductionNotation(ref2.getAssignment()); // Ryan's SpMM kernel is used here + ref2Stmt = makeConcreteNotation(ref2Stmt); + ref2Stmt = insertTemporaries(ref2Stmt); + ref2Stmt = parallelizeOuterLoop(ref2Stmt); + ref2.compile(ref2Stmt); + ref2.assemble(); + + std::cout << "compute start\n"; + taco::util::TimeResults timevalue; + bool time = true; + + std::string sofile_fused = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/fused_kernel.so"; + TOOL_BENCHMARK_TIMER(A.compute(statfile), "\n\nFused Kernel: ", timevalue); + if (statfile.is_open()) { + statfile << "fused time: "; + statfile << timevalue.mean << std::endl; + } else { std::cout << " stat file is not open\n"; } + + statfile << "\nseparate execution\n"; + + // // std::string sofile_sddmm = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/csr_dense_spmm.so"; + // std::string sofile_sddmm = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/csr_dense_dense_sddmm.so"; + // TOOL_BENCHMARK_TIMER(ref1.compute(statfile, sofile_sddmm), "\n\nSDDMM Kernel: ", timevalue); + // if 
(statfile.is_open()) { + // statfile << "sddmm time: "; + // statfile << timevalue.mean << std::endl; + // } else { std::cout << " stat file is not open\n"; } + + // std::string sofile_sddmm_ryan = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/sddmm_ryan.so"; + // TOOL_BENCHMARK_TIMER(ref1.compute(statfile, sofile_sddmm_ryan), "\n\nSDDMM Kernel: ", timevalue); + // if (statfile.is_open()) { + // statfile << "sddmm time: "; + // statfile << timevalue.mean << std::endl; + // } else { std::cout << " stat file is not open\n"; } + + // std::string sofile_spmm = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/csr_dense_spmm.so"; + // TOOL_BENCHMARK_TIMER(ref2.compute(statfile, sofile_spmm), "\n\nSpMM Kernel: ", timevalue); + // if (statfile.is_open()) { + // statfile << "spmm time: "; + // statfile << timevalue.mean << std::endl; + // } else { std::cout << " stat file is not open\n"; } + + // statfile << "\nreference execution \n"; + + // std::string sofile_original = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/taco_original.so"; + // TOOL_BENCHMARK_TIMER(ref.compute(statfile, sofile_original), "\n\nReference Kernel: ", timevalue); + // if (statfile.is_open()) { + // statfile << "taco reference time: "; + // statfile << timevalue << std::endl; + // } else { std::cout << " stat file is not open\n"; } + + // double* A_vals = (double*) (A.getTacoTensorT()->vals); + // double* ref_vals = (double*) (ref.getTacoTensorT()->vals); + // double* ref2_vals = (double*) (ref2.getTacoTensorT()->vals); + + // // int* A2_pos = (double*) (ref.getTacoTensorT()->vals); + + // // for (size_t q=0; q < B.getStorage().getValues().getSize(); q++) { + // // if ( abs(A_vals[q] - ref_vals[q])/abs(ref_vals[q]) > ERROR_MARGIN) { + // // std::cout << "error: results don't match i: " << q << ", avals: " << A_vals[q] << " " + // // << "refvals: " << ref_vals[q] << std::endl; + // // ASSERT_TRUE(false); + // // } + // // } + + // for 
(size_t q=0; q < A.getDimension(0)* A.getDimension(1); q++) { + // if ( abs(A_vals[q] - ref_vals[q])/abs(ref_vals[q]) > ERROR_MARGIN) { + // std::cout << "error: results don't match i: " << q << ", avals: " << A_vals[q] << " " + // << "refvals: " << ref_vals[q] << std::endl; + // ASSERT_TRUE(false); + // } + // } + // for (size_t q=0; q < A.getDimension(0)* A.getDimension(1); q++) { + // if ( abs(A_vals[q] - ref2_vals[q])/abs(ref2_vals[q]) > ERROR_MARGIN) { + // std::cout << "error: results don't match i: " << q << ", avals: " << A_vals[q] << " " + // << "refvals: " << ref2_vals[q] << std::endl; + // ASSERT_TRUE(false); + // } + // } + // for (int q= 0; q< A_vals + // for (int q = 0; q < A.getDimension(0); ++q) { + // for (int w = 0; w < A.getDimension(1); ++w) { + // if ( abs(A(q,w) - ref(q,w))/abs(ref(q,w)) > ERROR_MARGIN) { + // std::cout << "error: results don't match A("<< q << "," << w << "): " + // << A(q,w) << ", ref: " << ref(q,w) << std::endl; + // ASSERT_TRUE(false); + // } + // } + // } + // ASSERT_TENSOR_EQ(A, ref); + + } // end of for loop + + + if (statfile.is_open()) { + statfile.close(); + } +} + + + + +TEST(scheduling_eval, hadamardFused) { + if (should_use_CUDA_codegen() || should_use_ISPC_codegen()) { + return; + } + + ofstream statfile; + statfile.open( + "/home/min/a/kadhitha/workspace/my_taco/taco/test/stats/hadamard-gemm.txt", std::ios::app); + if (statfile.is_open()) { + statfile << "\nsddmm-spmm execution\n"; + statfile << "\n-----------------------------------------\n"; + } + + std::default_random_engine gen(0); + std::uniform_real_distribution unif(0.0, 1.0); + + Format csr({dense, sparse}); + Format rm({dense, dense}); + int kdim = 128; + int ldim = 128; + + vector filenums = {2,3,4,5,6,7,8,9,10,12,15}; + // vector filenums = {8,9,10,12}; + + for (auto filenum : filenums) { + + // int filenum = 15; + + std::vector matfiles = { + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/synthetic/synthetic.mtx", + 
"/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx", // 2 + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rma10/rma10.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/cant/cant.mtx", // 5 + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/consph/consph.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/shipsec1/shipsec1.mtx", // 8 + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/scircuit/scircuit.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx", // 10 + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/wtk/pwtk.mtx", + "/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx", // 12 + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/wiki-Talk/wiki-Talk.mtx", // 13 + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/com-Orkut/com-Orkut.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx", // 15 + "/home/min/a/kadhitha/workspace/my_taco/FusedMM/dataset/harvard.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/twitter7/twitter7.mtx" + }; + std::vector matfilesrw = { + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/synthetic.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/cage3.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/bcsstk17.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/pdb1HYS.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/rma10.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/cant.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/consph.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/cop20k_A.mtx", + 
"/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/shipsec1.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/scircuit.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/wtk/pwtk.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/webbase-1M.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/wiki-Talk.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/com-Orkut.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/circuit5M.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/harvard.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/twitter7.mtx" + }; + + std::string matfile = matfiles[filenum]; + std::cout << "reading B mat mtx\n"; + Tensor B = read(matfile, csr, true); + B.setName("B"); + B.pack(); + // write(matfilesrw[filenum], B); + + if (statfile.is_open()) { + statfile << matfile << std::endl; + } + + std::cout << "B dim0: " << B.getDimension(0) << ", dim1: " << B.getDimension(1) << std::endl; + std::cout << "adding c mat\n"; + Tensor C({B.getDimension(1), kdim}, rm); + for (int i = 0; i < C.getDimension(0); ++i) { + for (int j = 0; j < C.getDimension(1); ++j) { + C.insert({i,j}, unif(gen)); + } + } + std::cout << "packing C mat\n"; + C.pack(); + + Tensor D({B.getDimension(1), kdim}, rm); + for (int i = 0; i < D.getDimension(0); ++i) { + for (int j = 0; j < D.getDimension(1); ++j) { + D.insert({i,j}, unif(gen)); + } + } + std::cout << "packing D mat\n"; + D.pack(); + + Tensor F({kdim, ldim}, rm); + for (int i = 0; i < F.getDimension(0); ++i) { + for (int j = 0; j < F.getDimension(1); ++j) { + F.insert({i,j}, unif(gen)); + } + } + std::cout << "packing F mat\n"; + F.pack(); + + Tensor A({B.getDimension(0), ldim}, rm); + Tensor ref({B.getDimension(0), ldim}, rm); + IndexVar i, j, k, l, m; + IndexVar i0("i0"), i1("i1"), l0("l0"), l1("l1"), jpos("jpos"), jpos0("jpos0"), 
jpos1("jpos1"), k0("k0"), k1("k1"); + A(i,l)=B(i,j)*C(j,k)*D(j,k)*F(k,l); + if (statfile.is_open()) { + statfile + << "ref(i,l)=B(i,j)*C(j,k)*D(j,k)*F(k,l);" << std::endl + << "B1_dimension: " << B.getDimension(0) << ", B2_dimension: " << B.getDimension(1) << ", vals: " << B.getStorage().getValues().getSize() << std::endl + << "C1_dimension: " << C.getDimension(0) << ", C2_dimension: " << C.getDimension(1) << ", vals: " << C.getStorage().getValues().getSize() << std::endl + << "D1_dimension: " << D.getDimension(0) << ", D2_dimension: " << D.getDimension(1) << ", vals: " << D.getStorage().getValues().getSize() << std::endl + << "E1_dimension: " << F.getDimension(0) << ", E2_dimension: " << F.getDimension(1) << ", vals: " << F.getStorage().getValues().getSize() << std::endl + << std::endl; + } + + // IndexStmt stmt = A.getAssignment().concretize(); + IndexStmt stmt = makeReductionNotation(A.getAssignment()); + stmt = makeConcreteNotation(stmt); + stmt = reorderLoopsTopologically(stmt); + stmt = stmt.reorder({i, j, k, l}); + stmt = loopFusionOverFission(stmt, A.getAssignment(), "b", 1); + stmt = stmt + .split(i, i0, i1, 16); + stmt = insertTemporaries(stmt); + stmt = parallelizeOuterLoop(stmt); + printToFile("fusedMMFusedPar", stmt); + + A.compile(stmt); + A.assemble(); + + + ref(i,l)=B(i,j)*C(j,k)*D(j,k)*F(k,l); + IndexStmt refStmt = makeReductionNotation(ref.getAssignment()); + refStmt = makeConcreteNotation(refStmt); + refStmt = refStmt + .split(i, i0, i1, 16) + .reorder({i0, i1, j, k, l}); + refStmt = insertTemporaries(refStmt); + refStmt = parallelizeOuterLoop(refStmt); + ref.compile(refStmt); + ref.assemble(); + + Tensor ref1({B.getDimension(0), kdim}, rm); + Tensor ref2({B.getDimension(0), ldim}, rm); + ref1(i,k)=B(i,j)*C(j,k)*D(j,k); + ref2(i,l)=ref1(i,k)*F(k,l); + + // IndexStmt ref1Stmt = ref1.getAssignment().concretize(); + + // ref1Stmt = ref1Stmt.split(i, i0, i1, 16); + // // .pos(j, jpos, B(i,j)); + // // .split(k, k0, k1, 8); + // // .reorder({i0, i1,
jpos0, k, jpos1}); + // // .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces) + // // .parallelize(jpos1, ParallelUnit::CPUVector, OutputRaceStrategy::ParallelReduction); + // // ref1Stmt.split(i, ); + // // stmt = scheduleSDDMMCPU_forfuse(ref1Stmt, B); + IndexStmt ref1Stmt = makeReductionNotation(ref1.getAssignment()); + ref1Stmt = makeConcreteNotation(ref1Stmt); + ref1Stmt = ref1Stmt + .split(i, i0, i1, 16) + .reorder({i0, i1, j, k}); + // .pos(j, jpos, B(i,j)) + // .split(jpos, jpos0, jpos1, 32) + // .split(k, k0, k1, 32) + // .reorder({i0, i1, jpos0, k0, jpos1, k1}); + ref1Stmt = insertTemporaries(ref1Stmt); + ref1Stmt = parallelizeOuterLoop(ref1Stmt); + ref1.compile(ref1Stmt); + ref1.assemble(); + + IndexStmt ref2Stmt = makeReductionNotation(ref2.getAssignment()); + ref2Stmt = makeConcreteNotation(ref2Stmt); + ref2Stmt = ref2Stmt + .split(i, i0, i1, 32) + .split(k, k0, k1, 32) + .split(l, l0, l1, 32) + .reorder({i0, k0, l0, i1, k1, l1}); + ref2Stmt = insertTemporaries(ref2Stmt); + ref2Stmt = parallelizeOuterLoop(ref2Stmt); + ref2.compile(ref2Stmt); + ref2.assemble(); + + std::cout << "compute start\n"; + taco::util::TimeResults timevalue; + bool time = true; + + TOOL_BENCHMARK_TIMER(A.compute(statfile), "\n\nFused Kernel: ", timevalue); + if (statfile.is_open()) { + statfile << "fused time: "; + statfile << timevalue.mean << std::endl; + } else { std::cout << " stat file is not open\n"; } + + // // std::string sofile_sddmm = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/csr_dense_spmm.so"; + // std::string sofile_sddmm = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/csr_dense_dense_sddmm.so"; + TOOL_BENCHMARK_TIMER(ref1.compute(statfile), "\n\nHadamard Kernel: ", timevalue); + if (statfile.is_open()) { + statfile << "hadamard time: "; + statfile << timevalue.mean << std::endl; + } else { std::cout << " stat file is not open\n"; } + + // std::string sofile_sddmm_ryan = 
"/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/sddmm_ryan.so"; + // TOOL_BENCHMARK_TIMER(ref1.compute(statfile, sofile_sddmm_ryan), "\n\nSDDMM Kernel: ", timevalue); + // if (statfile.is_open()) { + // statfile << "sddmm time: "; + // statfile << timevalue.mean << std::endl; + // } else { std::cout << " stat file is not open\n"; } + + // std::string sofile_spmm = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/csr_dense_spmm.so"; + TOOL_BENCHMARK_TIMER(ref2.compute(statfile), "\n\nGeMM Kernel: ", timevalue); + if (statfile.is_open()) { + statfile << "gemm time: "; + statfile << timevalue.mean << std::endl; + } else { std::cout << " stat file is not open\n"; } + + // std::string sofile_original = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/taco_original.so"; + TOOL_BENCHMARK_TIMER(ref.compute(statfile), "\n\nReference Kernel: ", timevalue); + if (statfile.is_open()) { + statfile << "taco reference time: "; + statfile << timevalue << std::endl; + } else { std::cout << " stat file is not open\n"; } + + double* A_vals = (double*) (A.getTacoTensorT()->vals); + double* ref_vals = (double*) (ref.getTacoTensorT()->vals); + double* ref2_vals = (double*) (ref2.getTacoTensorT()->vals); + + // // int* A2_pos = (double*) (ref.getTacoTensorT()->vals); + + for (int q=0; q < A.getDimension(0)* A.getDimension(1); q++) { + if ( abs(A_vals[q] - ref_vals[q])/abs(ref_vals[q]) > ERROR_MARGIN) { + std::cout << "error: results don't match i: " << q << ", avals: " << A_vals[q] << " " + << "refvals: " << ref_vals[q] << std::endl; + ASSERT_TRUE(false); + } + } + + for (int q=0; q < A.getDimension(0)* A.getDimension(1); q++) { + if ( abs(A_vals[q] - ref2_vals[q])/abs(ref2_vals[q]) > ERROR_MARGIN) { + std::cout << "error: results don't match i: " << q << ", avals: " << A_vals[q] << " " + << "refvals: " << ref2_vals[q] << std::endl; + ASSERT_TRUE(false); + } + } + + } // end of for loop + + if (statfile.is_open()) { + 
statfile.close(); + } + +} + + + + + + +TEST(scheduling_eval, mttkrpFusedWithSyntheticData) { + if (should_use_CUDA_codegen() || should_use_ISPC_codegen()) { + return; + } + taco_set_num_threads(NUM_THREADS_TO_USE); + + std::default_random_engine gen(0); + std::uniform_real_distribution unif(0.0, 1.0); + // Predeclare the storage formats that the inputs and output will be stored as. + // To define a format, you must specify whether each dimension is dense or + // sparse and (optionally) the order in which dimensions should be stored. The + // formats declared below correspond to compressed sparse fiber (csf) and + // row-major dense (rm). + Format csf({Sparse,Sparse,Sparse}); + Format rm({Dense,Dense}); + Format sd({Dense,Dense}); + + int NUM_I = 1021/20; + int NUM_J = 1039/20; + int NUM_K = 1057/20; + int NUM_L = 1232/20; + int NUM_M = 1231/20; + float SPARSITY = .1; + Tensor A("A", {NUM_I, NUM_M}, sd); + Tensor B("B", {NUM_I, NUM_K, NUM_L}, csf); + Tensor C("C", {NUM_K, NUM_J}, rm); + Tensor D("D", {NUM_L, NUM_J}, rm); + Tensor E("E", {NUM_J, NUM_M}, rm); + Tensor ref({NUM_I, NUM_M}, sd); + + srand(549694); + for (int i = 0; i < NUM_I; i++) { + for (int k = 0; k < NUM_K; k++) { + for (int l = 0; l < NUM_L; l++) { + float rand_float = (float) rand() / (float) (RAND_MAX); + if (rand_float < SPARSITY) { + B.insert({i, k, l}, (double) ((int) (rand_float * 3 / SPARSITY))); + } + } + } + } + B.pack(); + write("/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/synthetic.tns", B); + + // Generate a random dense matrix and store it in row-major (dense) format. + // Matrices correspond to order-2 tensors in taco. 
+ for (int k = 0; k < NUM_K; k++) { + for (int j = 0; j < NUM_J; j++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + C.insert({k, j}, (double) ((int) (rand_float*3))); + } + } + C.pack(); + + for (int l = 0; l < NUM_L; l++) { + for (int j = 0; j < NUM_J; j++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + D.insert({l, j}, (double) ((int) (rand_float*3))); + } + } + D.pack(); + + for (int i = 0; i < E.getDimension(0); ++i) { + for (int j = 0; j < E.getDimension(1); ++j) { + E.insert({i,j}, unif(gen)); + } + } + E.pack(); + + // Define the fused computation using index notation: the MTTKRP B(i,k,l) * D(l,j) * C(k,j) multiplied by the dense matrix E(j,m). + IndexVar i, k, l, j, m; + A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m); + + + IndexStmt stmt = makeReductionNotation(A.getAssignment()); + stmt = makeConcreteNotation(stmt); + printToFile("fusedMTTKRPConcrete", stmt); + + stmt = reorderLoopsTopologically(stmt); + printToFile("fusedMTTKRPOrdered", stmt); + + stmt = loopFusionOverFission(stmt, A.getAssignment(), "b", 1); + printToFile("fusedMTTKRPFused", stmt); + + stmt = insertTemporaries(stmt); + printToFile("fusedMTTKRPWithTemps", stmt); + stmt = parallelizeOuterLoop(stmt); + printToFile("fusedMTTKRPFusedPar", stmt); + + + // At this point, we have defined how entries in the output matrix should be + // computed from entries in the input tensor and matrices but have not actually + // performed the computation yet. To do so, we must first tell taco to generate + // code that can be executed to compute the fused MTTKRP operation. + A.compile(stmt); + // We can now call the functions taco generated to assemble the indices of the + // output matrix and then actually compute the fused MTTKRP.
+ A.assemble(); + + + ref(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m); + IndexStmt refStmt = makeReductionNotation(ref.getAssignment()); + refStmt = makeConcreteNotation(refStmt); + refStmt = insertTemporaries(refStmt); + refStmt = parallelizeOuterLoop(refStmt); + ref.compile(refStmt); + ref.assemble(); + + // Tensor ref2({NUM_I, NUM_J}, sd); + // ref2(i,j) = B(i,k,l) * D(l,j) * C(k,j); + // IndexStmt ref2Stmt = makeReductionNotation(ref2.getAssignment()); + // ref2Stmt = makeConcreteNotation(ref2Stmt); + // ref2Stmt = insertTemporaries(ref2Stmt); + // ref2Stmt = parallelizeOuterLoop(ref2Stmt); + // ref2.compile(ref2Stmt); + // ref2.assemble(); + + // Tensor ref3({NUM_I, NUM_M}, sd); + // ref3(i,m) = ref2(i,j) * E(j,m); + // IndexStmt ref3Stmt = makeReductionNotation(ref3.getAssignment()); + // ref3Stmt = makeConcreteNotation(ref3Stmt); + // ref3Stmt = insertTemporaries(ref3Stmt); + // ref3Stmt = parallelizeOuterLoop(ref3Stmt); + // ref3.compile(ref3Stmt); + // ref3.assemble(); + + std::cout << "compute start\n"; + taco::util::TimeResults timevalue; + bool time = true; + // TOOL_BENCHMARK_TIMER(ref.compute(), "\n\nReference ISPC: ", timevalue); + TOOL_BENCHMARK_TIMER(A.compute(), "\n\nFused MTTKRP+SPMM: ", timevalue); + TOOL_BENCHMARK_TIMER(ref.compute(), "\n\nReference MTTKRP+SPMM: ", timevalue); + // TOOL_BENCHMARK_TIMER(ref2.compute(), "\n\nReference MTTKRP: ", timevalue); + // TOOL_BENCHMARK_TIMER(ref3.compute(), "\n\nReference SPMM: ", timevalue); + ASSERT_TENSOR_EQ(ref, A); + // ASSERT_TENSOR_EQ(ref, ref3); + +} + + +TEST(scheduling_eval, mttkrpFused) { + if (should_use_CUDA_codegen() || should_use_ISPC_codegen()) { + return; + } + + taco_set_num_threads(NUM_THREADS_TO_USE); + + ofstream statfile; + statfile.open( + "/home/min/a/kadhitha/workspace/my_taco/taco/test/stats/mttkrp-spmm.txt", std::ios::app); + if (statfile.is_open()) { + statfile << "\nmttkrp-spmm execution\n"; + statfile << "\n-----------------------------------------\n"; + } + + 
std::default_random_engine gen(0); + std::uniform_real_distribution unif(0.0, 1.0); + // Predeclare the storage formats that the inputs and output will be stored as. + // To define a format, you must specify whether each dimension is dense or + // sparse and (optionally) the order in which dimensions should be stored. The + // formats declared below correspond to compressed sparse fiber (csf) and + // row-major dense (rm). + Format csf({Dense,Sparse,Sparse}); + Format rm({Dense,Dense}); + Format sd({Dense,Dense}); + int jDim = 32; + int mDim = 64; + + int matfilenum = 3; + + // Load a sparse order-3 tensor from file (stored in the FROSTT format) and + // store it as a compressed sparse fiber tensor. The tensor in this example + // can be downloaded from: http://frostt.io/tensors/nell-2/ + std::vector matfiles = { + "/home/min/a/kadhitha/ispc-examples/data/tns/matmul_5-5-5.tns", + "/home/min/a/kadhitha/ispc-examples/data/tns/delicious-3d.tns", + "/home/min/a/kadhitha/ispc-examples/data/tns/flickr-3d.tns", // 2 + "/home/min/a/kadhitha/ispc-examples/data/tns/nell-2.tns", // 3 + "/home/min/a/kadhitha/ispc-examples/data/tns/nell-1.tns", // 4 + "/home/min/a/kadhitha/ispc-examples/data/tns/vast-2015-mc1-3d.tns", // 5 + "/home/min/a/kadhitha/ispc-examples/data/tns/darpa1998.tns", // 6 + "/home/min/a/kadhitha/ispc-examples/data/tns/freebase_music.tns", + "/home/min/a/kadhitha/ispc-examples/data/tns/freebase_sampled.tns" // 8 + }; + std::vector matfilesrw = { + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/matmul_5-5-5.tns", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/delicious-3d.tns", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/flickr-3d.tns", // 2 + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/nell-2.tns", // 3 + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/nell-1.tns", // 4 + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/vast-2015-mc1-3d.tns", // 5 +
"/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/darpa1998.tns", // 6 + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/freebase_music.tns", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/freebase_sampled.tns" + }; + std::string matfile = matfiles[matfilenum]; + Tensor B = read(matfile, csf, true); + // write(matfilesrw[matfilenum], B); + + // Generate a random dense matrix and store it in row-major (dense) format. + // Matrices correspond to order-2 tensors in taco. + Tensor C({B.getDimension(1), jDim}, rm); + for (int i = 0; i < C.getDimension(0); ++i) { + for (int j = 0; j < C.getDimension(1); ++j) { + C.insert({i,j}, unif(gen)); + } + } + C.pack(); + + // Generate another random dense matrix and store it in row-major format. + Tensor D({B.getDimension(2), jDim}, rm); + for (int i = 0; i < D.getDimension(0); ++i) { + for (int j = 0; j < D.getDimension(1); ++j) { + D.insert({i,j}, unif(gen)); + } + } + D.pack(); + + Tensor E({jDim, mDim}, rm); + for (int i = 0; i < E.getDimension(0); ++i) { + for (int j = 0; j < E.getDimension(1); ++j) { + E.insert({i,j}, unif(gen)); + } + } + E.pack(); + + if (statfile.is_open()) { + statfile + << matfile << std::endl + << "A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m)" << std::endl + << "B1_dimension: " << B.getDimension(0) << ", B2_dimension: " << B.getDimension(1) << ", B3_dimension: " << B.getDimension(2) << ", vals: " << B.getStorage().getValues().getSize() << std::endl + << "C1_dimension: " << C.getDimension(0) << ", C2_dimension: " << C.getDimension(1) << ", vals: " << C.getStorage().getValues().getSize() << std::endl + << "D1_dimension: " << D.getDimension(0) << ", D2_dimension: " << D.getDimension(1) << ", vals: " << D.getStorage().getValues().getSize() << std::endl + << "E1_dimension: " << E.getDimension(0) << ", E2_dimension: " << E.getDimension(1) << ", vals: " << E.getStorage().getValues().getSize() << std::endl + << std::endl; + } + + // Declare the output matrix to be a dense matrix with
25 columns and the same + // number of rows as the number of slices along the first dimension of input + // tensor B, to be also stored as a row-major dense matrix. + Tensor A({B.getDimension(0), mDim}, sd); + Tensor ref({B.getDimension(0), mDim}, sd); + + // Define the MTTKRP computation using index notation. + IndexVar i, k, l, j, m; + IndexVar i1("i1"), i2("i2"), j1("j1"), j2("j2"), m1("m1"), m2("m2"); + + A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m); + + IndexStmt stmt = makeReductionNotation(A.getAssignment()); + stmt = makeConcreteNotation(stmt); + stmt = reorderLoopsTopologically(stmt); + // stmt = stmt.reorder({i,j,k,l,m}); + stmt = loopFusionOverFission(stmt, A.getAssignment(), "b", 1); + stmt = stmt.split(i, i1, i2, 16); + stmt = insertTemporaries(stmt); + stmt = parallelizeOuterLoop(stmt); + printToFile("fusedMTTKRPFusedPar", stmt); + A.compile(stmt); + A.assemble(); + + + ref(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m); + IndexStmt refStmt = makeReductionNotation(ref.getAssignment()); + refStmt = makeConcreteNotation(refStmt); + refStmt = refStmt + .split(i, i1, i2, 16); + refStmt = insertTemporaries(refStmt); + refStmt = parallelizeOuterLoop(refStmt); + ref.compile(refStmt); + ref.assemble(); + + Tensor ref2({B.getDimension(0), jDim}, sd); + ref2(i,j) = B(i,k,l) * D(l,j) * C(k,j); + IndexStmt ref2Stmt = makeReductionNotation(ref2.getAssignment()); + ref2Stmt = makeConcreteNotation(ref2Stmt); + ref2Stmt = ref2Stmt + .split(i, i1, i2, 16); + ref2Stmt = insertTemporaries(ref2Stmt); + ref2Stmt = parallelizeOuterLoop(ref2Stmt); + ref2.compile(ref2Stmt); + ref2.assemble(); + + Tensor ref2_ryan({B.getDimension(0), jDim}, sd); + ref2_ryan(i,j) = B(i,k,l) * D(l,j) * C(k,j); + + IndexStmt ref2RyanStmt = makeReductionNotation(ref2_ryan.getAssignment()); + ref2RyanStmt = makeConcreteNotation(ref2RyanStmt); + + IndexExpr precomputeExpr = ref2RyanStmt.as().getStmt().as().getStmt() + .as().getStmt().as().getStmt() + .as().getRhs().as().getA(); + TensorVar w("w", 
Type(Float64, {Dimension(j)}), taco::dense); + ref2RyanStmt = ref2RyanStmt.split(i, i1, i2, 16) + .reorder({i1, i2, k, l, j}) + .precompute(precomputeExpr, j, j, w) + .parallelize(i1, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces); + ref2RyanStmt = insertTemporaries(ref2RyanStmt); + // ref2RyanStmt = parallelizeOuterLoop(ref2RyanStmt); + ref2_ryan.compile(ref2RyanStmt); + ref2_ryan.assemble(); + + Tensor ref3({B.getDimension(0), mDim}, sd); + ref3(i,m) = ref2(i,j) * E(j,m); + IndexStmt ref3Stmt = makeReductionNotation(ref3.getAssignment()); + ref3Stmt = makeConcreteNotation(ref3Stmt); + ref3Stmt = ref3Stmt + .split(i, i1, i2, 16) + .split(j, j1, j2, 16) + .split(m, m1, m2, 16) + .reorder({i1, j1, m1, i2, j2, m2}) + .parallelize(i1, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces); + ref3Stmt = insertTemporaries(ref3Stmt); + ref3Stmt = parallelizeOuterLoop(ref3Stmt); + ref3.compile(ref3Stmt); + ref3.assemble(); + + + std::cout << "compute start\n"; + taco::util::TimeResults timevalue; + bool time = true; + + TOOL_BENCHMARK_TIMER(ref2.compute(statfile), "\n\nDefault MTTKRP: ", timevalue); + if (statfile.is_open()) { + statfile << "default mttkrp time: "; + statfile << timevalue.mean << std::endl; + } else { std::cout << " stat file is not open\n"; } + + TOOL_BENCHMARK_TIMER(ref2_ryan.compute(statfile), "\n\nRyan MTTKRP workspace: ", timevalue); + if (statfile.is_open()) { + statfile << "ryan mttkrp workspace time: "; + statfile << timevalue.mean << std::endl; + } else { std::cout << " stat file is not open\n"; } + + double* ref2_vals = (double*) (ref2.getTacoTensorT()->vals); + double* ref2_ryan_vals = (double*) (ref2_ryan.getTacoTensorT()->vals); + for (int q=0; q < B.getDimension(0)* jDim; q++) { + if ( abs(ref2_vals[q] - ref2_ryan_vals[q])/abs(ref2_ryan_vals[q]) > ERROR_MARGIN) { + std::cout << "error: results don't match i: " << q << ", avals: " << ref2_vals[q] << " " + << "refvals: " << ref2_ryan_vals[q] << std::endl; + ASSERT_TRUE(false); + } + } + 
+ TOOL_BENCHMARK_TIMER(ref3.compute(statfile), "\n\nGeMM time: ", timevalue); + if (statfile.is_open()) { + statfile << "GeMM time: "; + statfile << timevalue.mean << std::endl; + } else { std::cout << " stat file is not open\n"; } + + + TOOL_BENCHMARK_TIMER(ref.compute(statfile), "\n\nReference MTTKRP+GEMM: ", timevalue); + if (statfile.is_open()) { + statfile << "reference asymptotic blowup time: "; + statfile << timevalue.mean << std::endl; + } else { std::cout << " stat file is not open\n"; } + + double* ref3_vals = (double*) (ref3.getTacoTensorT()->vals); + double* ref_vals = (double*) (ref.getTacoTensorT()->vals); + for (int q=0; q < B.getDimension(0)* mDim; q++) { + if ( abs(ref3_vals[q] - ref_vals[q])/abs(ref_vals[q]) > ERROR_MARGIN) { + std::cout << "error: results don't match i: " << q << ", avals: " << ref3_vals[q] << " " + << "refvals: " << ref_vals[q] << std::endl; + ASSERT_TRUE(false); + } + } + + TOOL_BENCHMARK_TIMER(A.compute(statfile), "\n\nFused MTTKRP+GEMM: ", timevalue); + if (statfile.is_open()) { + statfile << "fused mttkrp+gemm time: "; + statfile << timevalue.mean << std::endl; + } else { std::cout << " stat file is not open\n"; } + + if (statfile.is_open()) { + statfile.close(); + } + + double* A_vals = (double*) (A.getTacoTensorT()->vals); + for (int q=0; q < B.getDimension(0)* mDim; q++) { + if ( abs(A_vals[q] - ref_vals[q])/abs(ref_vals[q]) > ERROR_MARGIN) { + std::cout << "error: results don't match i: " << q << ", avals: " << A_vals[q] << " " + << "refvals: " << ref_vals[q] << std::endl; + ASSERT_TRUE(false); + } + } + + +} + +TEST(scheduling_eval, ttmFusedWithSyntheticData) { + if (should_use_CUDA_codegen()) { + return; + } + + taco_set_num_threads(NUM_THREADS_TO_USE); + + std::default_random_engine gen(0); + std::uniform_real_distribution unif(0.0, 1.0); + Format csf({Sparse,Sparse,Sparse}); + Format custom({Sparse,Sparse,Dense}); + Format rm({Dense,Dense}); + + int NUM_I = 5; + int NUM_J = 5; + int NUM_K = 5; + int NUM_L = 64; + int 
NUM_M = 1024; + float SPARSITY = .1; + + Tensor B("B", {NUM_I, NUM_J, NUM_K}, csf); + srand(549694); + for (int i = 0; i < NUM_I; i++) { + for (int j = 0; j < NUM_J; j++) { + for (int k = 0; k < NUM_K; k++) { + float rand_float = (float) rand() / (float) (RAND_MAX); + if (rand_float < SPARSITY) { + B.insert({i, j, k}, (double) ((int) (rand_float * 3 / SPARSITY))); + } + } + } + } + B.pack(); + write("/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/synthetic.tns", B); + + // Generate a random dense matrix and store it in row-major (dense) format. + // Matrices correspond to order-2 tensors in taco. + Tensor C({B.getDimension(2), NUM_L}, rm); + for (int i = 0; i < C.getDimension(0); ++i) { + for (int j = 0; j < C.getDimension(1); ++j) { + C.insert({i,j}, unif(gen)); + } + } + C.pack(); + + // Generate another random dense matrix and store it in row-major format. + Tensor D({NUM_L, NUM_M}, rm); + for (int i = 0; i < D.getDimension(0); ++i) { + for (int j = 0; j < D.getDimension(1); ++j) { + D.insert({i,j}, unif(gen)); + } + } + D.pack(); + + Tensor A({B.getDimension(0), B.getDimension(1), NUM_M}, custom); + Tensor ref({B.getDimension(0), B.getDimension(1), NUM_M}, custom); + + // Define the fused TTM->TTM computation using index notation.
+ IndexVar i, j, k, l, m; + A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m); + + IndexStmt stmt = makeReductionNotation(A.getAssignment()); + stmt = makeConcreteNotation(stmt); + printToFile("fusedTTMTTKRPConcrete", stmt); + + stmt = reorderLoopsTopologically(stmt); + printToFile("fusedTTMOrdered", stmt); + + stmt = loopFusionOverFission(stmt, A.getAssignment(), "b", 1); + printToFile("fusedTTMFused", stmt); + + stmt = insertTemporaries(stmt); + printToFile("fusedTTMWithTemps", stmt); + stmt = parallelizeOuterLoop(stmt); + printToFile("fusedTTMFinal", stmt); + + + // At this point, we have defined how entries in the output tensor should be + // computed from entries in the input tensor and matrices but have not actually + // performed the computation yet. To do so, we must first tell taco to generate + // code that can be executed to compute the TTM->TTM operation. + A.compile(stmt); + // We can now call the functions taco generated to assemble the indices of the + // output tensor and then actually compute the TTM->TTM chain.
+ A.assemble(); + + + ref(i,j,m) = B(i,j,k) * C(k,l) * D(l,m); + IndexStmt refStmt = makeReductionNotation(ref.getAssignment()); + refStmt = makeConcreteNotation(refStmt); + refStmt = insertTemporaries(refStmt); + refStmt = parallelizeOuterLoop(refStmt); + printToFile("tacoFusedTTM", refStmt); + ref.compile(refStmt); + ref.assemble(); + + Tensor ref1({B.getDimension(0), B.getDimension(1), NUM_L}, custom); + ref1(i,j,l) = B(i,j,k) * C(k,l); + IndexStmt ref1Stmt = makeReductionNotation(ref1.getAssignment()); + ref1Stmt = makeConcreteNotation(ref1Stmt); + ref1Stmt = insertTemporaries(ref1Stmt); + ref1Stmt = parallelizeOuterLoop(ref1Stmt); + ref1.compile(ref1Stmt); + ref1.assemble(); + + Tensor ref2({B.getDimension(0), B.getDimension(1), NUM_M}, custom); + ref2(i,j,m) = ref1(i,j,l) * D(l,m); + IndexStmt ref2Stmt = makeReductionNotation(ref2.getAssignment()); + ref2Stmt = makeConcreteNotation(ref2Stmt); + ref2Stmt = insertTemporaries(ref2Stmt); + ref2Stmt = parallelizeOuterLoop(ref2Stmt); + ref2.compile(ref2Stmt); + ref2.assemble(); + + Tensor ref3({B.getDimension(2), NUM_M}, rm); + ref3(k,m) = C(k,l) * D(l,m); + IndexStmt ref3Stmt = makeReductionNotation(ref3.getAssignment()); + ref3Stmt = makeConcreteNotation(ref3Stmt); + ref3Stmt = insertTemporaries(ref3Stmt); + ref3Stmt = parallelizeOuterLoop(ref3Stmt); + ref3.compile(ref3Stmt); + ref3.assemble(); + + Tensor ref4({B.getDimension(0), B.getDimension(1), NUM_M}, custom); + ref4(i,j,m) = B(i,j,k) * ref3(k,m); + IndexStmt ref4Stmt = makeReductionNotation(ref4.getAssignment()); + ref4Stmt = makeConcreteNotation(ref4Stmt); + ref4Stmt = insertTemporaries(ref4Stmt); + ref4Stmt = parallelizeOuterLoop(ref4Stmt); + ref4.compile(ref4Stmt); + ref4.assemble(); + + std::cout << "compute start\n"; + taco::util::TimeResults timevalue; + bool time = true; + // TOOL_BENCHMARK_TIMER(ref.compute(), "\n\nReference ISPC: ", timevalue); + TOOL_BENCHMARK_TIMER(A.compute(), "\n\nFused TTM->TTM: ", timevalue); + 
TOOL_BENCHMARK_TIMER(ref.compute(), "\n\nReference TTM->TTM: ", timevalue); + TOOL_BENCHMARK_TIMER(ref1.compute(), "\n\nTTM1: ", timevalue); + TOOL_BENCHMARK_TIMER(ref2.compute(), "\n\nTTM1: ", timevalue); + TOOL_BENCHMARK_TIMER(ref3.compute(), "\n\ndense: ", timevalue); + TOOL_BENCHMARK_TIMER(ref4.compute(), "\n\nTTM after dense: ", timevalue); + ASSERT_TENSOR_EQ(ref, A); + ASSERT_TENSOR_EQ(ref, ref2); + ASSERT_TENSOR_EQ(ref, ref4); + + for (int q = 0; q < A.getDimension(0); ++q) { + for (int w = 0; w < A.getDimension(1); ++w) { + for (int z = 0; z < A.getDimension(2); ++z) { + // std::cout << "(" << q << "," << w << "," << z << ")" + // << "a: " << A(q,w,z) << ", ref: " << ref(q,w,z) << std::endl; + if ( abs(A(q,w,z) - ref(q,w,z))/abs(ref(q,w,z)) > ERROR_MARGIN) { + std::cout << "error: results don't match A: " + << A(q,w,z) << ", ref: " << ref(q,w,z) << std::endl; + ASSERT_TRUE(false); + } + } + } + } + +} + +TEST(scheduling_eval, ttmFused) { + if (should_use_CUDA_codegen()) { + return; + } + + int retval, EventSet = PAPI_NULL; + retval = PAPI_hl_region_begin("dummy"); + if ( retval != PAPI_OK ) handle_error(1); + + retval = PAPI_hl_region_end("dummy"); + if ( retval != PAPI_OK ) handle_error(1); + + taco_set_num_threads(NUM_THREADS_TO_USE); + + ofstream statfile; + statfile.open( + "/home/min/a/kadhitha/workspace/my_taco/taco/test/stats/ttm-ttm.txt", std::ios::app); + if (statfile.is_open()) { + statfile << "\nttm-ttm execution\n"; + statfile << "\n-----------------------------------------\n"; + } + + std::default_random_engine gen(0); + std::uniform_real_distribution unif(0.0, 1.0); + Format csf({Dense,Sparse,Sparse}); + Format custom({Dense,Sparse,Dense}); + Format rm({Dense,Dense}); + int ldim = 32; + int mdim = 64; + + int64_t dummy_array_size = 2e6; + int64_t* dummy_array_to_flush_cache = (int64_t*) malloc(dummy_array_size*sizeof(int64_t)); + + vector matfilenums = {5}; + + for (auto matfilenum : matfilenums) { + + // int matfilenum = 0; + + + + // Load a 
sparse order-3 tensor from file (stored in the FROSTT format) and + // store it as a compressed sparse fiber tensor. The tensor in this example + // can be download from: http://frostt.io/tensors/nell-2/ + std::vector matfiles = { + "/home/min/a/kadhitha/ispc-examples/data/tns/matmul_5-5-5.tns", + "/home/min/a/kadhitha/ispc-examples/data/tns/delicious-3d.tns", + "/home/min/a/kadhitha/ispc-examples/data/tns/flickr-3d.tns", // 2 + "/home/min/a/kadhitha/ispc-examples/data/tns/nell-2.tns", // 3 + "/home/min/a/kadhitha/ispc-examples/data/tns/nell-1.tns", // 4 + "/home/min/a/kadhitha/workspace/my_taco/tns/vast-2015-mc1-3d.tns", // 5 + "/home/min/a/kadhitha/workspace/my_taco/tns/darpa1998.tns", // 6 + "/home/min/a/kadhitha/ispc-examples/data/tns/freebase_music.tns", + "/home/min/a/kadhitha/ispc-examples/data/tns/freebase_sampled.tns" + }; + std::vector matfilesrw = { + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/matmul_5-5-5.tns", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/delicious-3d.tns", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/flickr-3d.tns", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/nell-2.tns", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/nell-1.tns", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/vast-2015-mc1-3d.tns", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/darpa1998.tns", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/freebase_music.tns", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/freebase_sampled.tns" + }; + statfile << "\nfile: " << matfiles[matfilenum] << std::endl; + statfile << "----------------------------------------------------------------\n"; + + std::string matfile = matfiles[matfilenum]; + Tensor B = read(matfile, csf); + B.setName("B"); + B.pack(); + // write(matfilesrw[matfilenum], B); + + // Generate a random dense matrix and store it in row-major (dense) format. + // Matrices correspond to order-2 tensors in taco. 
+ Tensor C("C", {B.getDimension(2), ldim}, rm); + for (int i = 0; i < C.getDimension(0); ++i) { + for (int j = 0; j < C.getDimension(1); ++j) { + C.insert({i,j}, unif(gen)); + } + } + C.pack(); + + // Generate another random dense matrix and store it in row-major format. + Tensor D("D", {ldim, mdim}, rm); + for (int i = 0; i < D.getDimension(0); ++i) { + for (int j = 0; j < D.getDimension(1); ++j) { + D.insert({i,j}, unif(gen)); + } + } + D.pack(); + + if (statfile.is_open()) { + statfile + << matfile << std::endl + << "A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m)" << std::endl + << "B1_dimension: " << B.getDimension(0) << ", B2_dimension: " << B.getDimension(1) << ", B3_dimension: " << B.getDimension(2) << ", vals: " << B.getStorage().getValues().getSize() << std::endl + << "C1_dimension: " << C.getDimension(0) << ", C2_dimension: " << C.getDimension(1) << ", vals: " << C.getStorage().getValues().getSize() << std::endl + << "D1_dimension: " << D.getDimension(0) << ", D2_dimension: " << D.getDimension(1) << ", vals: " << D.getStorage().getValues().getSize() << std::endl + << std::endl; + } + + Tensor A({B.getDimension(0), B.getDimension(1), mdim}, custom); + Tensor ref({B.getDimension(0), B.getDimension(1), mdim}, custom); + Tensor refn({B.getDimension(0), B.getDimension(1), mdim}, custom); + + // Define the MTTKRP computation using index notation. 
+ IndexVar i, j, k, l, m; + IndexVar i0,i1, j0, j1, k0, k1, l0, l1, m0, m1; + A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m); + + + IndexStmt stmt = makeReductionNotation(A.getAssignment()); + stmt = makeConcreteNotation(stmt); + stmt = reorderLoopsTopologically(stmt); + stmt = loopFusionOverFission(stmt, A.getAssignment(), "b", 1); + stmt = stmt.split(i, i0, i1, 16); + stmt = insertTemporaries(stmt); + stmt = parallelizeOuterLoop(stmt); + printToFile("fusedTTMFinal", stmt); + + A.compile(stmt); + A.assemble(); + + + ref(i,j,m) = B(i,j,k) * C(k,l) * D(l,m); // TTM->TTM TACO + IndexStmt refStmt = makeReductionNotation(ref.getAssignment()); + refStmt = makeConcreteNotation(refStmt); + refStmt = refStmt + .split(i, i0, i1, 16); + refStmt = insertTemporaries(refStmt); + refStmt = parallelizeOuterLoop(refStmt); + printToFile("tacoFusedTTM", refStmt); + ref.compile(refStmt); + ref.assemble(); + + refn(i,j,m) = B(i,j,k) * C(k,l) * D(l,m); // TTM->TTM TACO + IndexStmt refnStmt = makeReductionNotation(refn.getAssignment()); + refnStmt = makeConcreteNotation(refnStmt); + refnStmt = refnStmt + .split(i, i0, i1, 16) + .reorder({i0, i1, j, k, l, m}); + refnStmt = insertTemporaries(refnStmt); + refnStmt = parallelizeOuterLoop(refnStmt); + printToFile("tacoFusedTTM", refnStmt); + refn.compile(refnStmt); + refn.assemble(); + + Tensor ref1({B.getDimension(0), B.getDimension(1), ldim}, custom); + ref1(i,j,l) = B(i,j,k) * C(k,l); // TTM1 + IndexStmt ref1Stmt = makeReductionNotation(ref1.getAssignment()); + ref1Stmt = makeConcreteNotation(ref1Stmt); + // ref1Stmt = ref1Stmt.split(i, i0, i1, 16); + ref1Stmt = insertTemporaries(ref1Stmt); + ref1Stmt = parallelizeOuterLoop(ref1Stmt); + ref1.compile(ref1Stmt); + ref1.assemble(); + + Tensor ref2({B.getDimension(0), B.getDimension(1), mdim}, custom); + ref2(i,j,m) = ref1(i,j,l) * D(l,m); // TTM2 + IndexStmt ref2Stmt = makeReductionNotation(ref2.getAssignment()); + ref2Stmt = makeConcreteNotation(ref2Stmt); + // ref2Stmt = ref2Stmt.split(i, i0, i1, 
16); + ref2Stmt = insertTemporaries(ref2Stmt); + ref2Stmt = parallelizeOuterLoop(ref2Stmt); + ref2.compile(ref2Stmt); + ref2.assemble(); + + Tensor ref3({B.getDimension(2), mdim}, rm); + ref3(k,m) = C(k,l) * D(l,m); // GeMM + IndexStmt ref3Stmt = makeReductionNotation(ref3.getAssignment()); + ref3Stmt = makeConcreteNotation(ref3Stmt); + ref3Stmt = ref3Stmt + .split(k, k0, k1, 32) + .split(l, l0, l1, 32) + .split(m, m0, m1, 32) + .reorder({k0, l0, m0, k1, l1, m1}); + ref3Stmt = insertTemporaries(ref3Stmt); + ref3Stmt = parallelizeOuterLoop(ref3Stmt); + ref3.compile(ref3Stmt); + ref3.assemble(); + + Tensor ref4({B.getDimension(0), B.getDimension(1), mdim}, custom); + ref4(i,j,m) = B(i,j,k) * ref3(k,m); // TTM1 + IndexStmt ref4Stmt = makeReductionNotation(ref4.getAssignment()); + ref4Stmt = makeConcreteNotation(ref4Stmt); + // ref4Stmt = ref4Stmt + // .split(i, i0, i1, 16); + // // .split(k, k0, k1, 16) + // .split(m, m0, m1, 16) + // .reorder({i0, i1, j, m0, k, m1}); + ref4Stmt = insertTemporaries(ref4Stmt); + ref4Stmt = parallelizeOuterLoop(ref4Stmt); + ref4.compile(ref4Stmt); + ref4.assemble(); + + std::cout << "compute start\n"; + taco::util::TimeResults timevalue; + bool time = true; + + int r = rand(); + for (int64_t i=0; iTTM: ", timevalue); + retval = PAPI_hl_region_end("fusedTTM"); if ( retval != PAPI_OK ) handle_error(1); + if (statfile.is_open()) { + statfile << "fused time: "; + statfile << timevalue.mean << std::endl; + } else { std::cout << " stat file is not open\n"; } + + r = rand(); + for (int64_t i=0; iTTM: ", timevalue); + retval = PAPI_hl_region_end("referenceTTM"); if ( retval != PAPI_OK ) handle_error(1); + if (statfile.is_open()) { + statfile << "reference time: "; + statfile << timevalue.mean << std::endl; + } else { std::cout << " stat file is not open\n"; } + + r = rand(); + for (int64_t i=0; iTTM: ", timevalue); + retval = PAPI_hl_region_end("ref2TTM"); if ( retval != PAPI_OK ) handle_error(1); + if (statfile.is_open()) { + statfile << 
"reference new time: "; + statfile << timevalue.mean << std::endl; + } else { std::cout << " stat file is not open\n"; } + + statfile << "\nschedule 1\n"; + + r = rand(); + for (int64_t i=0; ivals); + double* ref_vals = (double*) (ref.getTacoTensorT()->vals); + double* ref2_vals = (double*) (ref2.getTacoTensorT()->vals); + double* ref4_vals = (double*) (ref4.getTacoTensorT()->vals); + + // int* A2_pos = (double*) (ref.getTacoTensorT()->vals); + + // for (size_t q=0; q < B.getStorage().getValues().getSize(); q++) { + // if ( abs(A_vals[q] - ref_vals[q])/abs(ref_vals[q]) > ERROR_MARGIN) { + // std::cout << "error: results don't match i: " << q << ", avals: " << A_vals[q] << " " + // << "refvals: " << ref_vals[q] << std::endl; + // ASSERT_TRUE(false); + // } + // } + + // std::cout << "our fused vs taco original fused check\n"; + // for (size_t q=0; q < A.getStorage().getValues().getSize(); q++) { + // if ( abs(A_vals[q] - ref_vals[q])/abs(ref_vals[q]) > ERROR_MARGIN) { + // std::cout << "error: results don't match i: " << q << ", avals: " << A_vals[q] << " " + // << "refvals: " << ref_vals[q] << std::endl; + // ASSERT_TRUE(false); + // } + // } + // std::cout << "taco original fused vs TTM1, TTM2 check\n"; + // for (size_t q=0; q < A.getStorage().getValues().getSize(); q++) { + // if ( abs(ref_vals[q] - ref2_vals[q])/abs(ref2_vals[q]) > ERROR_MARGIN) { + // std::cout << "error: results don't match i: " << q << ", avals: " << ref_vals[q] << " " + // << "refvals: " << ref2_vals[q] << std::endl; + // ASSERT_TRUE(false); + // } + // } + // std::cout << "taco original fused vs GeMM, TTM1 check\n"; + // for (size_t q=0; q < A.getStorage().getValues().getSize(); q++) { + // if ( abs(ref_vals[q] - ref4_vals[q])/abs(ref4_vals[q]) > ERROR_MARGIN) { + // std::cout << "error: results don't match i: " << q << ", avals: " << ref_vals[q] << " " + // << "refvals: " << ref4_vals[q] << std::endl; + // ASSERT_TRUE(false); + // } + // } + + } // end of forloop + + if 
(statfile.is_open()) { + statfile.close(); + } + +} + + + + +TEST(scheduling_eval, spmmFusedWithSyntheticData) { + if (should_use_CUDA_codegen() || should_use_ISPC_codegen()) { + return; + } + + taco_set_num_threads(NUM_THREADS_TO_USE); + + std::default_random_engine gen(0); + std::uniform_real_distribution unif(0.0, 1.0); + + Format csr({dense, sparse}); + Format rm({dense, dense}); + int ldim = 32; + int kdim = 64; + + // uncomment this for reading the csr matrix saved in mtx file + std::cout << "reading B mat mtx\n"; + + int NUM_I = 128; + int NUM_J = 96; + int NUM_K = 64; + float SPARSITY = .3; + Tensor B("B", {NUM_I, NUM_J}, csr); + srand(75883); + for (int i = 0; i < NUM_I; i++) { + for (int j = 0; j < NUM_J; j++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + if (rand_float < SPARSITY) { + B.insert({i, j}, (double) ((int) (rand_float*3/SPARSITY))); + } + } + } + B.pack(); + + Tensor C("C", {NUM_J, NUM_K}, csr); + for (int j = 0; j < NUM_J; j++) { + for (int k = 0; k < NUM_K; k++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + if (rand_float < SPARSITY) { + B.insert({j, k}, (double) ((int) (rand_float*3/SPARSITY))); + } + } + } + C.pack(); + // write("/home/min/a/kadhitha/ispc-examples/data/suitesparse/synthetic/synthetic.mtx", B); + + std::cout << "B dim0: " << B.getDimension(0) << ", dim1: " << B.getDimension(1) << std::endl; + std::cout << "adding c mat\n"; + Tensor D({C.getDimension(1), ldim}, rm); + for (int i = 0; i < D.getDimension(0); ++i) { + for (int j = 0; j < D.getDimension(1); ++j) { + D.insert({i,j}, unif(gen)); + } + } + std::cout << "packing C mat\n"; + D.pack(); + + // Tensor E({B.getDimension(1), kdim}, rm); + // for (int i = 0; i < D.getDimension(0); ++i) { + // for (int j = 0; j < D.getDimension(1); ++j) { + // D.insert({i,j}, unif(gen)); + // } + // } + // std::cout << "packing D mat\n"; + // D.pack(); + + // Tensor F({B.getDimension(1), ldim}, rm); + // for (int i = 0; i < F.getDimension(0); ++i) { + // for (int j = 
0; j < F.getDimension(1); ++j) { + // F.insert({i,j}, unif(gen)); + // } + // } + // std::cout << "packing F mat\n"; + // F.pack(); + + Tensor A({B.getDimension(0), ldim}, rm); + Tensor ref({B.getDimension(0), ldim}, rm); + IndexVar i, j, k, l; + A(i,l)=B(i,j)*C(j,k)*D(k,l); + + // IndexStmt stmt = A.getAssignment().concretize(); + IndexStmt stmt = makeReductionNotation(A.getAssignment()); + stmt = makeConcreteNotation(stmt); + printToFile("fusedMMConcrete", stmt); + + stmt = reorderLoopsTopologically(stmt); + printToFile("fusedMMOrdered", stmt); + + stmt = loopFusionOverFission(stmt, A.getAssignment(), "b", 1); + printToFile("fusedMMFused", stmt); + + stmt = insertTemporaries(stmt); + printToFile("fusedMMWithTemps", stmt); + stmt = parallelizeOuterLoop(stmt); + printToFile("fusedMMFusedPar", stmt); + + A.compile(stmt); + // We can now call the functions taco generated to assemble the indices of the + // output matrix and then actually compute the MTTKRP. + A.assemble(); + + + ref(i,l)=B(i,j)*C(j,k)*D(k,l); + IndexStmt refStmt = makeReductionNotation(ref.getAssignment()); + refStmt = makeConcreteNotation(refStmt); + refStmt = insertTemporaries(refStmt); + refStmt = parallelizeOuterLoop(refStmt); + ref.compile(refStmt); + ref.assemble(); + + // Tensor ref1({B.getDimension(0), B.getDimension(1)}, csr); + // Tensor ref2({B.getDimension(0), ldim}, rm); + // ref1(i,j)=B(i,j)*C(i,k)*D(j,k); + // ref2(i,l)=ref1(i,j)*F(j,l); + + // IndexStmt ref1Stmt = makeReductionNotation(ref1.getAssignment()); + // ref1Stmt = makeConcreteNotation(ref1Stmt); + // ref1Stmt = insertTemporaries(ref1Stmt); + // ref1Stmt = parallelizeOuterLoop(ref1Stmt); + // ref1.compile(ref1Stmt); + // ref1.assemble(); + + // IndexStmt ref2Stmt = makeReductionNotation(ref2.getAssignment()); + // ref2Stmt = makeConcreteNotation(ref2Stmt); + // ref2Stmt = insertTemporaries(ref2Stmt); + // ref2Stmt = parallelizeOuterLoop(ref2Stmt); + // ref2.compile(ref2Stmt); + // ref2.assemble(); + + std::cout << "compute 
start\n"; + taco::util::TimeResults timevalue; + bool time = true; + TOOL_BENCHMARK_TIMER(ref.compute(), "\n\nReference Kernel: ", timevalue); + TOOL_BENCHMARK_TIMER(A.compute(), "\n\nFused Kernel: ", timevalue); + + // check results + for (int q = 0; q < A.getDimension(0); ++q) { + for (int w = 0; w < A.getDimension(1); ++w) { + if ( abs(A(q,w) - ref(q,w))/abs(ref(q,w)) > ERROR_MARGIN) { + std::cout << "error: results don't match A("<< q << "," << w << "): " + << A(q,w) << ", ref: " << ref(q,w) << std::endl; + ASSERT_TRUE(false); + } + } + } + // // ASSERT_TENSOR_EQ(A, ref); + // TOOL_BENCHMARK_TIMER(ref1.compute(), "\n\nSDDMM Kernel: ", timevalue); + // TOOL_BENCHMARK_TIMER(ref2.compute(), "\n\nSpMM Kernel: ", timevalue); + + // for (int q = 0; q < ref2.getDimension(0); ++q) { + // for (int w = 0; w < ref2.getDimension(1); ++w) { + // if ( abs(ref2(q,w) - ref(q,w))/abs(ref(q,w)) > ERROR_MARGIN) { + // std::cout << "error: results don't match A("<< q << "," << w << "): " + // << ref2(q,w) << ", ref: " << ref(q,w) << std::endl; + // ASSERT_TRUE(false); + // } + // } + // } + +} + + +TEST(scheduling_eval, spmmFused) { + if (should_use_CUDA_codegen() || should_use_ISPC_codegen()) { + return; + } + + int retval, EventSet = PAPI_NULL; + retval = PAPI_hl_region_begin("dummy"); + if ( retval != PAPI_OK ) handle_error(1); + + /* Do some computation */ + + retval = PAPI_hl_region_end("dummy"); + if ( retval != PAPI_OK ) handle_error(1); + + taco_set_num_threads(NUM_THREADS_TO_USE); + + ofstream statfile; + statfile.open( + "/home/min/a/kadhitha/workspace/my_taco/taco/test/stats/spmm-spmm.txt", std::ios::app); + if (statfile.is_open()) { + statfile << "\nspmm-spmm execution\n"; + statfile << "\n-----------------------------------------\n"; + } + + std::default_random_engine gen(0); + std::uniform_real_distribution unif(0.0, 1.0); + + Format csr({dense, sparse}); + Format rm({dense, dense}); + int kdim = 128; + int ldim = 64; + + // vector filenums = 
{2,3,4,5,6,7,8,9,10,12,15}; + vector filenums = {3}; + + for (auto filenum : filenums) { + + + statfile << "filenum: " << filenum << std::endl; + statfile << "---------------------------------\n"; + // int filenum = 7; + + std::vector matfiles = { + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/synthetic/synthetic.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx", // 2 + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rma10/rma10.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/cant/cant.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/consph/consph.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/shipsec1/shipsec1.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/scircuit/scircuit.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx", // 10 + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/wtk/pwtk.mtx", + "/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx", // 12 + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/wiki-Talk/wiki-Talk.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/com-Orkut/com-Orkut.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx", // 15 + "/home/min/a/kadhitha/workspace/my_taco/FusedMM/dataset/harvard.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/twitter7/twitter7.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k.mtx", + }; + std::vector matfilesrw = { + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/synthetic.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/cage3.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/bcsstk17.mtx", + 
"/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/pdb1HYS.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/rma10.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/cant.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/consph.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/cop20k_A.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/shipsec1.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/scircuit.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/wtk/pwtk.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/webbase-1M.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/wiki-Talk.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/com-Orkut.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/circuit5M.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/harvard.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/twitter7.mtx" + }; + + std::string matfile = matfiles[filenum]; + std::cout << "reading B mat mtx\n"; + Tensor B = read(matfile, csr); + B.pack(); + // write(matfilesrw[filenum], B); + + if (statfile.is_open()) { + statfile << matfile << std::endl; + } + + std::cout << "B dim0: " << B.getDimension(0) << ", dim1: " << B.getDimension(1) << std::endl; + std::cout << "adding c mat\n"; + // Tensor C = read(matfiles2[filenum], csr, true); + // std::cout << "packing C mat\n"; + + std::cout << "B dim0: " << B.getDimension(0) << ", dim1: " << B.getDimension(1) << std::endl; + std::cout << "adding c mat\n"; + Tensor C("C", {B.getDimension(1), kdim}, rm); + for (int i = 0; i < C.getDimension(0); ++i) { + for (int j = 0; j < C.getDimension(1); ++j) { + C.insert({i,j}, unif(gen)); + } + } + std::cout << "packing C mat\n"; + C.pack(); + + Tensor D({C.getDimension(1), ldim}, rm); + for (int i = 0; i < 
D.getDimension(0); ++i) { + for (int j = 0; j < D.getDimension(1); ++j) { + D.insert({i,j}, unif(gen)); + } + } + std::cout << "packing D mat\n"; + D.pack(); + + // Tensor F({B.getDimension(1), ldim}, rm); + // for (int i = 0; i < F.getDimension(0); ++i) { + // for (int j = 0; j < F.getDimension(1); ++j) { + // F.insert({i,j}, unif(gen)); + // } + // } + // std::cout << "packing F mat\n"; + // F.pack(); + + Tensor A({B.getDimension(0), ldim}, rm); + Tensor ref({B.getDimension(0), ldim}, rm); + Tensor refn({B.getDimension(0), ldim}, rm); + IndexVar i, j, k, l; + IndexVar i0, i1, j0, j1, k0, k1, l0, l1; + + A(i,l)=B(i,j)*C(j,k)*D(k,l); + if (statfile.is_open()) { + statfile + << "ref(i,l)=B(i,j)*C(i,k)*D(j,k);" << std::endl + << "B1_dimension: " << B.getDimension(0) << ", B2_dimension: " << B.getDimension(1) << ", vals: " << B.getStorage().getValues().getSize() << std::endl + << "C1_dimension: " << C.getDimension(0) << ", C2_dimension: " << C.getDimension(1) << ", vals: " << C.getStorage().getValues().getSize() << std::endl + << "D1_dimension: " << D.getDimension(0) << ", D2_dimension: " << D.getDimension(1) << ", vals: " << D.getStorage().getValues().getSize() << std::endl + // << "E1_dimension: " << F.getDimension(0) << ", E2_dimension: " << F.getDimension(1) << ", vals: " << F.getStorage().getValues().getSize() << std::endl + << std::endl; + } + + // IndexStmt stmt = A.getAssignment().concretize(); + IndexStmt stmt = makeReductionNotation(A.getAssignment()); + stmt = makeConcreteNotation(stmt); + stmt = reorderLoopsTopologically(stmt); + stmt = loopFusionOverFission(stmt, A.getAssignment(), "b", 1); + stmt = stmt.split(i, i0, i1, 16); + stmt = insertTemporaries(stmt); + stmt = parallelizeOuterLoop(stmt); + + A.compile(stmt); + A.assemble(); + + + ref(i,l)=B(i,j)*C(j,k)*D(k,l); + refn(i,l)=B(i,j)*C(j,k)*D(k,l); + // IndexStmt refStmt = ref.getAssignment().concretize(); + + // ref1Stmt = ref1Stmt.split(i, i0, i1, 16); + // .pos(j, jpos, B(i,j)); + // .split(k, k0, 
k1, 8); + // .reorder({i0, i1, jpos0, k, jpos1}); + // .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces) + // .parallelize(jpos1, ParallelUnit::CPUVector, OutputRaceStrategy::ParallelReduction); + IndexStmt refStmt = makeReductionNotation(ref.getAssignment()); + refStmt = makeConcreteNotation(refStmt); + refStmt = refStmt + .split(i, i0, i1, 16) + .split(k, k0, k1, 32) + .split(l, l0, l1, 32) + .reorder({i0, i1, j, k0, l0, k1, l1}); + refStmt = insertTemporaries(refStmt); + refStmt = parallelizeOuterLoop(refStmt); + ref.compile(refStmt); + ref.assemble(); + + IndexStmt refnStmt = makeReductionNotation(refn.getAssignment()); + refnStmt = makeConcreteNotation(refnStmt); + refnStmt = refnStmt + .split(i, i0, i1, 16); + refnStmt = insertTemporaries(refnStmt); + refnStmt = parallelizeOuterLoop(refnStmt); + refn.compile(refnStmt); + refn.assemble(); + + // SpMM , GEMM + + Tensor ref1({B.getDimension(0), kdim}, rm); + Tensor ref2({B.getDimension(0), ldim}, rm); + Tensor ref2_2({B.getDimension(0), ldim}, rm); + + ref1(i,k)=B(i,j)*C(j,k); + ref2(i,l)=ref1(i,k)*D(k,l); + ref2_2(i,l)=ref1(i,k)*D(k,l); + + IndexStmt ref1Stmt = makeReductionNotation(ref1.getAssignment()); + ref1Stmt = makeConcreteNotation(ref1Stmt); + ref1Stmt = insertTemporaries(ref1Stmt); + ref1Stmt = parallelizeOuterLoop(ref1Stmt); + ref1.compile(ref1Stmt); + ref1.assemble(); + + IndexStmt ref2Stmt = makeReductionNotation(ref2.getAssignment()); + ref2Stmt = makeConcreteNotation(ref2Stmt); + ref2Stmt = insertTemporaries(ref2Stmt); + ref2Stmt = ref2Stmt.split(i, i0, i1, 16); + ref2Stmt = parallelizeOuterLoop(ref2Stmt); + ref2.compile(ref2Stmt); + ref2.assemble(); + + IndexStmt ref2Stmt2 = makeReductionNotation(ref2_2.getAssignment()); + ref2Stmt2 = makeConcreteNotation(ref2Stmt2); + ref2Stmt2 = ref2Stmt2 + .split(i, i0, i1, 32) + .split(k,k0,k1, 32) + .split(l, l0, l1, 32) + .reorder({i0, k0, l0, i1, k1, l1}) + .parallelize(j0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces); + 
ref2Stmt2 = insertTemporaries(ref2Stmt2); + // ref2Stmt2 = parallelizeOuterLoop(ref2Stmt2); + ref2_2.compile(ref2Stmt2); + ref2_2.assemble(); + + + // -------------- GeMM and SpMM + + Tensor ref3({C.getDimension(0), ldim}, rm); + Tensor ref4({C.getDimension(0), ldim}, rm); + ref3(j,l)=C(j,k)*D(k,l); // GEMM + ref4(i,l) = B(i,j)*ref3(j,l); // SpMM + + IndexStmt ref3Stmt = ref3.getAssignment().concretize(); + ref3Stmt = ref3Stmt + .split(j, j0, j1, 32) // changed to 32 + .split(k, k0, k1, 32) + .split(l, l0, l1, 32) + .reorder({j0, k0, l0, j1, k1, l1}) + .parallelize(j0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces); + ref2Stmt2 = insertTemporaries(ref2Stmt2); + ref3.compile(ref3Stmt); + ref3.assemble(); + + IndexStmt ref4Stmt = makeReductionNotation(ref4.getAssignment()); // SpMM operation + ref4Stmt = makeConcreteNotation(ref4Stmt); + ref4Stmt = ref4Stmt.split(i, i0, i1, 16); + ref4Stmt = insertTemporaries(ref4Stmt); + ref4Stmt = parallelizeOuterLoop(ref4Stmt); + ref4.compile(ref4Stmt); + ref4.assemble(); + + + std::cout << "compute start\n"; + taco::util::TimeResults timevalue; + bool time = true; + + statfile << "\n--------- 1st pattern computation TTM, GEMM\n"; + + retval = PAPI_hl_region_begin("spmm"); + if ( retval != PAPI_OK ) handle_error(1); + TOOL_BENCHMARK_TIMER(ref1.compute(statfile), "\n\nSpMM Kernel: ", timevalue); + retval = PAPI_hl_region_end("spmm"); + if ( retval != PAPI_OK ) handle_error(1); + if (statfile.is_open()) { + statfile << "SpMM time: "; + statfile << timevalue.mean << std::endl; + } else { std::cout << " stat file is not open\n"; } + + std::string sofile_spmm_template = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/csr_dense_spmm.so"; + retval = PAPI_hl_region_begin("spmmtemplate"); + if ( retval != PAPI_OK ) handle_error(1); + TOOL_BENCHMARK_TIMER(ref1.compute(statfile, sofile_spmm_template), "\n\nSpMM template Kernel: ", timevalue); + retval = PAPI_hl_region_end("spmmtemplate"); + if ( retval != PAPI_OK 
) handle_error(1); + if (statfile.is_open()) { + statfile << "SpMM template time: "; + statfile << timevalue.mean << std::endl; + } else { std::cout << " stat file is not open\n"; } + + retval = PAPI_hl_region_begin("gemm"); + if ( retval != PAPI_OK ) handle_error(1); + TOOL_BENCHMARK_TIMER(ref2.compute(statfile), "\n\nGeMM Kernel: ", timevalue); + retval = PAPI_hl_region_end("gemm"); + if ( retval != PAPI_OK ) handle_error(1); + if (statfile.is_open()) { + statfile << "GeMM time: "; + statfile << timevalue.mean << std::endl; + } else { std::cout << " stat file is not open\n"; } + + retval = PAPI_hl_region_begin("gemmtemplate"); + if ( retval != PAPI_OK ) handle_error(1); + TOOL_BENCHMARK_TIMER(ref2_2.compute(statfile), "\n\nref GeMM template Kernel: ", timevalue); + retval = PAPI_hl_region_end("gemmtemplate"); + if ( retval != PAPI_OK ) handle_error(1); + if (statfile.is_open()) { + statfile << "ref 2 GeMM template time: "; + statfile << timevalue.mean << std::endl; + } else { std::cout << " stat file is not open\n"; } + + // std::string sofile_gemm_template = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/spmm_gemm/spmm_template.so"; + statfile << "\n--------- 2nd pattern computation GEMM, SpMM\n"; + retval = PAPI_hl_region_begin("gemmtemplate2"); + if ( retval != PAPI_OK ) handle_error(1); + TOOL_BENCHMARK_TIMER(ref3.compute(statfile), "\n\nGeMM template ref3 Kernel: ", timevalue); + retval = PAPI_hl_region_end("gemmtemplate2"); + if ( retval != PAPI_OK ) handle_error(1); + if (statfile.is_open()) { + statfile << "ref3 GeMM template time: "; + statfile << timevalue.mean << std::endl; + } else { std::cout << " stat file is not open\n"; } + + retval = PAPI_hl_region_begin("spmm2"); + if ( retval != PAPI_OK ) handle_error(1); + TOOL_BENCHMARK_TIMER(ref4.compute(statfile, sofile_spmm_template), "\n\nSpMM template Kernel ref4: ", timevalue); + retval = PAPI_hl_region_end("spmm2"); + if ( retval != PAPI_OK ) handle_error(1); + if (statfile.is_open()) { + 
statfile << "SpMM template time ref4: "; + statfile << timevalue.mean << std::endl; + } else { std::cout << " stat file is not open\n"; } + + + statfile << "\n-------- reference pattern computation\n"; + + retval = PAPI_hl_region_begin("ref"); + if ( retval != PAPI_OK ) handle_error(1); + TOOL_BENCHMARK_TIMER(ref.compute(statfile), "\n\nReference Kernel: ", timevalue); + retval = PAPI_hl_region_end("ref"); + if ( retval != PAPI_OK ) handle_error(1); + if (statfile.is_open()) { + statfile << "taco reference time: "; + statfile << timevalue << std::endl; + } else { std::cout << " stat file is not open\n"; } + + retval = PAPI_hl_region_begin("refnew"); + if ( retval != PAPI_OK ) handle_error(1); + TOOL_BENCHMARK_TIMER(refn.compute(statfile), "\n\nReference new Kernel: ", timevalue); + retval = PAPI_hl_region_end("refnew"); + if ( retval != PAPI_OK ) handle_error(1); + if (statfile.is_open()) { + statfile << "taco reference new time: "; + statfile << timevalue << std::endl; + } else { std::cout << " stat file is not open\n"; } + + + retval = PAPI_hl_region_begin("sparselnr"); + if ( retval != PAPI_OK ) handle_error(1); + TOOL_BENCHMARK_TIMER(A.compute(statfile), "\n\nFused Kernel: ", timevalue); + retval = PAPI_hl_region_end("sparselnr"); + if ( retval != PAPI_OK ) handle_error(1); + if (statfile.is_open()) { + statfile << "fused time: "; + statfile << timevalue.mean << std::endl; + } else { std::cout << " stat file is not open\n"; } + + + double* A_vals = (double*) (A.getTacoTensorT()->vals); + double* ref_vals = (double*) (ref.getTacoTensorT()->vals); + double* ref2_vals = (double*) (ref2.getTacoTensorT()->vals); + double* ref4_vals = (double*) (ref2.getTacoTensorT()->vals); + + // int* A2_pos = (double*) (ref.getTacoTensorT()->vals); + + // for (size_t q=0; q < B.getStorage().getValues().getSize(); q++) { + // if ( abs(A_vals[q] - ref_vals[q])/abs(ref_vals[q]) > ERROR_MARGIN) { + // std::cout << "error: results don't match i: " << q << ", avals: " << A_vals[q] << " 
" + // << "refvals: " << ref_vals[q] << std::endl; + // ASSERT_TRUE(false); + // } + // } + + for (int q=0; q < A.getDimension(0)* A.getDimension(1); q++) { + if ( abs(A_vals[q] - ref_vals[q])/abs(ref_vals[q]) > ERROR_MARGIN) { + std::cout << "error: results don't match i: " << q << ", avals: " << A_vals[q] << " " + << "refvals: " << ref_vals[q] << std::endl; + ASSERT_TRUE(false); + } + } + for (int q=0; q < A.getDimension(0)* A.getDimension(1); q++) { + if ( abs(A_vals[q] - ref2_vals[q])/abs(ref2_vals[q]) > ERROR_MARGIN) { + std::cout << "error: results don't match i: " << q << ", avals: " << A_vals[q] << " " + << "refvals: " << ref2_vals[q] << std::endl; + ASSERT_TRUE(false); + } + } + for (int q=0; q < A.getDimension(0)* A.getDimension(1); q++) { + if ( abs(A_vals[q] - ref4_vals[q])/abs(ref4_vals[q]) > ERROR_MARGIN) { + std::cout << "error: results don't match i: " << q << ", avals: " << A_vals[q] << " " + << "refvals: " << ref4_vals[q] << std::endl; + ASSERT_TRUE(false); + } + } + + } // end of file num for loop + + if (statfile.is_open()) { + statfile.close(); + } + + + // unsigned int native = 0x0; + + // retval = PAPI_library_init(PAPI_VER_CURRENT); + + // if (retval != PAPI_VER_CURRENT) { + // printf("PAPI library init error!\n"); + // exit(1); + // } else { + // printf("PAPI library init success\n"); + // } + + // if (PAPI_create_eventset(&EventSet) != PAPI_OK) { + // handle_error(1); + // } + + // /* Add the native event */ + // native = () + + retval = PAPI_hl_region_begin("computation1"); + if ( retval != PAPI_OK ) + handle_error(1); + + /* Do some computation */ + + retval = PAPI_hl_region_end("computation1"); + if ( retval != PAPI_OK ) + handle_error(1); + + retval = PAPI_hl_region_begin("computation2"); + if ( retval != PAPI_OK ) + handle_error(1); + + /* Do some computation */ + + retval = PAPI_hl_region_end("computation2"); + if ( retval != PAPI_OK ) + handle_error(1); +} + + + + + + +TEST(scheduling_eval, sddmmspmmFused) { + if 
(should_use_CUDA_codegen() || should_use_ISPC_codegen()) { + return; + } + + taco_set_num_threads(NUM_THREADS_TO_USE); + + ofstream statfile; + statfile.open( + "/home/min/a/kadhitha/workspace/my_taco/taco/test/stats/sddmm-spmm-gemm.txt", std::ios::app); + if (statfile.is_open()) { + statfile << "\nsddmm-spmm-gemm execution\n"; + statfile << "\n-----------------------------------------\n"; + } + + std::default_random_engine gen(0); + std::uniform_real_distribution unif(0.0, 1.0); + + Format csr({dense, sparse}); + Format rm({dense, dense}); + + int kdim = 64; + int ldim = 64; + int mdim = 64; + + vector filenums{2, 3,4,5,6,7,8,9,10,12,15}; + + for (auto filenum : filenums) { + + + std::vector matfiles = { + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/synthetic/synthetic.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rma10/rma10.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/cant/cant.mtx", // 5 + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/consph/consph.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/shipsec1/shipsec1.mtx", // 8 + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/scircuit/scircuit.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx", // 10 + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/wtk/pwtk.mtx", + "/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx", // 12 + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/wiki-Talk/wiki-Talk.mtx", // 13 + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/com-Orkut/com-Orkut.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx", // 15 + 
"/home/min/a/kadhitha/workspace/my_taco/FusedMM/dataset/harvard.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/twitter7/twitter7.mtx" + }; + std::vector matfilesrw = { + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/synthetic.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/cage3.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/bcsstk17.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/pdb1HYS.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/rma10.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/cant.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/consph.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/cop20k_A.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/shipsec1.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/scircuit.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/wtk/pwtk.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/webbase-1M.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/wiki-Talk.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/com-Orkut.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/circuit5M.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/harvard.mtx", + "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/twitter7.mtx" + }; + + std::string matfile = matfiles[filenum]; + std::cout << "reading B mat mtx\n"; + Tensor B = read(matfile, csr, true); + B.setName("B"); + B.pack(); + // write(matfilesrw[filenum], B); + + if (statfile.is_open()) { + statfile << matfile << std::endl; + } + + std::cout << "B dim0: " << B.getDimension(0) << ", dim1: " << B.getDimension(1) << std::endl; + std::cout << "adding c mat\n"; + Tensor C({B.getDimension(0), kdim}, rm); + for (int i = 0; i < C.getDimension(0); ++i) { + for (int 
j = 0; j < C.getDimension(1); ++j) { + C.insert({i,j}, unif(gen)); + } + } + std::cout << "packing C mat\n"; + C.pack(); + + Tensor D({B.getDimension(1), kdim}, rm); + for (int i = 0; i < D.getDimension(0); ++i) { + for (int j = 0; j < D.getDimension(1); ++j) { + D.insert({i,j}, unif(gen)); + } + } + std::cout << "packing D mat\n"; + D.pack(); + + Tensor F({B.getDimension(1), ldim}, rm); + for (int i = 0; i < F.getDimension(0); ++i) { + for (int j = 0; j < F.getDimension(1); ++j) { + F.insert({i,j}, unif(gen)); + } + } + std::cout << "packing F mat\n"; + F.pack(); + + Tensor G({ldim, mdim}, rm); + for (int i = 0; i < G.getDimension(0); ++i) { + for (int j = 0; j < G.getDimension(1); ++j) { + G.insert({i,j}, unif(gen)); + } + } + std::cout << "packing G mat\n"; + G.pack(); + + Tensor A({B.getDimension(0), mdim}, rm); + Tensor ref({B.getDimension(0), mdim}, rm); + IndexVar i, j, k, l, m; + IndexVar i0("i0"), i1("i1"), jpos("jpos"), jpos0("jpos0"), jpos1("jpos1"), k0("k0"), k1("k1"); + IndexVar l0("l0"), l1("l1"), m0("m0"), m1("m1"); + + A(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); + + if (statfile.is_open()) { + statfile + << "ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m);" << std::endl + << "B1_dimension: " << B.getDimension(0) << ", B2_dimension: " << B.getDimension(1) << ", vals: " << B.getStorage().getValues().getSize() << std::endl + << "C1_dimension: " << C.getDimension(0) << ", C2_dimension: " << C.getDimension(1) << ", vals: " << C.getStorage().getValues().getSize() << std::endl + << "D1_dimension: " << D.getDimension(0) << ", D2_dimension: " << D.getDimension(1) << ", vals: " << D.getStorage().getValues().getSize() << std::endl + << "E1_dimension: " << F.getDimension(0) << ", E2_dimension: " << F.getDimension(1) << ", vals: " << F.getStorage().getValues().getSize() << std::endl // NOTE(review): labelled E but reports tensor F; label kept so existing stats files stay comparable + << "G1_dimension: " << G.getDimension(0) << ", G2_dimension: " << G.getDimension(1) << ", vals: " << G.getStorage().getValues().getSize() << std::endl // both extents must come from G (ldim x mdim), not F + << std::endl; + } + + // 
IndexStmt stmt = A.getAssignment().concretize(); + IndexStmt stmt = makeReductionNotation(A.getAssignment()); + stmt = makeConcreteNotation(stmt); + stmt = reorderLoopsTopologically(stmt); + stmt = loopFusionOverFission(stmt, A.getAssignment(), "b", 2); + stmt = stmt.split(i, i0, i1, 16); + + stmt = insertTemporaries(stmt); + stmt = parallelizeOuterLoop(stmt); + printToFile("sddmmSpMMGeMM", stmt); + + A.compile(stmt); + A.assemble(); + + + ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); + IndexStmt refStmt = makeReductionNotation(ref.getAssignment()); + refStmt = makeConcreteNotation(refStmt); + refStmt = refStmt.split(i, i0, i1, 16); + refStmt = insertTemporaries(refStmt); + refStmt = parallelizeOuterLoop(refStmt); + ref.compile(refStmt); + ref.assemble(); + + Tensor ref1({B.getDimension(0), B.getDimension(1)}, csr); + Tensor ref2({B.getDimension(0), ldim}, rm); + Tensor ref3({B.getDimension(0), mdim}, rm); + ref1(i,j)=B(i,j)*C(i,k)*D(j,k); + ref2(i,l)=ref1(i,j)*F(j,l); + ref3(i,m)=ref2(i,l)*G(l,m); + + IndexStmt ref1Stmt = ref1.getAssignment().concretize(); + + ref1Stmt = ref1Stmt.split(i, i0, i1, 16); + // // .pos(j, jpos, B(i,j)); + // // .split(k, k0, k1, 8); + // // .reorder({i0, i1, jpos0, k, jpos1}); + // // .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces) + // // .parallelize(jpos1, ParallelUnit::CPUVector, OutputRaceStrategy::ParallelReduction); + // // ref1Stmt.split(i, ); + // // stmt = scheduleSDDMMCPU_forfuse(ref1Stmt, B); + // IndexStmt ref1Stmt = makeReductionNotation(ref1.getAssignment()); + // ref1Stmt = makeConcreteNotation(ref1Stmt); + ref1Stmt = insertTemporaries(ref1Stmt); + ref1Stmt = parallelizeOuterLoop(ref1Stmt); + ref1.compile(ref1Stmt); + ref1.assemble(); + + IndexStmt ref2Stmt = makeReductionNotation(ref2.getAssignment()); + ref2Stmt = makeConcreteNotation(ref2Stmt); + ref2Stmt = insertTemporaries(ref2Stmt); + ref2Stmt = parallelizeOuterLoop(ref2Stmt); + ref2.compile(ref2Stmt); + ref2.assemble(); + + // 
ref3(i,m)=ref2(i,l)*G(l,m); + IndexStmt ref3Stmt = makeReductionNotation(ref3.getAssignment()); + ref3Stmt = makeConcreteNotation(ref3Stmt); + ref3Stmt = ref3Stmt + .split(i, i0, i1, 32) + .split(l, l0, l1, 32) + .split(m, m0, m1, 32) + .reorder({i0, l0, m0, i1, l1, m1}); + ref3Stmt = insertTemporaries(ref3Stmt); + ref3Stmt = parallelizeOuterLoop(ref3Stmt); + ref3.compile(ref3Stmt); + ref3.assemble(); + + std::cout << "compute start\n"; + taco::util::TimeResults timevalue; + bool time = true; + + // std::string sofile_fused = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/fused_kernel.so"; + TOOL_BENCHMARK_TIMER(A.compute(statfile), "\n\nFused Kernel: ", timevalue); + if (statfile.is_open()) { + statfile << "fused time: "; + statfile << timevalue.mean << std::endl; + } else { std::cout << " stat file is not open\n"; } + + // std::string sofile_sddmm = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/csr_dense_spmm.so"; + std::string sofile_sddmm = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/csr_dense_dense_sddmm.so"; + TOOL_BENCHMARK_TIMER(ref1.compute(statfile, sofile_sddmm), "\n\nSDDMM Kernel: ", timevalue); + if (statfile.is_open()) { + statfile << "sddmm time: "; + statfile << timevalue.mean << std::endl; + } else { std::cout << " stat file is not open\n"; } + + std::string sofile_sddmm_ryan = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/sddmm_ryan.so"; + TOOL_BENCHMARK_TIMER(ref1.compute(statfile, sofile_sddmm_ryan), "\n\nSDDMM ryan Kernel: ", timevalue); + if (statfile.is_open()) { + statfile << "sddmm ryan time: "; + statfile << timevalue.mean << std::endl; + } else { std::cout << " stat file is not open\n"; } + + std::string sofile_spmm = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/csr_dense_spmm.so"; + TOOL_BENCHMARK_TIMER(ref2.compute(statfile, sofile_spmm), "\n\nSpMM ryan Kernel: ", timevalue); + if (statfile.is_open()) { + statfile << 
"spmm ryan time: "; + statfile << timevalue.mean << std::endl; + } else { std::cout << " stat file is not open\n"; } + + // std::string sofile_spmm = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/csr_dense_spmm.so"; + TOOL_BENCHMARK_TIMER(ref3.compute(statfile), "\n\nGeMM Kernel: ", timevalue); + if (statfile.is_open()) { + statfile << "gemm time: "; + statfile << timevalue.mean << std::endl; + } else { std::cout << " stat file is not open\n"; } + + // std::string sofile_original = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/taco_original.so"; + TOOL_BENCHMARK_TIMER(ref.compute(statfile), "\n\nReference Kernel: ", timevalue); + if (statfile.is_open()) { + statfile << "taco reference time: "; + statfile << timevalue << std::endl; + } else { std::cout << " stat file is not open\n"; } + + double* A_vals = (double*) (A.getTacoTensorT()->vals); + double* ref_vals = (double*) (ref.getTacoTensorT()->vals); + double* ref3_vals = (double*) (ref3.getTacoTensorT()->vals); + + // int* A2_pos = (double*) (ref.getTacoTensorT()->vals); + + for (int q=0; q < A.getDimension(0)* A.getDimension(1); q++) { + if ( abs(A_vals[q] - ref_vals[q])/abs(ref_vals[q]) > ERROR_MARGIN) { + std::cout << "error: results don't match i: " << q << ", avals: " << A_vals[q] << " " + << "refvals: " << ref_vals[q] << std::endl; + ASSERT_TRUE(false); + } + } + + for (int q=0; q < A.getDimension(0)* A.getDimension(1); q++) { + if ( abs(ref3_vals[q] - ref_vals[q])/abs(ref_vals[q]) > ERROR_MARGIN) { + std::cout << "error: results don't match i: " << q << ", avals: " << ref3_vals[q] << " " + << "refvals: " << ref_vals[q] << std::endl; + ASSERT_TRUE(false); + } + } + + + + } + + // int filenum = 3; + + + // for (size_t q=0; q < A.getDimension(0)* A.getDimension(1); q++) { + // if ( abs(A_vals[q] - ref3_vals[q])/abs(ref3_vals[q]) > ERROR_MARGIN) { + // std::cout << "error: results don't match i: " << q << ", avals: " << A_vals[q] << " " + // << "refvals: " << 
ref3_vals[q] << std::endl; + // ASSERT_TRUE(false); + // } + // } + // for (int q= 0; q< A_vals + // for (int q = 0; q < A.getDimension(0); ++q) { + // for (int w = 0; w < A.getDimension(1); ++w) { + // if ( abs(A(q,w) - ref(q,w))/abs(ref(q,w)) > ERROR_MARGIN) { + // std::cout << "error: results don't match A("<< q << "," << w << "): " + // << A(q,w) << ", ref: " << ref(q,w) << std::endl; + // ASSERT_TRUE(false); + // } + // } + // } + // ASSERT_TENSOR_EQ(A, ref); + + if (statfile.is_open()) { + statfile.close(); + } + +} \ No newline at end of file diff --git a/test/tests-scheduling-ispc-eval.cpp b/test/tests-scheduling-ispc-eval.cpp new file mode 100644 index 000000000..139597f9c --- /dev/null +++ b/test/tests-scheduling-ispc-eval.cpp @@ -0,0 +1,2 @@ + + diff --git a/test/tests-transformation.cpp b/test/tests-transformation.cpp index abfec3d45..9a472906f 100644 --- a/test/tests-transformation.cpp +++ b/test/tests-transformation.cpp @@ -255,6 +255,8 @@ INSTANTIATE_TEST_CASE_P(parallelize, apply, struct reorderLoopsTopologically : public TestWithParam {}; + +// TEST_P(reorderLoopsTopologically, test) { IndexStmt actual = taco::reorderLoopsTopologically(GetParam().actual); ASSERT_NOTATION_EQ(GetParam().expected, actual); diff --git a/test/util.h b/test/util.h new file mode 100644 index 000000000..f96087ba1 --- /dev/null +++ b/test/util.h @@ -0,0 +1,113 @@ +#ifndef __SCHEDULE_UTIL_HH__ +#define __SCHEDULE_UTIL_HH__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "taco/cuda.h" +#include "test.h" +#include "test_tensors.h" +#include "taco/tensor.h" +#include "taco/index_notation/index_notation.h" +#include "taco/index_notation/transformations.h" +#include "codegen/codegen.h" +#include "taco/lower/lower.h" +#include "taco/util/timers.h" + +using namespace taco; + +#define ERROR_MARGIN (1.0e-2) + +#define TOOL_BENCHMARK_TIMER(CODE,NAME,TIMER) { \ + if (time) { \ + taco::util::Timer timer; \ + timer.start(); \ + CODE; \ + 
timer.stop(); \ + taco::util::TimeResults result = timer.getResult(); \ + cout << NAME << " " << result << " ms" << endl; \ + TIMER=result; \ + } \ + else { \ + CODE; \ + } \ +} + +#define TOOL_BENCHMARK_TIMER2(CODE,NAME,TIMER) { \ + if (time) { \ + taco::util::Timer timer; \ + timer.start(); \ + CODE; \ + timer.stop(); \ + taco::util::TimeResults result = timer.getResult(); \ + if (statfile.is_open()) { \ + statfile << NAME << " " << result << " ms" << endl; \ + } else { \ + cout << NAME << " " << result << " ms" << endl; \ + } \ + TIMER=result; \ + } \ + else { \ + CODE; \ + } \ +} + +static void printToCout(IndexStmt stmt); +static void printToFile(string filename, IndexStmt stmt); +static void printToFile(string filename, string additional_filename, IndexStmt stmt); + + +static void printToCout(IndexStmt stmt) { + std::shared_ptr codegen = ir::CodeGen::init_default(cout, ir::CodeGen::ImplementationGen); + ir::Stmt compute = lower(stmt, "compute", false, true); + codegen->compile(compute, true); +} + +void printToFile(string filename, IndexStmt stmt) { + stringstream source; + + string file_path = "eval_generated/"; + mkdir(file_path.c_str(), 0777); + + std::shared_ptr codegen = ir::CodeGen::init_default(source, ir::CodeGen::ImplementationGen); + ir::Stmt compute = lower(stmt, "compute", false, true); + codegen->compile(compute, true); + + ofstream source_file; + string file_ending = should_use_CUDA_codegen() ? 
".cu" : ".c"; + source_file.open(file_path + filename + file_ending); + source_file << source.str(); + source_file.close(); +} + +void printToFile(string filename, string additional_filename, IndexStmt stmt) { + stringstream source1; + stringstream source2; + + string file_path = "eval_generated/"; + mkdir(file_path.c_str(), 0777); + + std::shared_ptr codegen = ir::CodeGen::init_default(source1, source2, ir::CodeGen::ImplementationGen); + ir::Stmt compute = lower(stmt, "compute", false, true); + codegen->compile(compute, true); + + ofstream source_file; + string file_ending = should_use_CUDA_codegen() ? ".cu" : ".c"; + source_file.open(file_path+filename+file_ending); + source_file << source1.str(); + source_file.close(); + + ofstream additional_source_file; + string additional_file_ending = ".ispc"; + additional_source_file.open(file_path+additional_filename+additional_file_ending); + additional_source_file << source2.str(); + additional_source_file.close(); + +} + +#endif // __SCHEDULE_UTIL_HH__ \ No newline at end of file diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt index 922f7e52e..41699d3fd 100644 --- a/tools/CMakeLists.txt +++ b/tools/CMakeLists.txt @@ -4,6 +4,7 @@ foreach(TOOL_SOURCE ${TOOL_SOURCES}) get_filename_component(TOOL ${TOOL_SOURCE} NAME_WE) add_executable("${TOOL}-tool" ${TOOL_SOURCE}) target_link_libraries("${TOOL}-tool" taco) + target_link_libraries("${TOOL}-tool" papi) target_include_directories("${TOOL}-tool" PRIVATE "${CMAKE_BINARY_DIR}/include") SET_TARGET_PROPERTIES("${TOOL}-tool" PROPERTIES OUTPUT_NAME ${TOOL}) install(TARGETS "${TOOL}-tool" DESTINATION bin) diff --git a/tools/taco.cpp b/tools/taco.cpp index bf7e7c9dc..7384874ec 100644 --- a/tools/taco.cpp +++ b/tools/taco.cpp @@ -9,6 +9,7 @@ #include "taco.h" #include "taco/error.h" +#include "taco/index_notation/index_notation.h" #include "taco/parser/lexer.h" #include "taco/parser/parser.h" #include "taco/parser/schedule_parser.h" @@ -313,7 +314,9 @@ static void 
printCommandLine(ostream& os, int argc, char* argv[]) { } } -static int setSchedulingCommands(vector> scheduleCommands, parser::Parser& parser, IndexStmt& stmt) { +static int setSchedulingCommands(vector> scheduleCommands, + parser::Parser& parser, IndexStmt& stmt, Assignment assignment) { + std::cout << "setting scheduling commands\n"; auto findVar = [&stmt](string name) { ProvenanceGraph graph(stmt); @@ -364,6 +367,16 @@ static int setSchedulingCommands(vector> scheduleCommands, parser IndexVar fused(f); stmt = stmt.fuse(findVar(i), findVar(j), fused); + } else if (command == "loopfuse") { + taco_uassert(scheduleCommand.size() == 2) + << "'loopfuse' scheduling directive takes 2 parameters: fuse(b, 2)"; + std::string side = scheduleCommand[0]; + taco_uassert(side == "b" || side == "f") + << "first parameter must be either 'f' or 'b'"; + + int iters = std::stoi(scheduleCommand[1]); + + stmt = loopFusionOverFission(stmt, assignment, side, iters); } else if (command == "split") { taco_uassert(scheduleCommand.size() == 4) << "'split' scheduling directive takes 4 parameters: split(i, i1, i2, splitFactor)"; @@ -1048,6 +1061,7 @@ int main(int argc, char* argv[]) { map loadedTensors; TensorBase temp_tensor; parser::Parser temp_parser(exprStr, formats, dataTypes, tensorsDimensions, loadedTensors, 42); + std::cout << exprStr << std::endl; try { temp_parser.parse(); temp_tensor = temp_parser.getResultTensor(); @@ -1148,19 +1162,27 @@ int main(int argc, char* argv[]) { taco_set_parallel_schedule(sched, chunkSize); taco_set_num_threads(nthreads); - IndexStmt stmt = - makeConcreteNotation(makeReductionNotation(tensor.getAssignment())); + Assignment assignment = tensor.getAssignment(); + std::cout << "tensor.getAssignment(): " << assignment << std::endl; + + IndexStmt stmt2 = makeReductionNotation(tensor.getAssignment()); + std::cout << "reducedNotation: " << stmt2 << std::endl; + // IndexStmt stmt = + // makeConcreteNotation(makeReductionNotation(tensor.getAssignment())); + 
IndexStmt stmt = makeConcreteNotation(stmt2); std::cout << "concrete index statement: " << stmt << std::endl; - stmt = justTraverseThroughTheIndexStmt(stmt); stmt = reorderLoopsTopologically(stmt); + std::cout << "topologically reordered loops statement: " << stmt << std::endl; if (setSchedule) { - int val = setSchedulingCommands(scheduleCommands, parser, stmt); + int val = setSchedulingCommands(scheduleCommands, parser, stmt, tensor.getAssignment()); + // stmt = loopFusionOverFission(stmt, tensor.getAssignment()); cuda |= (val==1); ispc |= (val==2); } else { + // stmt = loopFusionOverFission(stmt, tensor.getAssignment()); stmt = insertTemporaries(stmt); stmt = parallelizeOuterLoop(stmt); } @@ -1186,12 +1208,15 @@ int main(int argc, char* argv[]) { set_ISPC_codegen_enabled(false); } - std::cout << "running scalar promote\n" << std::endl; + std::cout << "running scalar promote\n" << std::endl; // stmt = scalarPromote(stmt); + std::cout << "\nafter scalar promote: \n" << stmt << std::endl << std::endl; + if (printConcrete) { cout << stmt << endl; } + // lower index statement to ir statement Kernel kernel; if (benchmark) { if (time) cout << endl; @@ -1278,6 +1303,11 @@ int main(int argc, char* argv[]) { compute = lower(stmt, prefix+"compute", computeWithAssemble, true); assemble = lower(stmt, prefix+"assemble", true, false); evaluate = lower(stmt, prefix+"evaluate", true, true); + + std::cout << "\n\ncompute kernel\n------------\n" << compute << std::endl << std::endl; + // compute kernel is the most basic kernel after lowering phase + + std::cout << "\n\nevaluate kernel\n------------\n" << evaluate << std::endl << std::endl; } string packComment = @@ -1411,7 +1441,7 @@ int main(int argc, char* argv[]) { } IterationGraph iterationGraph; - if (printIterationGraph) { + if (printIterationGraph) { // print iteration graph iterationGraph = IterationGraph::make(tensor.getAssignment()); } From 43d1bf7f03397c0445f74b7d78643313968e3d0a Mon Sep 17 00:00:00 2001 From: Adhhitha 
Dias Date: Tue, 10 May 2022 11:14:59 -0400 Subject: [PATCH 10/16] add results --- CMakeLists.txt | 2 +- test/stats/hadamard-gemm.txt | 172 +++++++ test/stats/sddmm-spmm-gemm.txt | 318 +++++++++++++ test/stats/sddmm-spmm.txt | 821 +++++++++++++++++++++++++++++++++ test/stats/spmm-spmm.txt | 172 +++++++ test/tests-scheduling-fuse.cpp | 239 +++++----- 6 files changed, 1613 insertions(+), 111 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index aff905db5..c9012ca2d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -16,7 +16,7 @@ option(OPENMP "Build with OpenMP execution support" ON) option(COVERAGE "Build with code coverage analysis" OFF) set(TACO_FEATURE_CUDA 0) set(TACO_FEATURE_ISPC 0) -set(TACO_FEATURE_OPENMP 0) +set(TACO_FEATURE_OPENMP 1) set(TACO_FEATURE_PYTHON 0) if(CUDA) message("-- Searching for CUDA Installation") diff --git a/test/stats/hadamard-gemm.txt b/test/stats/hadamard-gemm.txt index 7de96d3c5..6e730cf50 100644 --- a/test/stats/hadamard-gemm.txt +++ b/test/stats/hadamard-gemm.txt @@ -747,3 +747,175 @@ gemm time: 61505.4 kernel execution time: 245613 ms taco reference time: 245614 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 2708, B2_dimension: 2708, vals: 5429 +C1_dimension: 2708, C2_dimension: 128, vals: 346624 +D1_dimension: 2708, D2_dimension: 128, vals: 346624 +E1_dimension: 128, E2_dimension: 128, vals: 16384 + + +kernel execution time: 18.3809 ms +fused time: 19.1229 + +kernel execution time: 0.635828 ms +hadamard time: 0.983143 + +kernel execution time: 30.5122 ms +gemm time: 30.7819 + +kernel execution time: 23.6746 ms +taco reference time: 24.0784 +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/amazon.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 548551, B2_dimension: 548551, vals: 1851744 +C1_dimension: 548551, C2_dimension: 128, vals: 70214528 +D1_dimension: 
548551, D2_dimension: 128, vals: 70214528 +E1_dimension: 128, E2_dimension: 128, vals: 16384 + + +kernel execution time: 3580.2 ms +fused time: 3581 + +kernel execution time: 567.762 ms +hadamard time: 568.301 + +kernel execution time: 6079.96 ms +gemm time: 6080.46 + +kernel execution time: 8129.78 ms +taco reference time: 8130.38 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 2708, B2_dimension: 2708, vals: 5429 +C1_dimension: 2708, C2_dimension: 128, vals: 346624 +D1_dimension: 2708, D2_dimension: 128, vals: 346624 +E1_dimension: 128, E2_dimension: 128, vals: 16384 + + +kernel execution time: 18.4625 ms +fused time: 19.1824 + +kernel execution time: 0.520446 ms +hadamard time: 0.824011 + +kernel execution time: 30.2097 ms +gemm time: 30.46 + +kernel execution time: 23.4681 ms +taco reference time: 23.826 +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/amazon.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 548551, B2_dimension: 548551, vals: 1851744 +C1_dimension: 548551, C2_dimension: 128, vals: 70214528 +D1_dimension: 548551, D2_dimension: 128, vals: 70214528 +E1_dimension: 128, E2_dimension: 128, vals: 16384 + + +kernel execution time: 3528.39 ms +fused time: 3529.23 + +kernel execution time: 558.625 ms +hadamard time: 559.16 + +kernel execution time: 6157.3 ms +gemm time: 6158.14 + +kernel execution time: 8131.73 ms +taco reference time: 8132.69 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 2708, B2_dimension: 2708, vals: 5429 +C1_dimension: 2708, C2_dimension: 128, vals: 346624 +D1_dimension: 2708, D2_dimension: 128, vals: 346624 +E1_dimension: 128, E2_dimension: 128, vals: 16384 + + +kernel execution time: 2.27347 ms +fused time: 2.7115 + +kernel 
execution time: 0.180952 ms +hadamard time: 0.76318 + +kernel execution time: 2.72672 ms +gemm time: 3.22211 + +kernel execution time: 5.227 ms +taco reference time: 5.75632 +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/amazon.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 548551, B2_dimension: 548551, vals: 1851744 +C1_dimension: 548551, C2_dimension: 128, vals: 70214528 +D1_dimension: 548551, D2_dimension: 128, vals: 70214528 +E1_dimension: 128, E2_dimension: 128, vals: 16384 + + +kernel execution time: 164.815 ms +fused time: 165.539 + +kernel execution time: 96.629 ms +hadamard time: 97.303 + +kernel execution time: 202.068 ms +gemm time: 202.628 + +kernel execution time: 273.96 ms +taco reference time: 274.643 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 2708, B2_dimension: 2708, vals: 5429 +C1_dimension: 2708, C2_dimension: 128, vals: 346624 +D1_dimension: 2708, D2_dimension: 128, vals: 346624 +E1_dimension: 128, E2_dimension: 128, vals: 16384 + + +kernel execution time: 2.37004 ms +fused time: 3.11591 + +kernel execution time: 0.176612 ms +hadamard time: 0.833621 + +kernel execution time: 2.08823 ms +gemm time: 2.59022 + +kernel execution time: 3.36531 ms +taco reference time: 4.11087 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 2708, B2_dimension: 2708, vals: 5429 +C1_dimension: 2708, C2_dimension: 128, vals: 346624 +D1_dimension: 2708, D2_dimension: 128, vals: 346624 +E1_dimension: 128, E2_dimension: 128, vals: 16384 + + +kernel execution time: 19.3307 ms +fused time: 20.0662 + +kernel execution time: 0.496176 ms +hadamard time: 0.931803 + +kernel execution time: 30.1194 ms +gemm time: 30.3654 + +kernel execution time: 23.3946 ms +taco 
reference time: 23.7411 diff --git a/test/stats/sddmm-spmm-gemm.txt b/test/stats/sddmm-spmm-gemm.txt index 7bd2084ed..02665478f 100644 --- a/test/stats/sddmm-spmm-gemm.txt +++ b/test/stats/sddmm-spmm-gemm.txt @@ -1151,3 +1151,321 @@ gemm time: 421.26 kernel execution time: 326305 ms taco reference time: 326311 + +sddmm-spmm-gemm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx +ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); +B1_dimension: 2708, B2_dimension: 2708, vals: 5429 +C1_dimension: 2708, C2_dimension: 64, vals: 173312 +D1_dimension: 2708, D2_dimension: 64, vals: 173312 +E1_dimension: 2708, E2_dimension: 64, vals: 173312 +G1_dimension: 2708, G2_dimension: 64, vals: 4096 + + +kernel execution time: 5.08607 ms +fused time: 5.61989 + +kernel execution time: 0.557608 ms +sddmm time: 0.871642 + +kernel execution time: 0.465526 ms +sddmm ryan time: 0.7713 + +kernel execution time: 0.498686 ms +spmm ryan time: 0.739309 + +kernel execution time: 0.7957 ms +gemm time: 1.05919 + +kernel execution time: 42.447 ms +taco reference time: 42.885 +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/amazon.mtx +ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); +B1_dimension: 548551, B2_dimension: 548551, vals: 1851744 +C1_dimension: 548551, C2_dimension: 64, vals: 35107264 +D1_dimension: 548551, D2_dimension: 64, vals: 35107264 +E1_dimension: 548551, E2_dimension: 64, vals: 35107264 +G1_dimension: 548551, G2_dimension: 64, vals: 4096 + + +kernel execution time: 89.9099 ms +fused time: 90.5117 + +kernel execution time: 29.9086 ms +sddmm time: 30.4936 + +kernel execution time: 29.1529 ms +sddmm ryan time: 29.7063 + +kernel execution time: 34.6318 ms +spmm ryan time: 35.1535 + +kernel execution time: 66.4663 ms +gemm time: 67.0316 + +kernel execution time: 6272.25 ms +taco reference time: 6273.42 + +sddmm-spmm-gemm execution + +----------------------------------------- 
+/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx +ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); +B1_dimension: 2708, B2_dimension: 2708, vals: 5429 +C1_dimension: 2708, C2_dimension: 64, vals: 173312 +D1_dimension: 2708, D2_dimension: 64, vals: 173312 +E1_dimension: 2708, E2_dimension: 64, vals: 173312 +G1_dimension: 2708, G2_dimension: 64, vals: 4096 + + +kernel execution time: 3.72391 ms +fused time: 4.19698 + +kernel execution time: 0.585647 ms +sddmm time: 0.893112 + +kernel execution time: 0.483056 ms +sddmm ryan time: 0.79108 + +kernel execution time: 0.567518 ms +spmm ryan time: 0.808711 + +kernel execution time: 0.929183 ms +gemm time: 1.32543 + +kernel execution time: 35.7066 ms +taco reference time: 36.3331 +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/amazon.mtx +ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); +B1_dimension: 548551, B2_dimension: 548551, vals: 1851744 +C1_dimension: 548551, C2_dimension: 64, vals: 35107264 +D1_dimension: 548551, D2_dimension: 64, vals: 35107264 +E1_dimension: 548551, E2_dimension: 64, vals: 35107264 +G1_dimension: 548551, G2_dimension: 64, vals: 4096 + + +kernel execution time: 94.9377 ms +fused time: 95.7687 + +kernel execution time: 32.2051 ms +sddmm time: 32.7881 + +kernel execution time: 30.3982 ms +sddmm ryan time: 30.95 + +kernel execution time: 34.4172 ms +spmm ryan time: 34.9049 + +kernel execution time: 67.2709 ms +gemm time: 67.8035 + +kernel execution time: 6215.08 ms +taco reference time: 6216.26 + +sddmm-spmm-gemm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx +ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); +B1_dimension: 2708, B2_dimension: 2708, vals: 5429 +C1_dimension: 2708, C2_dimension: 64, vals: 173312 +D1_dimension: 2708, D2_dimension: 64, vals: 173312 +E1_dimension: 2708, E2_dimension: 64, vals: 173312 +G1_dimension: 2708, G2_dimension: 64, vals: 4096 + + +kernel execution time: 6.99173 ms +fused 
time: 7.86448 + +kernel execution time: 0.78061 ms +sddmm time: 1.28867 + +kernel execution time: 0.554227 ms +sddmm ryan time: 0.837111 + +kernel execution time: 0.909912 ms +spmm ryan time: 1.12908 + +kernel execution time: 7.60724 ms +gemm time: 7.85047 + +kernel execution time: 652.888 ms +taco reference time: 653.271 +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/amazon.mtx +ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); +B1_dimension: 548551, B2_dimension: 548551, vals: 1851744 +C1_dimension: 548551, C2_dimension: 64, vals: 35107264 +D1_dimension: 548551, D2_dimension: 64, vals: 35107264 +E1_dimension: 548551, E2_dimension: 64, vals: 35107264 +G1_dimension: 548551, G2_dimension: 64, vals: 4096 + + +kernel execution time: 1236.33 ms +fused time: 1236.87 + +kernel execution time: 249.805 ms +sddmm time: 250.356 + +kernel execution time: 247.195 ms +sddmm ryan time: 247.729 + +kernel execution time: 285.764 ms +spmm ryan time: 286.235 + +kernel execution time: 1529.34 ms +gemm time: 1529.83 + +kernel execution time: 190620 ms +taco reference time: 190621 + +sddmm-spmm-gemm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx +ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); +B1_dimension: 2708, B2_dimension: 2708, vals: 5429 +C1_dimension: 2708, C2_dimension: 64, vals: 173312 +D1_dimension: 2708, D2_dimension: 64, vals: 173312 +E1_dimension: 2708, E2_dimension: 64, vals: 173312 +G1_dimension: 2708, G2_dimension: 64, vals: 4096 + + +kernel execution time: 1.86163 ms +fused time: 2.34746 + +kernel execution time: 0.542927 ms +sddmm time: 1.05528 + +kernel execution time: 0.541998 ms +sddmm ryan time: 1.07672 + +kernel execution time: 0.524767 ms +spmm ryan time: 0.944293 + +kernel execution time: 0.75947 ms +gemm time: 1.2162 + +kernel execution time: 36.3755 ms +taco reference time: 37.0989 + +sddmm-spmm-gemm execution + +----------------------------------------- 
+/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx +ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); +B1_dimension: 2708, B2_dimension: 2708, vals: 5429 +C1_dimension: 2708, C2_dimension: 64, vals: 173312 +D1_dimension: 2708, D2_dimension: 64, vals: 173312 +E1_dimension: 2708, E2_dimension: 64, vals: 173312 +G1_dimension: 2708, G2_dimension: 64, vals: 4096 + + +kernel execution time: 1.97375 ms +fused time: 2.84436 + +kernel execution time: 0.881212 ms +sddmm time: 1.38907 + +kernel execution time: 0.545557 ms +sddmm ryan time: 1.0807 + +kernel execution time: 0.548488 ms +spmm ryan time: 0.978813 + +kernel execution time: 0.72955 ms +gemm time: 1.2023 + +kernel execution time: 34.867 ms +taco reference time: 35.5819 + +sddmm-spmm-gemm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx +ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); +B1_dimension: 2708, B2_dimension: 2708, vals: 5429 +C1_dimension: 2708, C2_dimension: 64, vals: 173312 +D1_dimension: 2708, D2_dimension: 64, vals: 173312 +E1_dimension: 2708, E2_dimension: 64, vals: 173312 +G1_dimension: 2708, G2_dimension: 64, vals: 4096 + + +kernel execution time: 1.69165 ms +fused time: 2.2114 + +kernel execution time: 0.908102 ms +sddmm time: 1.19792 + +kernel execution time: 0.513137 ms +sddmm ryan time: 0.807571 + +kernel execution time: 0.510327 ms +spmm ryan time: 0.76134 + +kernel execution time: 0.803101 ms +gemm time: 1.0684 + +kernel execution time: 45.9784 ms +taco reference time: 46.3901 + +sddmm-spmm-gemm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx +ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); +B1_dimension: 2708, B2_dimension: 2708, vals: 5429 +C1_dimension: 2708, C2_dimension: 64, vals: 173312 +D1_dimension: 2708, D2_dimension: 64, vals: 173312 +E1_dimension: 2708, E2_dimension: 64, vals: 173312 +G1_dimension: 2708, G2_dimension: 64, vals: 
4096 + + +kernel execution time: 1.82354 ms +fused time: 2.81223 + +kernel execution time: 0.926052 ms +sddmm time: 1.48292 + +kernel execution time: 0.564157 ms +sddmm ryan time: 1.14611 + +kernel execution time: 0.512447 ms +spmm ryan time: 0.925102 + +kernel execution time: 0.689109 ms +gemm time: 1.08196 + +kernel execution time: 34.7847 ms +taco reference time: 35.4182 + +sddmm-spmm-gemm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx +ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); +B1_dimension: 2708, B2_dimension: 2708, vals: 5429 +C1_dimension: 2708, C2_dimension: 64, vals: 173312 +D1_dimension: 2708, D2_dimension: 64, vals: 173312 +E1_dimension: 2708, E2_dimension: 64, vals: 173312 +G1_dimension: 2708, G2_dimension: 64, vals: 4096 + + +kernel execution time: 6.8174 ms +fused time: 7.69061 + +kernel execution time: 0.935843 ms +sddmm time: 1.46847 + +kernel execution time: 0.612468 ms +sddmm ryan time: 0.880662 + +kernel execution time: 0.831351 ms +spmm ryan time: 1.05745 + +kernel execution time: 7.58342 ms +gemm time: 7.82297 + +kernel execution time: 566.881 ms +taco reference time: 567.264 diff --git a/test/stats/sddmm-spmm.txt b/test/stats/sddmm-spmm.txt index cc1713e9f..df8d924b8 100644 --- a/test/stats/sddmm-spmm.txt +++ b/test/stats/sddmm-spmm.txt @@ -5172,3 +5172,824 @@ separate execution kernel execution time: 4107.02 ms sddmm time: 4122.77 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 5, B2_dimension: 5, vals: 19 +C1_dimension: 5, C2_dimension: 128, vals: 640 +D1_dimension: 5, D2_dimension: 128, vals: 640 +E1_dimension: 5, E2_dimension: 128, vals: 640 + + +kernel execution time: 0.115981 ms +fused time: 0.499507 + +separate execution + +sddmm-spmm execution + +----------------------------------------- 
+/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 5, B2_dimension: 5, vals: 19 +C1_dimension: 5, C2_dimension: 128, vals: 640 +D1_dimension: 5, D2_dimension: 128, vals: 640 +E1_dimension: 5, E2_dimension: 128, vals: 640 + + +kernel execution time: 0.133052 ms +fused time: 3.69599 + +separate execution + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 2700, B2_dimension: 2700, vals: 5400 +C1_dimension: 2700, C2_dimension: 128, vals: 345600 +D1_dimension: 2700, D2_dimension: 128, vals: 345600 +E1_dimension: 2700, E2_dimension: 128, vals: 345600 + + +kernel execution time: 0.606469 ms +fused time: 4.32552 + +separate execution + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 2700, B2_dimension: 2700, vals: 5400 +C1_dimension: 2700, C2_dimension: 128, vals: 345600 +D1_dimension: 2700, D2_dimension: 128, vals: 345600 +E1_dimension: 2700, E2_dimension: 128, vals: 345600 + + +kernel execution time: 0.650529 ms +fused time: 1.40893 + +separate execution + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 2708, B2_dimension: 2708, vals: 5400 +C1_dimension: 2708, C2_dimension: 128, vals: 346624 +D1_dimension: 2708, D2_dimension: 128, vals: 346624 +E1_dimension: 2708, E2_dimension: 128, vals: 346624 + + +kernel execution time: 0.620999 ms +fused time: 1.38301 + +separate execution + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 2708, 
B2_dimension: 2708, vals: 5400 +C1_dimension: 2708, C2_dimension: 128, vals: 346624 +D1_dimension: 2708, D2_dimension: 128, vals: 346624 +E1_dimension: 2708, E2_dimension: 128, vals: 346624 + + +kernel execution time: 0.652959 ms +fused time: 3.94184 + +separate execution + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 2708, B2_dimension: 2708, vals: 5429 +C1_dimension: 2708, C2_dimension: 128, vals: 346624 +D1_dimension: 2708, D2_dimension: 128, vals: 346624 +E1_dimension: 2708, E2_dimension: 128, vals: 346624 + + +kernel execution time: 0.597158 ms +fused time: 4.27836 + +separate execution + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 2708, B2_dimension: 2708, vals: 5429 +C1_dimension: 2708, C2_dimension: 128, vals: 346624 +D1_dimension: 2708, D2_dimension: 128, vals: 346624 +E1_dimension: 2708, E2_dimension: 128, vals: 346624 + + +kernel execution time: 0.659809 ms +fused time: 4.6484 + +separate execution + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 2708, B2_dimension: 2708, vals: 5429 +C1_dimension: 2708, C2_dimension: 128, vals: 346624 +D1_dimension: 2708, D2_dimension: 128, vals: 346624 +E1_dimension: 2708, E2_dimension: 128, vals: 346624 + + +kernel execution time: 0.591018 ms +fused time: 2.44084 + +separate execution + +kernel execution time: 0.607388 ms +sddmm time: 0.891202 + +kernel execution time: 0.857981 ms +sddmm time: 1.16087 + +kernel execution time: 0.922992 ms +spmm time: 1.60378 + +reference execution + +kernel execution time: 4.47191 ms +taco reference time: 5.26226 + +sddmm-spmm execution + 
+----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 2708, B2_dimension: 2708, vals: 5429 +C1_dimension: 2708, C2_dimension: 128, vals: 346624 +D1_dimension: 2708, D2_dimension: 128, vals: 346624 +E1_dimension: 2708, E2_dimension: 128, vals: 346624 + + +kernel execution time: 0.658879 ms +fused time: 4.15402 + +separate execution + +kernel execution time: 0.70888 ms +sddmm time: 1.21343 + +kernel execution time: 0.531398 ms +sddmm time: 1.30729 + +kernel execution time: 0.965464 ms +spmm time: 2.35378 + +reference execution + +kernel execution time: 3.48771 ms +taco reference time: 7.55141 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 2708, B2_dimension: 2708, vals: 5429 +C1_dimension: 2708, C2_dimension: 128, vals: 346624 +D1_dimension: 2708, D2_dimension: 128, vals: 346624 +E1_dimension: 2708, E2_dimension: 128, vals: 346624 + + +kernel execution time: 0.616739 ms +fused time: 4.4146 + +separate execution + +kernel execution time: 0.556318 ms +sddmm time: 3.03196 + +kernel execution time: 0.945623 ms +sddmm time: 1.89019 + +kernel execution time: 0.777471 ms +spmm time: 3.57728 + +reference execution + +kernel execution time: 3.22827 ms +taco reference time: 7.39799 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 2708, B2_dimension: 2708, vals: 5429 +C1_dimension: 2708, C2_dimension: 128, vals: 346624 +D1_dimension: 2708, D2_dimension: 128, vals: 346624 +E1_dimension: 2708, E2_dimension: 128, vals: 346624 + + +kernel execution time: 0.65531 ms +fused time: 4.08374 + +separate execution + +kernel execution time: 0.666219 ms +sddmm time: 1.20641 + +kernel execution 
time: 0.941573 ms +sddmm time: 1.73185 + +kernel execution time: 1.01493 ms +spmm time: 1.75608 + +reference execution + +kernel execution time: 5.25507 ms +taco reference time: 6.04624 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 2708, B2_dimension: 2708, vals: 5429 +C1_dimension: 2708, C2_dimension: 128, vals: 346624 +D1_dimension: 2708, D2_dimension: 128, vals: 346624 +E1_dimension: 2708, E2_dimension: 128, vals: 346624 + + +kernel execution time: 0.670959 ms +fused time: 1.50328 + +separate execution + +kernel execution time: 0.600268 ms +sddmm time: 1.32833 + +kernel execution time: 0.476237 ms +sddmm time: 0.792151 + +kernel execution time: 0.781091 ms +spmm time: 1.10271 + +reference execution + +kernel execution time: 3.07623 ms +taco reference time: 3.53829 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 2708, B2_dimension: 2708, vals: 5429 +C1_dimension: 2708, C2_dimension: 128, vals: 346624 +D1_dimension: 2708, D2_dimension: 128, vals: 346624 +E1_dimension: 2708, E2_dimension: 128, vals: 346624 + + +kernel execution time: 0.760541 ms +fused time: 1.49073 + +separate execution + +kernel execution time: 0.639829 ms +sddmm time: 1.21327 + +kernel execution time: 0.576218 ms +sddmm time: 1.14083 + +kernel execution time: 0.829512 ms +spmm time: 1.33624 + +reference execution + +kernel execution time: 4.14591 ms +taco reference time: 4.82508 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 2708, B2_dimension: 2708, vals: 5429 +C1_dimension: 2708, C2_dimension: 128, vals: 346624 +D1_dimension: 2708, D2_dimension: 128, vals: 346624 
+E1_dimension: 2708, E2_dimension: 128, vals: 346624 + + +kernel execution time: 0.638949 ms +fused time: 1.02277 + +separate execution + +kernel execution time: 0.945034 ms +sddmm time: 1.20456 + +kernel execution time: 0.6772 ms +sddmm time: 0.943263 + +kernel execution time: 0.888033 ms +spmm time: 1.133 + +reference execution + +kernel execution time: 3.82989 ms +taco reference time: 4.18452 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 2708, B2_dimension: 2708, vals: 5429 +C1_dimension: 2708, C2_dimension: 128, vals: 346624 +D1_dimension: 2708, D2_dimension: 128, vals: 346624 +E1_dimension: 2708, E2_dimension: 128, vals: 346624 + + +kernel execution time: 0.7361 ms +fused time: 1.45315 + +separate execution + +kernel execution time: 0.7335 ms +sddmm time: 1.25184 + +kernel execution time: 0.642509 ms +sddmm time: 1.16064 + +kernel execution time: 1.02361 ms +spmm time: 1.48614 + +reference execution + +kernel execution time: 4.12035 ms +taco reference time: 4.75857 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/amazon.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 334863, B2_dimension: 334863, vals: 777323 +C1_dimension: 334863, C2_dimension: 128, vals: 42862464 +D1_dimension: 334863, D2_dimension: 128, vals: 42862464 +E1_dimension: 334863, E2_dimension: 128, vals: 42862464 + + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/amazon.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 548551, B2_dimension: 548551, vals: 925872 +C1_dimension: 548551, C2_dimension: 128, vals: 70214528 +D1_dimension: 548551, D2_dimension: 128, vals: 70214528 +E1_dimension: 548551, E2_dimension: 128, vals: 70214528 + + +kernel execution time: 66.4595 ms +fused 
time: 66.9196 + +separate execution + +kernel execution time: 22.9317 ms +sddmm time: 23.4738 + +kernel execution time: 22.4453 ms +sddmm time: 23.0045 + +kernel execution time: 44.2796 ms +spmm time: 44.8052 + +reference execution + +kernel execution time: 187.6 ms +taco reference time: 188.247 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/amazon.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 548551, B2_dimension: 548551, vals: 1851744 +C1_dimension: 548551, C2_dimension: 128, vals: 70214528 +D1_dimension: 548551, D2_dimension: 128, vals: 70214528 +E1_dimension: 548551, E2_dimension: 128, vals: 70214528 + + +kernel execution time: 103.551 ms +fused time: 104.018 + +separate execution + +kernel execution time: 39.9535 ms +sddmm time: 40.5639 + +kernel execution time: 39.2683 ms +sddmm time: 39.8581 + +kernel execution time: 65.8336 ms +spmm time: 66.417 + +reference execution + +kernel execution time: 306.901 ms +taco reference time: 307.61 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/amazon.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 548551, B2_dimension: 548551, vals: 1851744 +C1_dimension: 548551, C2_dimension: 128, vals: 70214528 +D1_dimension: 548551, D2_dimension: 128, vals: 70214528 +E1_dimension: 548551, E2_dimension: 128, vals: 70214528 + + +kernel execution time: 106.782 ms +fused time: 107.261 + +separate execution + +kernel execution time: 40.7961 ms +sddmm time: 41.3604 + +kernel execution time: 39.8676 ms +sddmm time: 40.4959 + +kernel execution time: 66.2656 ms +spmm time: 66.8105 + +reference execution + +kernel execution time: 367.416 ms +taco reference time: 368.086 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/amazon.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 
548551, B2_dimension: 548551, vals: 1851744 +C1_dimension: 548551, C2_dimension: 128, vals: 70214528 +D1_dimension: 548551, D2_dimension: 128, vals: 70214528 +E1_dimension: 548551, E2_dimension: 128, vals: 70214528 + + +kernel execution time: 108.809 ms +fused time: 109.274 + +separate execution + +kernel execution time: 42.2311 ms +sddmm time: 42.826 + +kernel execution time: 41.711 ms +sddmm time: 42.3721 + +kernel execution time: 65.9512 ms +spmm time: 66.5647 + +reference execution + +kernel execution time: 360.581 ms +taco reference time: 361.225 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/amazon.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 548551, B2_dimension: 548551, vals: 1851744 +C1_dimension: 548551, C2_dimension: 128, vals: 70214528 +D1_dimension: 548551, D2_dimension: 128, vals: 70214528 +E1_dimension: 548551, E2_dimension: 128, vals: 70214528 + + +kernel execution time: 922.149 ms +fused time: 922.605 + +separate execution + +kernel execution time: 392.18 ms +sddmm time: 392.716 + +kernel execution time: 393.251 ms +sddmm time: 393.777 + +kernel execution time: 520.496 ms +spmm time: 521.007 + +reference execution + +kernel execution time: 9912.29 ms +taco reference time: 9913.37 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 2708, B2_dimension: 2708, vals: 5429 +C1_dimension: 2708, C2_dimension: 128, vals: 346624 +D1_dimension: 2708, D2_dimension: 128, vals: 346624 +E1_dimension: 2708, E2_dimension: 128, vals: 346624 + + +kernel execution time: 2.15935 ms +fused time: 2.88765 + +separate execution + +kernel execution time: 1.09729 ms +sddmm time: 1.64867 + +kernel execution time: 0.987463 ms +sddmm time: 1.50853 + +kernel execution time: 2.22996 ms +spmm time: 2.71273 + +reference execution + +kernel execution 
time: 29.4617 ms +taco reference time: 29.8511 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 2708, B2_dimension: 2708, vals: 5429 +C1_dimension: 2708, C2_dimension: 128, vals: 346624 +D1_dimension: 2708, D2_dimension: 128, vals: 346624 +E1_dimension: 2708, E2_dimension: 128, vals: 346624 + + +kernel execution time: 0.667108 ms +fused time: 1.05163 + +separate execution + +kernel execution time: 0.680159 ms +sddmm time: 0.994963 + +kernel execution time: 0.611478 ms +sddmm time: 1.1057 + +kernel execution time: 0.988313 ms +spmm time: 1.4939 + +reference execution + +kernel execution time: 3.64386 ms +taco reference time: 4.33446 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 2708, B2_dimension: 2708, vals: 5429 +C1_dimension: 2708, C2_dimension: 128, vals: 346624 +D1_dimension: 2708, D2_dimension: 128, vals: 346624 +E1_dimension: 2708, E2_dimension: 128, vals: 346624 + + +kernel execution time: 0.691709 ms +fused time: 1.07767 + +separate execution + +kernel execution time: 0.516997 ms +sddmm time: 0.77957 + +kernel execution time: 0.458366 ms +sddmm time: 0.73026 + +kernel execution time: 0.777811 ms +spmm time: 1.01678 + +reference execution + +kernel execution time: 3.47463 ms +taco reference time: 3.82426 +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/amazon.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 548551, B2_dimension: 548551, vals: 1851744 +C1_dimension: 548551, C2_dimension: 128, vals: 70214528 +D1_dimension: 548551, D2_dimension: 128, vals: 70214528 +E1_dimension: 548551, E2_dimension: 128, vals: 70214528 + + +kernel execution time: 104.681 ms +fused time: 105.128 + +separate execution + +kernel execution time: 39.5478 ms +sddmm time: 40.1164 
+ +kernel execution time: 40.2068 ms +sddmm time: 40.7802 + +kernel execution time: 67.2769 ms +spmm time: 67.8666 + +reference execution + +kernel execution time: 378.806 ms +taco reference time: 379.526 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 2708, B2_dimension: 2708, vals: 5429 +C1_dimension: 2708, C2_dimension: 128, vals: 346624 +D1_dimension: 2708, D2_dimension: 128, vals: 346624 +E1_dimension: 2708, E2_dimension: 128, vals: 346624 + + +kernel execution time: 2.0421 ms +fused time: 2.77318 + +separate execution + +kernel execution time: 0.890922 ms +sddmm time: 1.4406 + +kernel execution time: 0.673509 ms +sddmm time: 0.955103 + +kernel execution time: 1.93153 ms +spmm time: 2.18341 + +reference execution + +kernel execution time: 33.2851 ms +taco reference time: 33.6343 +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/amazon.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 548551, B2_dimension: 548551, vals: 1851744 +C1_dimension: 548551, C2_dimension: 128, vals: 70214528 +D1_dimension: 548551, D2_dimension: 128, vals: 70214528 +E1_dimension: 548551, E2_dimension: 128, vals: 70214528 + + +kernel execution time: 913.728 ms +fused time: 914.178 + +separate execution + +kernel execution time: 389.744 ms +sddmm time: 390.317 + +kernel execution time: 389.105 ms +sddmm time: 389.68 + +kernel execution time: 520.43 ms +spmm time: 520.979 + +reference execution + +kernel execution time: 9970.19 ms +taco reference time: 9971.18 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 2708, B2_dimension: 2708, vals: 5429 +C1_dimension: 2708, C2_dimension: 128, vals: 346624 +D1_dimension: 2708, D2_dimension: 128, vals: 346624 +E1_dimension: 2708, E2_dimension: 128, 
vals: 346624 + + +kernel execution time: 1.81249 ms +fused time: 2.53831 + +separate execution + +kernel execution time: 1.41327 ms +sddmm time: 1.9866 + +kernel execution time: 0.687839 ms +sddmm time: 0.957583 + +kernel execution time: 1.99132 ms +spmm time: 2.2301 + +reference execution + +kernel execution time: 33.8389 ms +taco reference time: 34.1855 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 2708, B2_dimension: 2708, vals: 5429 +C1_dimension: 2708, C2_dimension: 128, vals: 346624 +D1_dimension: 2708, D2_dimension: 128, vals: 346624 +E1_dimension: 2708, E2_dimension: 128, vals: 346624 + + +kernel execution time: 2.08639 ms +fused time: 2.81403 + +separate execution + +kernel execution time: 0.75901 ms +sddmm time: 1.27309 + +kernel execution time: 0.72208 ms +sddmm time: 1.00494 + +kernel execution time: 1.95748 ms +spmm time: 2.20503 + +reference execution + +kernel execution time: 33.4827 ms +taco reference time: 33.8347 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 2708, B2_dimension: 2708, vals: 5429 +C1_dimension: 2708, C2_dimension: 128, vals: 346624 +D1_dimension: 2708, D2_dimension: 128, vals: 346624 +E1_dimension: 2708, E2_dimension: 128, vals: 346624 + + +kernel execution time: 2.09414 ms +fused time: 2.82691 + +separate execution + +kernel execution time: 1.03623 ms +sddmm time: 1.58316 + +kernel execution time: 0.653819 ms +sddmm time: 0.926463 + +kernel execution time: 1.88145 ms +spmm time: 2.12517 + +reference execution + +kernel execution time: 33.3395 ms +taco reference time: 33.6915 + +sddmm-spmm execution + +----------------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx 
+ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); +B1_dimension: 2708, B2_dimension: 2708, vals: 5429 +C1_dimension: 2708, C2_dimension: 128, vals: 346624 +D1_dimension: 2708, D2_dimension: 128, vals: 346624 +E1_dimension: 2708, E2_dimension: 128, vals: 346624 + + +kernel execution time: 1.70968 ms +fused time: 2.43176 + +separate execution + +kernel execution time: 0.76455 ms +sddmm time: 1.31209 + +kernel execution time: 0.664099 ms +sddmm time: 0.932353 + +kernel execution time: 1.92536 ms +spmm time: 2.17072 + +reference execution + +kernel execution time: 32.5601 ms +taco reference time: 32.9017 diff --git a/test/stats/spmm-spmm.txt b/test/stats/spmm-spmm.txt index 2cc71e519..329aacd65 100644 --- a/test/stats/spmm-spmm.txt +++ b/test/stats/spmm-spmm.txt @@ -3430,3 +3430,175 @@ spmm-spmm execution ----------------------------------------- filenum: 3 --------------------------------- + +spmm-spmm execution + +----------------------------------------- +filenum: 0 +--------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 2708, B2_dimension: 2708, vals: 5429 +C1_dimension: 2708, C2_dimension: 128, vals: 346624 +D1_dimension: 128, D2_dimension: 64, vals: 8192 + + +--------- 1st pattern computation TTM, GEMM + +kernel execution time: 0.924512 ms +SpMM time: 1.22967 + +kernel execution time: 1.23287 ms +SpMM template time: 1.51353 + +kernel execution time: 20.7805 ms +GeMM time: 21.0769 + +kernel execution time: 19.6116 ms +ref 2 GeMM template time: 19.8379 + +--------- 2nd pattern computation GEMM, SpMM + +kernel execution time: 14.7563 ms +ref3 GeMM template time: 15.0245 + +kernel execution time: 0.823641 ms +SpMM template time ref4: 1.05233 + +-------- reference pattern computation + +kernel execution time: 34.1041 ms +taco reference time: 34.4607 + +kernel execution time: 41.9195 ms +taco reference new time: 42.2061 + +kernel execution time: 4.76242 ms +fused time: 5.04101 +filenum: 
1 +--------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/amazon.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 548551, B2_dimension: 548551, vals: 1851744 +C1_dimension: 548551, C2_dimension: 128, vals: 70214528 +D1_dimension: 128, D2_dimension: 64, vals: 8192 + + +--------- 1st pattern computation TTM, GEMM + +kernel execution time: 394.8 ms +SpMM time: 395.503 + +kernel execution time: 473.148 ms +SpMM template time: 473.684 + +kernel execution time: 4117.68 ms +GeMM time: 4118.6 + +kernel execution time: 3957.31 ms +ref 2 GeMM template time: 3958.16 + +--------- 2nd pattern computation GEMM, SpMM + +kernel execution time: 3017.13 ms +ref3 GeMM template time: 3017.67 + +kernel execution time: 314.652 ms +SpMM template time ref4: 315.164 + +-------- reference pattern computation + +kernel execution time: 11644.6 ms +taco reference time: 11645.6 + +kernel execution time: 14402.6 ms +taco reference new time: 14403.6 + +kernel execution time: 1261.33 ms +fused time: 1261.88 + +spmm-spmm execution + +----------------------------------------- +filenum: 0 +--------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 2708, B2_dimension: 2708, vals: 5429 +C1_dimension: 2708, C2_dimension: 128, vals: 346624 +D1_dimension: 128, D2_dimension: 64, vals: 8192 + + +--------- 1st pattern computation TTM, GEMM + +kernel execution time: 0.209133 ms +SpMM time: 0.517016 + +kernel execution time: 0.579748 ms +SpMM template time: 0.864251 + +kernel execution time: 1.0574 ms +GeMM time: 1.37727 + +kernel execution time: 19.621 ms +ref 2 GeMM template time: 19.8504 + +--------- 2nd pattern computation GEMM, SpMM + +kernel execution time: 1.44618 ms +ref3 GeMM template time: 1.72243 + +kernel execution time: 0.384425 ms +SpMM template time ref4: 0.610708 + +-------- reference pattern computation + +kernel execution time: 3.59893 ms +taco reference time: 
3.95508 + +kernel execution time: 4.81855 ms +taco reference new time: 5.10349 + +kernel execution time: 1.47107 ms +fused time: 1.90463 +filenum: 1 +--------------------------------- +/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/amazon.mtx +ref(i,l)=B(i,j)*C(i,k)*D(j,k); +B1_dimension: 548551, B2_dimension: 548551, vals: 1851744 +C1_dimension: 548551, C2_dimension: 128, vals: 70214528 +D1_dimension: 128, D2_dimension: 64, vals: 8192 + + +--------- 1st pattern computation TTM, GEMM + +kernel execution time: 50.1795 ms +SpMM time: 50.5567 + +kernel execution time: 64.2504 ms +SpMM template time: 64.8179 + +kernel execution time: 96.8464 ms +GeMM time: 97.4123 + +kernel execution time: 3949.87 ms +ref 2 GeMM template time: 3950.93 + +--------- 2nd pattern computation GEMM, SpMM + +kernel execution time: 123.802 ms +ref3 GeMM template time: 124.342 + +kernel execution time: 39.2723 ms +SpMM template time ref4: 39.8322 + +-------- reference pattern computation + +kernel execution time: 457.271 ms +taco reference time: 457.979 + +kernel execution time: 427.194 ms +taco reference new time: 427.789 + +kernel execution time: 93.1417 ms +fused time: 93.7188 diff --git a/test/tests-scheduling-fuse.cpp b/test/tests-scheduling-fuse.cpp index bd77f1d64..41fb86f6f 100644 --- a/test/tests-scheduling-fuse.cpp +++ b/test/tests-scheduling-fuse.cpp @@ -7,8 +7,8 @@ #include #include -// #define NUM_THREADS_TO_USE 64 -#define NUM_THREADS_TO_USE 32 +#define NUM_THREADS_TO_USE 1 +// #define NUM_THREADS_TO_USE 32 void handle_error (int retval) { @@ -518,13 +518,15 @@ TEST(scheduling_eval, sddmmFused) { // vector filenums = {2,3,4,5,6,7,8,9,10,12,15}; - vector filenums = {1}; + vector filenums = {0}; for (auto filenum : filenums) { // int filenum = 5; std::vector matfiles = { + "/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx", + "/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/amazon.mtx", 
"/home/min/a/kadhitha/ispc-examples/data/suitesparse/synthetic/synthetic.mtx", "/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx", "/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx", @@ -545,6 +547,8 @@ TEST(scheduling_eval, sddmmFused) { "/home/min/a/kadhitha/ispc-examples/data/suitesparse/twitter7/twitter7.mtx" }; std::vector matfilesrw = { + "/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/rw/cora.mtx", + "/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/rw/amazon.mtx", "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/synthetic.mtx", "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/cage3.mtx", "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/bcsstk17.mtx", @@ -688,66 +692,66 @@ TEST(scheduling_eval, sddmmFused) { statfile << "\nseparate execution\n"; - // // std::string sofile_sddmm = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/csr_dense_spmm.so"; - // std::string sofile_sddmm = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/csr_dense_dense_sddmm.so"; - // TOOL_BENCHMARK_TIMER(ref1.compute(statfile, sofile_sddmm), "\n\nSDDMM Kernel: ", timevalue); - // if (statfile.is_open()) { - // statfile << "sddmm time: "; - // statfile << timevalue.mean << std::endl; - // } else { std::cout << " stat file is not open\n"; } + // std::string sofile_sddmm = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/csr_dense_spmm.so"; + std::string sofile_sddmm = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/csr_dense_dense_sddmm.so"; + TOOL_BENCHMARK_TIMER(ref1.compute(statfile, sofile_sddmm), "\n\nSDDMM Kernel: ", timevalue); + if (statfile.is_open()) { + statfile << "sddmm time: "; + statfile << timevalue.mean << std::endl; + } else { std::cout << " stat file is not open\n"; } - // std::string sofile_sddmm_ryan = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/sddmm_ryan.so"; - // 
TOOL_BENCHMARK_TIMER(ref1.compute(statfile, sofile_sddmm_ryan), "\n\nSDDMM Kernel: ", timevalue); - // if (statfile.is_open()) { - // statfile << "sddmm time: "; - // statfile << timevalue.mean << std::endl; - // } else { std::cout << " stat file is not open\n"; } + std::string sofile_sddmm_ryan = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/sddmm_ryan.so"; + TOOL_BENCHMARK_TIMER(ref1.compute(statfile, sofile_sddmm_ryan), "\n\nSDDMM Kernel: ", timevalue); + if (statfile.is_open()) { + statfile << "sddmm time: "; + statfile << timevalue.mean << std::endl; + } else { std::cout << " stat file is not open\n"; } - // std::string sofile_spmm = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/csr_dense_spmm.so"; - // TOOL_BENCHMARK_TIMER(ref2.compute(statfile, sofile_spmm), "\n\nSpMM Kernel: ", timevalue); - // if (statfile.is_open()) { - // statfile << "spmm time: "; - // statfile << timevalue.mean << std::endl; - // } else { std::cout << " stat file is not open\n"; } + std::string sofile_spmm = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/csr_dense_spmm.so"; + TOOL_BENCHMARK_TIMER(ref2.compute(statfile, sofile_spmm), "\n\nSpMM Kernel: ", timevalue); + if (statfile.is_open()) { + statfile << "spmm time: "; + statfile << timevalue.mean << std::endl; + } else { std::cout << " stat file is not open\n"; } - // statfile << "\nreference execution \n"; + statfile << "\nreference execution \n"; - // std::string sofile_original = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/taco_original.so"; - // TOOL_BENCHMARK_TIMER(ref.compute(statfile, sofile_original), "\n\nReference Kernel: ", timevalue); - // if (statfile.is_open()) { - // statfile << "taco reference time: "; - // statfile << timevalue << std::endl; - // } else { std::cout << " stat file is not open\n"; } - - // double* A_vals = (double*) (A.getTacoTensorT()->vals); - // double* ref_vals = (double*) (ref.getTacoTensorT()->vals); - 
// double* ref2_vals = (double*) (ref2.getTacoTensorT()->vals); + std::string sofile_original = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/taco_original.so"; + TOOL_BENCHMARK_TIMER(ref.compute(statfile, sofile_original), "\n\nReference Kernel: ", timevalue); + if (statfile.is_open()) { + statfile << "taco reference time: "; + statfile << timevalue << std::endl; + } else { std::cout << " stat file is not open\n"; } - // // int* A2_pos = (double*) (ref.getTacoTensorT()->vals); + double* A_vals = (double*) (A.getTacoTensorT()->vals); + double* ref_vals = (double*) (ref.getTacoTensorT()->vals); + double* ref2_vals = (double*) (ref2.getTacoTensorT()->vals); - // // for (size_t q=0; q < B.getStorage().getValues().getSize(); q++) { - // // if ( abs(A_vals[q] - ref_vals[q])/abs(ref_vals[q]) > ERROR_MARGIN) { - // // std::cout << "error: results don't match i: " << q << ", avals: " << A_vals[q] << " " - // // << "refvals: " << ref_vals[q] << std::endl; - // // ASSERT_TRUE(false); - // // } - // // } + // int* A2_pos = (double*) (ref.getTacoTensorT()->vals); - // for (size_t q=0; q < A.getDimension(0)* A.getDimension(1); q++) { + // for (size_t q=0; q < B.getStorage().getValues().getSize(); q++) { // if ( abs(A_vals[q] - ref_vals[q])/abs(ref_vals[q]) > ERROR_MARGIN) { // std::cout << "error: results don't match i: " << q << ", avals: " << A_vals[q] << " " // << "refvals: " << ref_vals[q] << std::endl; // ASSERT_TRUE(false); // } // } - // for (size_t q=0; q < A.getDimension(0)* A.getDimension(1); q++) { - // if ( abs(A_vals[q] - ref2_vals[q])/abs(ref2_vals[q]) > ERROR_MARGIN) { - // std::cout << "error: results don't match i: " << q << ", avals: " << A_vals[q] << " " - // << "refvals: " << ref2_vals[q] << std::endl; - // ASSERT_TRUE(false); - // } - // } - // for (int q= 0; q< A_vals + + for (size_t q=0; q < A.getDimension(0)* A.getDimension(1); q++) { + if ( abs(A_vals[q] - ref_vals[q])/abs(ref_vals[q]) > ERROR_MARGIN) { + std::cout << "error: 
results don't match i: " << q << ", avals: " << A_vals[q] << " " + << "refvals: " << ref_vals[q] << std::endl; + ASSERT_TRUE(false); + } + } + for (size_t q=0; q < A.getDimension(0)* A.getDimension(1); q++) { + if ( abs(A_vals[q] - ref2_vals[q])/abs(ref2_vals[q]) > ERROR_MARGIN) { + std::cout << "error: results don't match i: " << q << ", avals: " << A_vals[q] << " " + << "refvals: " << ref2_vals[q] << std::endl; + ASSERT_TRUE(false); + } + } + // // for (int q= 0; q< A_vals // for (int q = 0; q < A.getDimension(0); ++q) { // for (int w = 0; w < A.getDimension(1); ++w) { // if ( abs(A(q,w) - ref(q,w))/abs(ref(q,w)) > ERROR_MARGIN) { @@ -775,6 +779,8 @@ TEST(scheduling_eval, hadamardFused) { return; } + taco_set_num_threads(NUM_THREADS_TO_USE); + ofstream statfile; statfile.open( "/home/min/a/kadhitha/workspace/my_taco/taco/test/stats/hadamard-gemm.txt", std::ios::app); @@ -791,14 +797,16 @@ TEST(scheduling_eval, hadamardFused) { int kdim = 128; int ldim = 128; - vector filenums = {2,3,4,5,6,7,8,9,10,12,15}; - // vector filenums = {8,9,10,12}; + // vector filenums = {2,3,4,5,6,7,8,9,10,12,15}; + vector filenums = {0}; for (auto filenum : filenums) { // int filenum = 15; std::vector matfiles = { + "/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx", + "/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/amazon.mtx", "/home/min/a/kadhitha/ispc-examples/data/suitesparse/synthetic/synthetic.mtx", "/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx", "/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx", // 2 @@ -819,6 +827,8 @@ TEST(scheduling_eval, hadamardFused) { "/home/min/a/kadhitha/ispc-examples/data/suitesparse/twitter7/twitter7.mtx" }; std::vector matfilesrw = { + "/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/rw/cora.mtx", + "/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/rw/amazon.mtx", "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/synthetic.mtx", 
"/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/cage3.mtx", "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/bcsstk17.mtx", @@ -2121,20 +2131,20 @@ TEST(scheduling_eval, spmmFused) { return; } - int retval, EventSet = PAPI_NULL; - retval = PAPI_hl_region_begin("dummy"); - if ( retval != PAPI_OK ) handle_error(1); + // int retval, EventSet = PAPI_NULL; + // retval = PAPI_hl_region_begin("dummy"); + // if ( retval != PAPI_OK ) handle_error(1); /* Do some computation */ - retval = PAPI_hl_region_end("dummy"); - if ( retval != PAPI_OK ) handle_error(1); + // retval = PAPI_hl_region_end("dummy"); + // if ( retval != PAPI_OK ) handle_error(1); taco_set_num_threads(NUM_THREADS_TO_USE); ofstream statfile; statfile.open( - "/home/min/a/kadhitha/workspace/my_taco/taco/test/stats/spmm-spmm.txt", std::ios::app); + "/home/min/a/kadhitha/workspace/my_taco/taco/test/stats/spmm-gemm.txt", std::ios::app); if (statfile.is_open()) { statfile << "\nspmm-spmm execution\n"; statfile << "\n-----------------------------------------\n"; @@ -2149,7 +2159,7 @@ TEST(scheduling_eval, spmmFused) { int ldim = 64; // vector filenums = {2,3,4,5,6,7,8,9,10,12,15}; - vector filenums = {3}; + vector filenums = {0}; for (auto filenum : filenums) { @@ -2159,6 +2169,8 @@ TEST(scheduling_eval, spmmFused) { // int filenum = 7; std::vector matfiles = { + "/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx", + "/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/amazon.mtx", "/home/min/a/kadhitha/ispc-examples/data/suitesparse/synthetic/synthetic.mtx", "/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx", "/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx", // 2 @@ -2180,6 +2192,8 @@ TEST(scheduling_eval, spmmFused) { "/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k.mtx", }; std::vector matfilesrw = { + "/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/rw/cora.mtx", + 
"/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/rw/amazon.mtx", "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/synthetic.mtx", "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/cage3.mtx", "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/bcsstk17.mtx", @@ -2377,42 +2391,42 @@ TEST(scheduling_eval, spmmFused) { statfile << "\n--------- 1st pattern computation TTM, GEMM\n"; - retval = PAPI_hl_region_begin("spmm"); - if ( retval != PAPI_OK ) handle_error(1); + // retval = PAPI_hl_region_begin("spmm"); + // if ( retval != PAPI_OK ) handle_error(1); TOOL_BENCHMARK_TIMER(ref1.compute(statfile), "\n\nSpMM Kernel: ", timevalue); - retval = PAPI_hl_region_end("spmm"); - if ( retval != PAPI_OK ) handle_error(1); + // retval = PAPI_hl_region_end("spmm"); + // if ( retval != PAPI_OK ) handle_error(1); if (statfile.is_open()) { statfile << "SpMM time: "; statfile << timevalue.mean << std::endl; } else { std::cout << " stat file is not open\n"; } std::string sofile_spmm_template = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/csr_dense_spmm.so"; - retval = PAPI_hl_region_begin("spmmtemplate"); - if ( retval != PAPI_OK ) handle_error(1); + // retval = PAPI_hl_region_begin("spmmtemplate"); + // if ( retval != PAPI_OK ) handle_error(1); TOOL_BENCHMARK_TIMER(ref1.compute(statfile, sofile_spmm_template), "\n\nSpMM template Kernel: ", timevalue); - retval = PAPI_hl_region_end("spmmtemplate"); - if ( retval != PAPI_OK ) handle_error(1); + // retval = PAPI_hl_region_end("spmmtemplate"); + // if ( retval != PAPI_OK ) handle_error(1); if (statfile.is_open()) { statfile << "SpMM template time: "; statfile << timevalue.mean << std::endl; } else { std::cout << " stat file is not open\n"; } - retval = PAPI_hl_region_begin("gemm"); - if ( retval != PAPI_OK ) handle_error(1); + // retval = PAPI_hl_region_begin("gemm"); + // if ( retval != PAPI_OK ) handle_error(1); TOOL_BENCHMARK_TIMER(ref2.compute(statfile), "\n\nGeMM Kernel: ", 
timevalue); - retval = PAPI_hl_region_end("gemm"); - if ( retval != PAPI_OK ) handle_error(1); + // retval = PAPI_hl_region_end("gemm"); + // if ( retval != PAPI_OK ) handle_error(1); if (statfile.is_open()) { statfile << "GeMM time: "; statfile << timevalue.mean << std::endl; } else { std::cout << " stat file is not open\n"; } - retval = PAPI_hl_region_begin("gemmtemplate"); - if ( retval != PAPI_OK ) handle_error(1); + // retval = PAPI_hl_region_begin("gemmtemplate"); + // if ( retval != PAPI_OK ) handle_error(1); TOOL_BENCHMARK_TIMER(ref2_2.compute(statfile), "\n\nref GeMM template Kernel: ", timevalue); - retval = PAPI_hl_region_end("gemmtemplate"); - if ( retval != PAPI_OK ) handle_error(1); + // retval = PAPI_hl_region_end("gemmtemplate"); + // if ( retval != PAPI_OK ) handle_error(1); if (statfile.is_open()) { statfile << "ref 2 GeMM template time: "; statfile << timevalue.mean << std::endl; @@ -2420,21 +2434,21 @@ TEST(scheduling_eval, spmmFused) { // std::string sofile_gemm_template = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/spmm_gemm/spmm_template.so"; statfile << "\n--------- 2nd pattern computation GEMM, SpMM\n"; - retval = PAPI_hl_region_begin("gemmtemplate2"); - if ( retval != PAPI_OK ) handle_error(1); + // retval = PAPI_hl_region_begin("gemmtemplate2"); + // if ( retval != PAPI_OK ) handle_error(1); TOOL_BENCHMARK_TIMER(ref3.compute(statfile), "\n\nGeMM template ref3 Kernel: ", timevalue); - retval = PAPI_hl_region_end("gemmtemplate2"); - if ( retval != PAPI_OK ) handle_error(1); + // retval = PAPI_hl_region_end("gemmtemplate2"); + // if ( retval != PAPI_OK ) handle_error(1); if (statfile.is_open()) { statfile << "ref3 GeMM template time: "; statfile << timevalue.mean << std::endl; } else { std::cout << " stat file is not open\n"; } - retval = PAPI_hl_region_begin("spmm2"); - if ( retval != PAPI_OK ) handle_error(1); + // retval = PAPI_hl_region_begin("spmm2"); + // if ( retval != PAPI_OK ) handle_error(1); 
TOOL_BENCHMARK_TIMER(ref4.compute(statfile, sofile_spmm_template), "\n\nSpMM template Kernel ref4: ", timevalue); - retval = PAPI_hl_region_end("spmm2"); - if ( retval != PAPI_OK ) handle_error(1); + // retval = PAPI_hl_region_end("spmm2"); + // if ( retval != PAPI_OK ) handle_error(1); if (statfile.is_open()) { statfile << "SpMM template time ref4: "; statfile << timevalue.mean << std::endl; @@ -2443,32 +2457,32 @@ TEST(scheduling_eval, spmmFused) { statfile << "\n-------- reference pattern computation\n"; - retval = PAPI_hl_region_begin("ref"); - if ( retval != PAPI_OK ) handle_error(1); + // retval = PAPI_hl_region_begin("ref"); + // if ( retval != PAPI_OK ) handle_error(1); TOOL_BENCHMARK_TIMER(ref.compute(statfile), "\n\nReference Kernel: ", timevalue); - retval = PAPI_hl_region_end("ref"); - if ( retval != PAPI_OK ) handle_error(1); + // retval = PAPI_hl_region_end("ref"); + // if ( retval != PAPI_OK ) handle_error(1); if (statfile.is_open()) { statfile << "taco reference time: "; statfile << timevalue << std::endl; } else { std::cout << " stat file is not open\n"; } - retval = PAPI_hl_region_begin("refnew"); - if ( retval != PAPI_OK ) handle_error(1); + // retval = PAPI_hl_region_begin("refnew"); + // if ( retval != PAPI_OK ) handle_error(1); TOOL_BENCHMARK_TIMER(refn.compute(statfile), "\n\nReference new Kernel: ", timevalue); - retval = PAPI_hl_region_end("refnew"); - if ( retval != PAPI_OK ) handle_error(1); + // retval = PAPI_hl_region_end("refnew"); + // if ( retval != PAPI_OK ) handle_error(1); if (statfile.is_open()) { statfile << "taco reference new time: "; statfile << timevalue << std::endl; } else { std::cout << " stat file is not open\n"; } - retval = PAPI_hl_region_begin("sparselnr"); - if ( retval != PAPI_OK ) handle_error(1); + // retval = PAPI_hl_region_begin("sparselnr"); + // if ( retval != PAPI_OK ) handle_error(1); TOOL_BENCHMARK_TIMER(A.compute(statfile), "\n\nFused Kernel: ", timevalue); - retval = PAPI_hl_region_end("sparselnr"); - if 
( retval != PAPI_OK ) handle_error(1); + // retval = PAPI_hl_region_end("sparselnr"); + // if ( retval != PAPI_OK ) handle_error(1); if (statfile.is_open()) { statfile << "fused time: "; statfile << timevalue.mean << std::endl; @@ -2537,25 +2551,25 @@ TEST(scheduling_eval, spmmFused) { // /* Add the native event */ // native = () - retval = PAPI_hl_region_begin("computation1"); - if ( retval != PAPI_OK ) - handle_error(1); + // retval = PAPI_hl_region_begin("computation1"); + // if ( retval != PAPI_OK ) + // handle_error(1); - /* Do some computation */ + // /* Do some computation */ - retval = PAPI_hl_region_end("computation1"); - if ( retval != PAPI_OK ) - handle_error(1); + // retval = PAPI_hl_region_end("computation1"); + // if ( retval != PAPI_OK ) + // handle_error(1); - retval = PAPI_hl_region_begin("computation2"); - if ( retval != PAPI_OK ) - handle_error(1); + // retval = PAPI_hl_region_begin("computation2"); + // if ( retval != PAPI_OK ) + // handle_error(1); - /* Do some computation */ + // /* Do some computation */ - retval = PAPI_hl_region_end("computation2"); - if ( retval != PAPI_OK ) - handle_error(1); + // retval = PAPI_hl_region_end("computation2"); + // if ( retval != PAPI_OK ) + // handle_error(1); } @@ -2588,12 +2602,15 @@ TEST(scheduling_eval, sddmmspmmFused) { int ldim = 64; int mdim = 64; - vector filenums{2, 3,4,5,6,7,8,9,10,12,15}; + // vector filenums{2, 3,4,5,6,7,8,9,10,12,15}; + vector filenums{0}; for (auto filenum : filenums) { std::vector matfiles = { + "/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx", + "/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/amazon.mtx", "/home/min/a/kadhitha/ispc-examples/data/suitesparse/synthetic/synthetic.mtx", "/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx", "/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx", @@ -2614,6 +2631,8 @@ TEST(scheduling_eval, sddmmspmmFused) { 
"/home/min/a/kadhitha/ispc-examples/data/suitesparse/twitter7/twitter7.mtx" }; std::vector matfilesrw = { + "/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/rw/cora.mtx", + "/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/rw/amazon.mtx", "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/synthetic.mtx", "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/cage3.mtx", "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/bcsstk17.mtx", From ea26c08f7bf42c38859bc70681268fd7369dcbe9 Mon Sep 17 00:00:00 2001 From: Adhhitha Dias Date: Wed, 11 May 2022 13:33:27 -0400 Subject: [PATCH 11/16] task: remove stat files and papi --- test/stats/hadamard-gemm.txt | 921 ---- test/stats/mttkrp-spmm.txt | 1090 ----- test/stats/sddmm-spmm-gemm.txt | 1471 ------- test/stats/sddmm-spmm.txt | 5995 --------------------------- test/stats/spmm-spmm.txt | 3604 ---------------- test/stats/spmv-spmv.txt | 81 - test/stats/ttm-ttm.txt | 2924 ------------- test/tests-scheduling-fuse.cpp | 143 +- test/tests-scheduling-ispc-eval.cpp | 2 - 9 files changed, 16 insertions(+), 16215 deletions(-) delete mode 100644 test/stats/hadamard-gemm.txt delete mode 100644 test/stats/mttkrp-spmm.txt delete mode 100644 test/stats/sddmm-spmm-gemm.txt delete mode 100644 test/stats/sddmm-spmm.txt delete mode 100644 test/stats/spmm-spmm.txt delete mode 100644 test/stats/spmv-spmv.txt delete mode 100644 test/stats/ttm-ttm.txt delete mode 100644 test/tests-scheduling-ispc-eval.cpp diff --git a/test/stats/hadamard-gemm.txt b/test/stats/hadamard-gemm.txt deleted file mode 100644 index 6e730cf50..000000000 --- a/test/stats/hadamard-gemm.txt +++ /dev/null @@ -1,921 +0,0 @@ - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 -C1_dimension: 36417, C2_dimension: 64, vals: 2330688 -D1_dimension: 36417, 
D2_dimension: 64, vals: 2330688 -E1_dimension: 64, E2_dimension: 64, vals: 4096 - - -kernel execution time: 22.4288 ms -fused time: 23.1383 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 -C1_dimension: 36417, C2_dimension: 64, vals: 2330688 -D1_dimension: 36417, D2_dimension: 64, vals: 2330688 -E1_dimension: 64, E2_dimension: 64, vals: 4096 - - -kernel execution time: 8.99985 ms -fused time: 9.71943 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 -C1_dimension: 36417, C2_dimension: 64, vals: 2330688 -D1_dimension: 36417, D2_dimension: 64, vals: 2330688 -E1_dimension: 64, E2_dimension: 64, vals: 4096 - - -kernel execution time: 8.65832 ms -fused time: 9.33544 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 -C1_dimension: 36417, C2_dimension: 64, vals: 2330688 -D1_dimension: 36417, D2_dimension: 64, vals: 2330688 -E1_dimension: 64, E2_dimension: 64, vals: 4096 - - -kernel execution time: 21.7432 ms -fused time: 22.466 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 206500, B2_dimension: 206500, vals: 1273389 -C1_dimension: 206500, C2_dimension: 64, vals: 13216000 -D1_dimension: 206500, D2_dimension: 64, vals: 13216000 -E1_dimension: 64, E2_dimension: 64, vals: 4096 - - -kernel execution time: 25.8057 ms -fused time: 26.4891 - -sddmm-spmm 
execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 206500, B2_dimension: 206500, vals: 1273389 -C1_dimension: 206500, C2_dimension: 64, vals: 13216000 -D1_dimension: 206500, D2_dimension: 64, vals: 13216000 -E1_dimension: 64, E2_dimension: 64, vals: 4096 - - -kernel execution time: 26.7972 ms -fused time: 27.2892 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 206500, B2_dimension: 206500, vals: 1273389 -C1_dimension: 206500, C2_dimension: 64, vals: 13216000 -D1_dimension: 206500, D2_dimension: 64, vals: 13216000 -E1_dimension: 64, E2_dimension: 64, vals: 4096 - - -kernel execution time: 46.4376 ms -fused time: 47.1315 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 206500, B2_dimension: 206500, vals: 1273389 -C1_dimension: 206500, C2_dimension: 64, vals: 13216000 -D1_dimension: 206500, D2_dimension: 64, vals: 13216000 -E1_dimension: 64, E2_dimension: 64, vals: 4096 - - -kernel execution time: 26.8781 ms -fused time: 27.4325 - -kernel execution time: 61.7475 ms -taco reference time: 62.3899 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 206500, B2_dimension: 206500, vals: 1273389 -C1_dimension: 206500, C2_dimension: 64, vals: 13216000 -D1_dimension: 206500, D2_dimension: 64, vals: 13216000 -E1_dimension: 64, E2_dimension: 64, vals: 4096 - - -kernel execution time: 25.4837 ms -fused time: 25.9563 - -kernel execution time: 15.5567 ms 
-sddmm time: 16.2101 - -kernel execution time: 73.7443 ms -taco reference time: 74.42 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 206500, B2_dimension: 206500, vals: 1273389 -C1_dimension: 206500, C2_dimension: 64, vals: 13216000 -D1_dimension: 206500, D2_dimension: 64, vals: 13216000 -E1_dimension: 64, E2_dimension: 64, vals: 4096 - - -kernel execution time: 24.5312 ms -fused time: 25.0641 - -kernel execution time: 14.7877 ms -hadamard time: 15.4539 - -kernel execution time: 18.149 ms -gemm time: 18.7191 - -kernel execution time: 73.8142 ms -taco reference time: 74.4567 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 206500, B2_dimension: 206500, vals: 1273389 -C1_dimension: 206500, C2_dimension: 128, vals: 26432000 -D1_dimension: 206500, D2_dimension: 128, vals: 26432000 -E1_dimension: 128, E2_dimension: 64, vals: 8192 - - -kernel execution time: 36.5794 ms -fused time: 37.1963 - -kernel execution time: 31.9277 ms -hadamard time: 32.6108 - -kernel execution time: 28.0947 ms -gemm time: 28.7572 - -kernel execution time: 203.157 ms -taco reference time: 203.921 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 206500, B2_dimension: 206500, vals: 1273389 -C1_dimension: 206500, C2_dimension: 128, vals: 26432000 -D1_dimension: 206500, D2_dimension: 128, vals: 26432000 -E1_dimension: 128, E2_dimension: 128, vals: 16384 - - -kernel execution time: 42.4207 ms -fused time: 42.9584 - -kernel execution time: 31.1526 ms -hadamard time: 31.8623 - -kernel execution time: 62.6041 ms -gemm 
time: 63.199 - -kernel execution time: 416.714 ms -taco reference time: 417.403 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 5558326, B2_dimension: 5558326, vals: 59524291 -C1_dimension: 5558326, C2_dimension: 128, vals: 711465728 -D1_dimension: 5558326, D2_dimension: 128, vals: 711465728 -E1_dimension: 128, E2_dimension: 128, vals: 16384 - - -kernel execution time: 1265.12 ms -fused time: 1266.15 - -kernel execution time: 4815.82 ms -hadamard time: 4816.95 - -kernel execution time: 1478.77 ms -gemm time: 1479.51 - -kernel execution time: 63618.8 ms -taco reference time: 63619.9 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 10974, B2_dimension: 10974, vals: 428650 -C1_dimension: 10974, C2_dimension: 128, vals: 1404672 -D1_dimension: 10974, D2_dimension: 128, vals: 1404672 -E1_dimension: 128, E2_dimension: 128, vals: 16384 - - -kernel execution time: 4.44366 ms -fused time: 5.30002 - -kernel execution time: 1.60353 ms -hadamard time: 2.06029 - -kernel execution time: 4.56709 ms -gemm time: 4.9084 - -kernel execution time: 52.2837 ms -taco reference time: 52.7156 -/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 -C1_dimension: 36417, C2_dimension: 128, vals: 4661376 -D1_dimension: 36417, D2_dimension: 128, vals: 4661376 -E1_dimension: 128, E2_dimension: 128, vals: 16384 - - -kernel execution time: 13.0806 ms -fused time: 13.6544 - -kernel execution time: 12.1216 ms -hadamard time: 12.8046 - -kernel 
execution time: 11.8732 ms -gemm time: 12.47 - -kernel execution time: 477.422 ms -taco reference time: 477.987 -/home/min/a/kadhitha/ispc-examples/data/suitesparse/rma10/rma10.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 46835, B2_dimension: 46835, vals: 2374001 -C1_dimension: 46835, C2_dimension: 128, vals: 5994880 -D1_dimension: 46835, D2_dimension: 128, vals: 5994880 -E1_dimension: 128, E2_dimension: 128, vals: 16384 - - -kernel execution time: 13.6475 ms -fused time: 14.2071 - -kernel execution time: 12.1816 ms -hadamard time: 12.8468 - -kernel execution time: 14.7018 ms -gemm time: 15.233 - -kernel execution time: 251.649 ms -taco reference time: 252.229 -/home/min/a/kadhitha/ispc-examples/data/suitesparse/cant/cant.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 62451, B2_dimension: 62451, vals: 4007383 -C1_dimension: 62451, C2_dimension: 128, vals: 7993728 -D1_dimension: 62451, D2_dimension: 128, vals: 7993728 -E1_dimension: 128, E2_dimension: 128, vals: 16384 - - -kernel execution time: 20.2137 ms -fused time: 20.7037 - -kernel execution time: 19.6828 ms -hadamard time: 20.2722 - -kernel execution time: 18.5323 ms -gemm time: 19.0234 - -kernel execution time: 415.255 ms -taco reference time: 415.805 -/home/min/a/kadhitha/ispc-examples/data/suitesparse/consph/consph.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 83334, B2_dimension: 83334, vals: 6010480 -C1_dimension: 83334, C2_dimension: 128, vals: 10666752 -D1_dimension: 83334, D2_dimension: 128, vals: 10666752 -E1_dimension: 128, E2_dimension: 128, vals: 16384 - - -kernel execution time: 28.1295 ms -fused time: 28.6289 - -kernel execution time: 28.2393 ms -hadamard time: 28.8514 - -kernel execution time: 24.2246 ms -gemm time: 24.7551 - -kernel execution time: 597.455 ms -taco reference time: 598.049 -/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 121192, B2_dimension: 121192, vals: 2624331 
-C1_dimension: 121192, C2_dimension: 128, vals: 15512576 -D1_dimension: 121192, D2_dimension: 128, vals: 15512576 -E1_dimension: 128, E2_dimension: 128, vals: 16384 - - -kernel execution time: 49.6444 ms -fused time: 50.1899 - -kernel execution time: 45.97 ms -hadamard time: 46.6381 - -kernel execution time: 33.5119 ms -gemm time: 34.0815 - -kernel execution time: 258.507 ms -taco reference time: 259.153 -/home/min/a/kadhitha/ispc-examples/data/suitesparse/shipsec1/shipsec1.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 140874, B2_dimension: 140874, vals: 7813404 -C1_dimension: 140874, C2_dimension: 128, vals: 18031872 -D1_dimension: 140874, D2_dimension: 128, vals: 18031872 -E1_dimension: 128, E2_dimension: 128, vals: 16384 - - -kernel execution time: 42.1499 ms -fused time: 42.7069 - -kernel execution time: 41.9158 ms -hadamard time: 42.597 - -kernel execution time: 37.5761 ms -gemm time: 38.1603 - -kernel execution time: 748.178 ms -taco reference time: 748.913 -/home/min/a/kadhitha/ispc-examples/data/suitesparse/scircuit/scircuit.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 170998, B2_dimension: 170998, vals: 958936 -C1_dimension: 170998, C2_dimension: 128, vals: 21887744 -D1_dimension: 170998, D2_dimension: 128, vals: 21887744 -E1_dimension: 128, E2_dimension: 128, vals: 16384 - - -kernel execution time: 32.0664 ms -fused time: 32.5614 - -kernel execution time: 27.8304 ms -hadamard time: 28.5102 - -kernel execution time: 45.5743 ms -gemm time: 46.1921 - -kernel execution time: 97.9936 ms -taco reference time: 98.6611 -/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 206500, B2_dimension: 206500, vals: 1273389 -C1_dimension: 206500, C2_dimension: 128, vals: 26432000 -D1_dimension: 206500, D2_dimension: 128, vals: 26432000 -E1_dimension: 128, E2_dimension: 128, vals: 16384 - - -kernel execution time: 42.0101 ms -fused time: 42.5555 - -kernel 
execution time: 38.2596 ms -hadamard time: 38.9704 - -kernel execution time: 55.2502 ms -gemm time: 55.8132 - -kernel execution time: 128.93 ms -taco reference time: 129.615 -/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 1000005, B2_dimension: 1000005, vals: 3105536 -C1_dimension: 1000005, C2_dimension: 128, vals: 128000640 -D1_dimension: 1000005, D2_dimension: 128, vals: 128000640 -E1_dimension: 128, E2_dimension: 128, vals: 16384 - - -kernel execution time: 156.672 ms -fused time: 157.149 - -kernel execution time: 108.579 ms -hadamard time: 109.187 - -kernel execution time: 266.855 ms -gemm time: 267.343 - -kernel execution time: 325.2 ms -taco reference time: 325.907 -/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 5558326, B2_dimension: 5558326, vals: 59524291 -C1_dimension: 5558326, C2_dimension: 128, vals: 711465728 -D1_dimension: 5558326, D2_dimension: 128, vals: 711465728 -E1_dimension: 128, E2_dimension: 128, vals: 16384 - - -kernel execution time: 1267.69 ms -fused time: 1268.78 - -kernel execution time: 1173.34 ms -hadamard time: 1174.13 - -kernel execution time: 1502.45 ms -gemm time: 1503.33 - -kernel execution time: 12918.1 ms -taco reference time: 12919.5 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/shipsec1/shipsec1.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 140874, B2_dimension: 140874, vals: 7813404 -C1_dimension: 140874, C2_dimension: 128, vals: 18031872 -D1_dimension: 140874, D2_dimension: 128, vals: 18031872 -E1_dimension: 128, E2_dimension: 128, vals: 16384 - - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/shipsec1/shipsec1.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 140874, B2_dimension: 140874, vals: 7813404 
-C1_dimension: 140874, C2_dimension: 128, vals: 18031872 -D1_dimension: 140874, D2_dimension: 128, vals: 18031872 -E1_dimension: 128, E2_dimension: 128, vals: 16384 - - -kernel execution time: 44.4685 ms -fused time: 47.652 - -kernel execution time: 39.859 ms -hadamard time: 40.465 - -kernel execution time: 40.2328 ms -gemm time: 40.7652 - -kernel execution time: 770.504 ms -taco reference time: 771.113 -/home/min/a/kadhitha/ispc-examples/data/suitesparse/scircuit/scircuit.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 170998, B2_dimension: 170998, vals: 958936 -C1_dimension: 170998, C2_dimension: 128, vals: 21887744 -D1_dimension: 170998, D2_dimension: 128, vals: 21887744 -E1_dimension: 128, E2_dimension: 128, vals: 16384 - - -kernel execution time: 32.6037 ms -fused time: 36.0777 - -kernel execution time: 27.1815 ms -hadamard time: 27.8676 - -kernel execution time: 46.1458 ms -gemm time: 46.6699 - -kernel execution time: 97.8299 ms -taco reference time: 98.5149 -/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 206500, B2_dimension: 206500, vals: 1273389 -C1_dimension: 206500, C2_dimension: 128, vals: 26432000 -D1_dimension: 206500, D2_dimension: 128, vals: 26432000 -E1_dimension: 128, E2_dimension: 128, vals: 16384 - - -kernel execution time: 42.3414 ms -fused time: 46.4717 - -kernel execution time: 37.0604 ms -hadamard time: 37.7717 - -kernel execution time: 55.4753 ms -gemm time: 56.0538 - -kernel execution time: 129.339 ms -taco reference time: 130.028 -/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 1000005, B2_dimension: 1000005, vals: 3105536 -C1_dimension: 1000005, C2_dimension: 128, vals: 128000640 -D1_dimension: 1000005, D2_dimension: 128, vals: 128000640 -E1_dimension: 128, E2_dimension: 128, vals: 16384 - - -kernel execution time: 159.647 ms -fused time: 164.344 - -kernel 
execution time: 110.823 ms -hadamard time: 111.516 - -kernel execution time: 268.805 ms -gemm time: 269.465 - -kernel execution time: 326.437 ms -taco reference time: 327.144 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 10974, B2_dimension: 10974, vals: 428650 -C1_dimension: 10974, C2_dimension: 128, vals: 1404672 -D1_dimension: 10974, D2_dimension: 128, vals: 1404672 -E1_dimension: 128, E2_dimension: 128, vals: 16384 - - -kernel execution time: 80.3808 ms -fused time: 82.9372 - -kernel execution time: 17.8402 ms -hadamard time: 18.4152 - -kernel execution time: 127.495 ms -gemm time: 128.275 - -kernel execution time: 1763.16 ms -taco reference time: 1763.78 -/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 -C1_dimension: 36417, C2_dimension: 128, vals: 4661376 -D1_dimension: 36417, D2_dimension: 128, vals: 4661376 -E1_dimension: 128, E2_dimension: 128, vals: 16384 - - -kernel execution time: 352.899 ms -fused time: 356.76 - -kernel execution time: 157.362 ms -hadamard time: 157.893 - -kernel execution time: 406.42 ms -gemm time: 407.203 - -kernel execution time: 17839.4 ms -taco reference time: 17840.5 -/home/min/a/kadhitha/ispc-examples/data/suitesparse/rma10/rma10.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 46835, B2_dimension: 46835, vals: 2374001 -C1_dimension: 46835, C2_dimension: 128, vals: 5994880 -D1_dimension: 46835, D2_dimension: 128, vals: 5994880 -E1_dimension: 128, E2_dimension: 128, vals: 16384 - - -kernel execution time: 360.403 ms -fused time: 364.207 - -kernel execution time: 92.7639 ms -hadamard time: 93.2881 - -kernel execution time: 519.132 ms -gemm time: 519.668 - -kernel execution time: 9767.06 ms -taco reference time: 9767.66 
-/home/min/a/kadhitha/ispc-examples/data/suitesparse/cant/cant.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 62451, B2_dimension: 62451, vals: 4007383 -C1_dimension: 62451, C2_dimension: 128, vals: 7993728 -D1_dimension: 62451, D2_dimension: 128, vals: 7993728 -E1_dimension: 128, E2_dimension: 128, vals: 16384 - - -kernel execution time: 499.64 ms -fused time: 503.449 - -kernel execution time: 148.888 ms -hadamard time: 149.416 - -kernel execution time: 689.134 ms -gemm time: 689.652 - -kernel execution time: 16929 ms -taco reference time: 16930 -/home/min/a/kadhitha/ispc-examples/data/suitesparse/consph/consph.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 83334, B2_dimension: 83334, vals: 6010480 -C1_dimension: 83334, C2_dimension: 128, vals: 10666752 -D1_dimension: 83334, D2_dimension: 128, vals: 10666752 -E1_dimension: 128, E2_dimension: 128, vals: 16384 - - -kernel execution time: 690.556 ms -fused time: 694.221 - -kernel execution time: 230.454 ms -hadamard time: 230.979 - -kernel execution time: 922.831 ms -gemm time: 923.322 - -kernel execution time: 24781.4 ms -taco reference time: 24782.4 -/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 121192, B2_dimension: 121192, vals: 2624331 -C1_dimension: 121192, C2_dimension: 128, vals: 15512576 -D1_dimension: 121192, D2_dimension: 128, vals: 15512576 -E1_dimension: 128, E2_dimension: 128, vals: 16384 - - -kernel execution time: 871.577 ms -fused time: 876.166 - -kernel execution time: 213.157 ms -hadamard time: 213.706 - -kernel execution time: 1342.88 ms -gemm time: 1343.39 - -kernel execution time: 10845 ms -taco reference time: 10846.1 -/home/min/a/kadhitha/ispc-examples/data/suitesparse/shipsec1/shipsec1.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 140874, B2_dimension: 140874, vals: 7813404 -C1_dimension: 140874, C2_dimension: 128, vals: 18031872 -D1_dimension: 140874, D2_dimension: 128, vals: 
18031872 -E1_dimension: 128, E2_dimension: 128, vals: 16384 - - -kernel execution time: 1074.54 ms -fused time: 1078.91 - -kernel execution time: 302.447 ms -hadamard time: 302.972 - -kernel execution time: 1560.59 ms -gemm time: 1561.07 - -kernel execution time: 32089.4 ms -taco reference time: 32090.3 -/home/min/a/kadhitha/ispc-examples/data/suitesparse/scircuit/scircuit.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 170998, B2_dimension: 170998, vals: 958936 -C1_dimension: 170998, C2_dimension: 128, vals: 21887744 -D1_dimension: 170998, D2_dimension: 128, vals: 21887744 -E1_dimension: 128, E2_dimension: 128, vals: 16384 - - -kernel execution time: 1034.29 ms -fused time: 1037.96 - -kernel execution time: 85.577 ms -hadamard time: 86.1357 - -kernel execution time: 1881.63 ms -gemm time: 1882.13 - -kernel execution time: 3962.92 ms -taco reference time: 3963.97 -/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 206500, B2_dimension: 206500, vals: 1273389 -C1_dimension: 206500, C2_dimension: 128, vals: 26432000 -D1_dimension: 206500, D2_dimension: 128, vals: 26432000 -E1_dimension: 128, E2_dimension: 128, vals: 16384 - - -kernel execution time: 1241.65 ms -fused time: 1244.6 - -kernel execution time: 87.8479 ms -hadamard time: 88.3878 - -kernel execution time: 2286.72 ms -gemm time: 2287.22 - -kernel execution time: 5303.69 ms -taco reference time: 5304.69 -/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 1000005, B2_dimension: 1000005, vals: 3105536 -C1_dimension: 1000005, C2_dimension: 128, vals: 128000640 -D1_dimension: 1000005, D2_dimension: 128, vals: 128000640 -E1_dimension: 128, E2_dimension: 128, vals: 16384 - - -kernel execution time: 5642.42 ms -fused time: 5643.31 - -kernel execution time: 264.874 ms -hadamard time: 265.396 - -kernel execution time: 10966.5 ms -gemm time: 10967.4 
- -kernel execution time: 12863.7 ms -taco reference time: 12864.8 -/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 5558326, B2_dimension: 5558326, vals: 59524291 -C1_dimension: 5558326, C2_dimension: 128, vals: 711465728 -D1_dimension: 5558326, D2_dimension: 128, vals: 711465728 -E1_dimension: 128, E2_dimension: 128, vals: 16384 - - -kernel execution time: 35075.5 ms -fused time: 35079.3 - -kernel execution time: 3869.9 ms -hadamard time: 3870.98 - -kernel execution time: 61504.6 ms -gemm time: 61505.4 - -kernel execution time: 245613 ms -taco reference time: 245614 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 2708, B2_dimension: 2708, vals: 5429 -C1_dimension: 2708, C2_dimension: 128, vals: 346624 -D1_dimension: 2708, D2_dimension: 128, vals: 346624 -E1_dimension: 128, E2_dimension: 128, vals: 16384 - - -kernel execution time: 18.3809 ms -fused time: 19.1229 - -kernel execution time: 0.635828 ms -hadamard time: 0.983143 - -kernel execution time: 30.5122 ms -gemm time: 30.7819 - -kernel execution time: 23.6746 ms -taco reference time: 24.0784 -/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/amazon.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 548551, B2_dimension: 548551, vals: 1851744 -C1_dimension: 548551, C2_dimension: 128, vals: 70214528 -D1_dimension: 548551, D2_dimension: 128, vals: 70214528 -E1_dimension: 128, E2_dimension: 128, vals: 16384 - - -kernel execution time: 3580.2 ms -fused time: 3581 - -kernel execution time: 567.762 ms -hadamard time: 568.301 - -kernel execution time: 6079.96 ms -gemm time: 6080.46 - -kernel execution time: 8129.78 ms -taco reference time: 8130.38 - -sddmm-spmm execution - ------------------------------------------ 
-/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 2708, B2_dimension: 2708, vals: 5429 -C1_dimension: 2708, C2_dimension: 128, vals: 346624 -D1_dimension: 2708, D2_dimension: 128, vals: 346624 -E1_dimension: 128, E2_dimension: 128, vals: 16384 - - -kernel execution time: 18.4625 ms -fused time: 19.1824 - -kernel execution time: 0.520446 ms -hadamard time: 0.824011 - -kernel execution time: 30.2097 ms -gemm time: 30.46 - -kernel execution time: 23.4681 ms -taco reference time: 23.826 -/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/amazon.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 548551, B2_dimension: 548551, vals: 1851744 -C1_dimension: 548551, C2_dimension: 128, vals: 70214528 -D1_dimension: 548551, D2_dimension: 128, vals: 70214528 -E1_dimension: 128, E2_dimension: 128, vals: 16384 - - -kernel execution time: 3528.39 ms -fused time: 3529.23 - -kernel execution time: 558.625 ms -hadamard time: 559.16 - -kernel execution time: 6157.3 ms -gemm time: 6158.14 - -kernel execution time: 8131.73 ms -taco reference time: 8132.69 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 2708, B2_dimension: 2708, vals: 5429 -C1_dimension: 2708, C2_dimension: 128, vals: 346624 -D1_dimension: 2708, D2_dimension: 128, vals: 346624 -E1_dimension: 128, E2_dimension: 128, vals: 16384 - - -kernel execution time: 2.27347 ms -fused time: 2.7115 - -kernel execution time: 0.180952 ms -hadamard time: 0.76318 - -kernel execution time: 2.72672 ms -gemm time: 3.22211 - -kernel execution time: 5.227 ms -taco reference time: 5.75632 -/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/amazon.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 548551, B2_dimension: 548551, vals: 1851744 -C1_dimension: 548551, C2_dimension: 128, vals: 70214528 
-D1_dimension: 548551, D2_dimension: 128, vals: 70214528 -E1_dimension: 128, E2_dimension: 128, vals: 16384 - - -kernel execution time: 164.815 ms -fused time: 165.539 - -kernel execution time: 96.629 ms -hadamard time: 97.303 - -kernel execution time: 202.068 ms -gemm time: 202.628 - -kernel execution time: 273.96 ms -taco reference time: 274.643 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 2708, B2_dimension: 2708, vals: 5429 -C1_dimension: 2708, C2_dimension: 128, vals: 346624 -D1_dimension: 2708, D2_dimension: 128, vals: 346624 -E1_dimension: 128, E2_dimension: 128, vals: 16384 - - -kernel execution time: 2.37004 ms -fused time: 3.11591 - -kernel execution time: 0.176612 ms -hadamard time: 0.833621 - -kernel execution time: 2.08823 ms -gemm time: 2.59022 - -kernel execution time: 3.36531 ms -taco reference time: 4.11087 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 2708, B2_dimension: 2708, vals: 5429 -C1_dimension: 2708, C2_dimension: 128, vals: 346624 -D1_dimension: 2708, D2_dimension: 128, vals: 346624 -E1_dimension: 128, E2_dimension: 128, vals: 16384 - - -kernel execution time: 19.3307 ms -fused time: 20.0662 - -kernel execution time: 0.496176 ms -hadamard time: 0.931803 - -kernel execution time: 30.1194 ms -gemm time: 30.3654 - -kernel execution time: 23.3946 ms -taco reference time: 23.7411 diff --git a/test/stats/mttkrp-spmm.txt b/test/stats/mttkrp-spmm.txt deleted file mode 100644 index fd6226179..000000000 --- a/test/stats/mttkrp-spmm.txt +++ /dev/null @@ -1,1090 +0,0 @@ - - mttkrp-spmm execution -A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m) -B1_dimension: 25, B2_dimension: 25, B3_dimension: 25, vals: 125 -C1_dimension: 25, C2_dimension: 25, vals: 625 
-D1_dimension: 25, D2_dimension: 25, vals: 625 -E1_dimension: 25, E2_dimension: 48, vals: 1200 - - -kernel execution time: 0.03045 ms -fused time: 0.870912 - -kernel execution time: 0.168452 ms -reference asymptotic blowup time: 0.983003 - -kernel execution time: 0.015 ms -mttkrp time: 0.493997 - -kernel execution time: 0.0267 ms -spmm time: 0.74405 - -mttkrp-spmm execution - -0.015 0.0267 0.03045 0.168452 - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/tns/delicious-3d.tns -A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m) -B1_dimension: 532924, B2_dimension: 17262471, B3_dimension: 532924, vals: 140126181 -C1_dimension: 17262471, C2_dimension: 25, vals: 431561775 -D1_dimension: 2480308, D2_dimension: 25, vals: 62007700 -E1_dimension: 25, E2_dimension: 48, vals: 1200 - - -kernel execution time: 8190.76 ms -fused time: 8191.78 - -kernel execution time: 112801 ms -reference asymptotic blowup time: 112802 - -kernel execution time: 11198.5 ms -mttkrp time: 11199.5 - -kernel execution time: 238.88 ms -spmm time: 239.385 - -0.015 0.0267 0.03045 0.168452 -11198.5 238.88 8190.76 112801 - - -mttkrp-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/tns/flickr-3d.tns -A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m) -B1_dimension: 319686, B2_dimension: 28153045, B3_dimension: 319686, vals: 112890310 -C1_dimension: 28153045, C2_dimension: 25, vals: 703826125 -D1_dimension: 1607191, D2_dimension: 25, vals: 40179775 -E1_dimension: 25, E2_dimension: 48, vals: 1200 - - -kernel execution time: 3951.18 ms -fused time: 3952.21 - -kernel execution time: 76964 ms -reference asymptotic blowup time: 76965.1 - -kernel execution time: 6212.97 ms -mttkrp time: 6213.89 - -kernel execution time: 142.233 ms -spmm time: 142.726 - -mttkrp-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/tns/nell-2.tns -A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m) -B1_dimension: 
12092, B2_dimension: 9184, B3_dimension: 12092, vals: 76879419 -C1_dimension: 9184, C2_dimension: 25, vals: 229600 -D1_dimension: 28818, D2_dimension: 25, vals: 720450 -E1_dimension: 25, E2_dimension: 48, vals: 1200 - - -kernel execution time: 997.696 ms -fused time: 998.725 - -kernel execution time: 55544.7 ms -reference asymptotic blowup time: 55545.9 - -kernel execution time: 1944.26 ms -mttkrp time: 1944.75 - -kernel execution time: 5.40774 ms -spmm time: 5.8765 - -mttkrp-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/tns/nell-1.tns -A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m) -B1_dimension: 2902330, B2_dimension: 2143368, B3_dimension: 2902330, vals: 143599552 -C1_dimension: 2143368, C2_dimension: 25, vals: 53584200 -D1_dimension: 25495389, D2_dimension: 25, vals: 637384725 -E1_dimension: 25, E2_dimension: 48, vals: 1200 - - -kernel execution time: 16275.3 ms -fused time: 16276.4 - -kernel execution time: 325523 ms -reference asymptotic blowup time: 325525 - -kernel execution time: 29202.5 ms -mttkrp time: 29203.5 - -kernel execution time: 1240.14 ms -spmm time: 1240.66 - -mttkrp-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/tns/vast-2015-mc1-3d.tns -A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m) -B1_dimension: 165427, B2_dimension: 11374, B3_dimension: 165427, vals: 26021854 -C1_dimension: 11374, C2_dimension: 25, vals: 284350 -D1_dimension: 2, D2_dimension: 25, vals: 50 -E1_dimension: 25, E2_dimension: 48, vals: 1200 - - -kernel execution time: 400.942 ms -fused time: 401.47 - -kernel execution time: 21565.2 ms -reference asymptotic blowup time: 21566.3 - -kernel execution time: 1292.53 ms -mttkrp time: 1293.05 - -kernel execution time: 72.2856 ms -spmm time: 72.8001 - -mttkrp-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/tns/darpa1998.tns -A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m) 
-B1_dimension: 22476, B2_dimension: 22476, B3_dimension: 22476, vals: 28421307 -C1_dimension: 22476, C2_dimension: 25, vals: 561900 -D1_dimension: 23776223, D2_dimension: 25, vals: 594405575 -E1_dimension: 25, E2_dimension: 48, vals: 1200 - - -kernel execution time: 1397.54 ms -fused time: 1398.54 - -kernel execution time: 39690 ms -reference asymptotic blowup time: 39691 - -kernel execution time: 4004.71 ms -mttkrp time: 4005.68 - -kernel execution time: 7.97584 ms -spmm time: 8.44535 - -mttkrp-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/tns/freebase_music.tns -A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m) -B1_dimension: 23343790, B2_dimension: 23344784, B3_dimension: 23343790, vals: 99546550 -C1_dimension: 23344784, C2_dimension: 25, vals: 583619600 -D1_dimension: 166, D2_dimension: 25, vals: 4150 -E1_dimension: 25, E2_dimension: 48, vals: 1200 - - -kernel execution time: 15804.8 ms -fused time: 15805.9 - -kernel execution time: 79175 ms -reference asymptotic blowup time: 79176.1 - -kernel execution time: 10624.7 ms -mttkrp time: 10625.6 - -kernel execution time: 10007.2 ms -spmm time: 10008.2 - -mttkrp-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/tns/freebase_sampled.tns -A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m) -B1_dimension: 38954435, B2_dimension: 38955429, B3_dimension: 38954435, vals: 139920770 -C1_dimension: 38955429, C2_dimension: 25, vals: 973885725 -D1_dimension: 532, D2_dimension: 25, vals: 13300 -E1_dimension: 25, E2_dimension: 48, vals: 1200 - - -kernel execution time: 23869.4 ms -fused time: 23870.5 - -kernel execution time: 113144 ms -reference asymptotic blowup time: 113145 - -kernel execution time: 15284.7 ms -mttkrp time: 15285.7 - -kernel execution time: 15154.3 ms -spmm time: 15155.6 - -mttkrp-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/tns/delicious-3d.tns -A(i,m) = 
B(i,k,l) * D(l,j) * C(k,j) * E(j, m) -B1_dimension: 958893, B2_dimension: 55584242, B3_dimension: 958893, vals: 140126165 -C1_dimension: 55584242, C2_dimension: 25, vals: 1389606050 -D1_dimension: 2480308, D2_dimension: 25, vals: 62007700 -E1_dimension: 25, E2_dimension: 48, vals: 1200 - - -mttkrp-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/tns/matmul_5-5-5.tns -A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m) -B1_dimension: 25, B2_dimension: 25, B3_dimension: 25, vals: 125 -C1_dimension: 25, C2_dimension: 25, vals: 625 -D1_dimension: 25, D2_dimension: 25, vals: 625 -E1_dimension: 25, E2_dimension: 48, vals: 1200 - - -kernel execution time: 0.043711 ms -fused time: 0.864271 - -kernel execution time: 0.027391 ms -mttkrp time: 0.889931 - -kernel execution time: 0.02264 ms -spmm time: 1.09649 - -kernel execution time: 0.04233 ms -reference asymptotic blowup time: 1.01915 - -mttkrp-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/tns/nell-2.tns -A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m) -B1_dimension: 12092, B2_dimension: 9184, B3_dimension: 12092, vals: 76879419 -C1_dimension: 9184, C2_dimension: 25, vals: 229600 -D1_dimension: 28818, D2_dimension: 25, vals: 720450 -E1_dimension: 25, E2_dimension: 48, vals: 1200 - - -kernel execution time: 813.743 ms -fused time: 814.267 - -kernel execution time: 458.835 ms -mttkrp time: 459.4 - -kernel execution time: 3.56961 ms -spmm time: 4.08913 - -kernel execution time: 13803.8 ms -reference asymptotic blowup time: 13804.8 - -mttkrp-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/tns/nell-2.tns -A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m) -B1_dimension: 12092, B2_dimension: 9184, B3_dimension: 12092, vals: 76879419 -C1_dimension: 9184, C2_dimension: 25, vals: 229600 -D1_dimension: 28818, D2_dimension: 25, vals: 720450 -E1_dimension: 25, E2_dimension: 48, vals: 1200 - 
- -kernel execution time: 224.386 ms -fused time: 224.986 - -kernel execution time: 101.692 ms -mttkrp time: 102.264 - -kernel execution time: 5.95563 ms -spmm time: 6.44162 - -kernel execution time: 2647.79 ms -reference asymptotic blowup time: 2648.57 - -mttkrp-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/tns/darpa1998.tns -A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m) -B1_dimension: 22476, B2_dimension: 22476, B3_dimension: 22476, vals: 28421307 -C1_dimension: 22476, C2_dimension: 25, vals: 561900 -D1_dimension: 23776223, D2_dimension: 25, vals: 594405575 -E1_dimension: 25, E2_dimension: 48, vals: 1200 - - -kernel execution time: 208.602 ms -fused time: 209.122 - -kernel execution time: 631.37 ms -mttkrp time: 631.981 - -kernel execution time: 7.20919 ms -spmm time: 7.81651 - -kernel execution time: 6749.05 ms -reference asymptotic blowup time: 6750.17 - -mttkrp-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/tns/vast-2015-mc1-3d.tns -A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m) -B1_dimension: 165427, B2_dimension: 11374, B3_dimension: 165427, vals: 26021854 -C1_dimension: 11374, C2_dimension: 25, vals: 284350 -D1_dimension: 2, D2_dimension: 25, vals: 50 -E1_dimension: 25, E2_dimension: 48, vals: 1200 - - -kernel execution time: 95.6907 ms -fused time: 96.2212 - -kernel execution time: 59.1475 ms -mttkrp time: 59.7153 - -kernel execution time: 63.6734 ms -spmm time: 64.1704 - -kernel execution time: 884.275 ms -reference asymptotic blowup time: 884.934 - -mttkrp-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/tns/nell-2.tns -A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m) -B1_dimension: 12092, B2_dimension: 9184, B3_dimension: 12092, vals: 76879419 -C1_dimension: 9184, C2_dimension: 25, vals: 229600 -D1_dimension: 28818, D2_dimension: 25, vals: 720450 -E1_dimension: 25, E2_dimension: 48, vals: 1200 - - 
-kernel execution time: 225.843 ms -fused time: 226.345 - -kernel execution time: 100.14 ms -mttkrp time: 100.738 - -kernel execution time: 6.32395 ms -spmm time: 6.85452 - -kernel execution time: 2678.56 ms -reference asymptotic blowup time: 2679.35 - -mttkrp-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/tns/flickr-3d.tns -A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m) -B1_dimension: 319686, B2_dimension: 28153045, B3_dimension: 319686, vals: 112890310 -C1_dimension: 28153045, C2_dimension: 25, vals: 703826125 -D1_dimension: 1607191, D2_dimension: 25, vals: 40179775 -E1_dimension: 25, E2_dimension: 48, vals: 1200 - - -kernel execution time: 503.61 ms -fused time: 504.129 - -kernel execution time: 314.899 ms -mttkrp time: 315.501 - -kernel execution time: 125.456 ms -spmm time: 125.953 - -kernel execution time: 3415.65 ms -reference asymptotic blowup time: 3416.62 - -mttkrp-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/tns/delicious-3d.tns -A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m) -B1_dimension: 958893, B2_dimension: 55584242, B3_dimension: 958893, vals: 140126165 -C1_dimension: 55584242, C2_dimension: 25, vals: 1389606050 -D1_dimension: 2480308, D2_dimension: 25, vals: 62007700 -E1_dimension: 25, E2_dimension: 48, vals: 1200 - - -mttkrp-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/tns/nell-1.tns -A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m) -B1_dimension: 2902330, B2_dimension: 2143368, B3_dimension: 2902330, vals: 143599552 -C1_dimension: 2143368, C2_dimension: 25, vals: 53584200 -D1_dimension: 25495389, D2_dimension: 25, vals: 637384725 -E1_dimension: 25, E2_dimension: 48, vals: 1200 - - -kernel execution time: 1501.57 ms -fused time: 1502.59 - -kernel execution time: 1748.65 ms -mttkrp time: 1749.21 - -kernel execution time: 1135.01 ms -spmm time: 1135.51 - -kernel execution time: 16178.4 ms 
-reference asymptotic blowup time: 16179.5 - -mttkrp-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/tns/nell-1.tns -A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m) -B1_dimension: 2902330, B2_dimension: 2143368, B3_dimension: 2902330, vals: 143599552 -C1_dimension: 2143368, C2_dimension: 25, vals: 53584200 -D1_dimension: 25495389, D2_dimension: 25, vals: 637384725 -E1_dimension: 25, E2_dimension: 48, vals: 1200 - - -kernel execution time: 16005.7 ms -fused time: 16006.6 - -kernel execution time: 29157.8 ms -mttkrp time: 29158.8 - -kernel execution time: 1247.23 ms -spmm time: 1247.75 - -kernel execution time: 329124 ms -reference asymptotic blowup time: 329125 - -mttkrp-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/tns/delicious-3d.tns -A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m) -B1_dimension: 958893, B2_dimension: 55584242, B3_dimension: 958893, vals: 140126165 -C1_dimension: 55584242, C2_dimension: 25, vals: 1389606050 -D1_dimension: 2480308, D2_dimension: 25, vals: 62007700 -E1_dimension: 25, E2_dimension: 48, vals: 1200 - - -mttkrp-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/tns/delicious-3d.tns -A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m) -B1_dimension: 958893, B2_dimension: 55584242, B3_dimension: 958893, vals: 140126165 -C1_dimension: 55584242, C2_dimension: 25, vals: 1389606050 -D1_dimension: 2480308, D2_dimension: 25, vals: 62007700 -E1_dimension: 25, E2_dimension: 48, vals: 1200 - - -mttkrp-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/tns/delicious-3d.tns -A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m) -B1_dimension: 958893, B2_dimension: 55584242, B3_dimension: 958893, vals: 140126165 -C1_dimension: 55584242, C2_dimension: 25, vals: 1389606050 -D1_dimension: 2480308, D2_dimension: 25, vals: 62007700 -E1_dimension: 25, E2_dimension: 48, 
vals: 1200 - - -mttkrp-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/tns/delicious-3d.tns -A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m) -B1_dimension: 958893, B2_dimension: 55584242, B3_dimension: 958893, vals: 140126165 -C1_dimension: 55584242, C2_dimension: 25, vals: 1389606050 -D1_dimension: 2480308, D2_dimension: 25, vals: 62007700 -E1_dimension: 25, E2_dimension: 48, vals: 1200 - - -mttkrp-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/tns/nell-2.tns -A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m) -B1_dimension: 12092, B2_dimension: 9184, B3_dimension: 12092, vals: 76879419 -C1_dimension: 9184, C2_dimension: 25, vals: 229600 -D1_dimension: 28818, D2_dimension: 25, vals: 720450 -E1_dimension: 25, E2_dimension: 48, vals: 1200 - - -kernel execution time: 2651.26 ms -reference asymptotic blowup time: 2652.08 - -mttkrp-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/tns/delicious-3d.tns -A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m) -B1_dimension: 958893, B2_dimension: 55584242, B3_dimension: 958893, vals: 140126165 -C1_dimension: 55584242, C2_dimension: 25, vals: 1389606050 -D1_dimension: 2480308, D2_dimension: 25, vals: 62007700 -E1_dimension: 25, E2_dimension: 48, vals: 1200 - - -mttkrp-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/tns/matmul_5-5-5.tns -A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m) -B1_dimension: 25, B2_dimension: 25, B3_dimension: 25, vals: 125 -C1_dimension: 25, C2_dimension: 32, vals: 800 -D1_dimension: 25, D2_dimension: 32, vals: 800 -E1_dimension: 32, E2_dimension: 64, vals: 2048 - - -kernel execution time: 0.286814 ms -reference asymptotic blowup time: 1.00956 - -mttkrp-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/tns/matmul_5-5-5.tns -A(i,m) = B(i,k,l) * D(l,j) * C(k,j) 
* E(j, m) -B1_dimension: 25, B2_dimension: 25, B3_dimension: 25, vals: 125 -C1_dimension: 25, C2_dimension: 32, vals: 800 -D1_dimension: 25, D2_dimension: 32, vals: 800 -E1_dimension: 32, E2_dimension: 64, vals: 2048 - - -kernel execution time: 0.036661 ms -mttkrp time: 0.77391 - -kernel execution time: 0.02948 ms -mttkrp ryan time: 0.932103 - -kernel execution time: 0.264104 ms -reference asymptotic blowup time: 1.32301 - -mttkrp-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/tns/matmul_5-5-5.tns -A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m) -B1_dimension: 25, B2_dimension: 25, B3_dimension: 25, vals: 125 -C1_dimension: 25, C2_dimension: 32, vals: 800 -D1_dimension: 25, D2_dimension: 32, vals: 800 -E1_dimension: 32, E2_dimension: 64, vals: 2048 - - -kernel execution time: 0.04003 ms -mttkrp time: 0.779201 - -kernel execution time: 0.022291 ms -mttkrp ryan time: 0.821601 - -kernel execution time: 0.268404 ms -reference asymptotic blowup time: 1.28741 - -mttkrp-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/tns/matmul_5-5-5.tns -A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m) -B1_dimension: 25, B2_dimension: 25, B3_dimension: 25, vals: 125 -C1_dimension: 25, C2_dimension: 32, vals: 800 -D1_dimension: 25, D2_dimension: 32, vals: 800 -E1_dimension: 32, E2_dimension: 64, vals: 2048 - - -kernel execution time: 0.03006 ms -default mttkrp time: 0.641369 - -kernel execution time: 0.023191 ms -ryan mttkrp workspace time: 0.982223 - -kernel execution time: 0.084371 ms -spmm time: 0.944412 - -kernel execution time: 0.262723 ms -reference asymptotic blowup time: 0.927732 - -mttkrp-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/tns/matmul_5-5-5.tns -A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m) -B1_dimension: 25, B2_dimension: 25, B3_dimension: 25, vals: 125 -C1_dimension: 25, C2_dimension: 32, vals: 800 -D1_dimension: 
25, D2_dimension: 32, vals: 800 -E1_dimension: 32, E2_dimension: 64, vals: 2048 - - -kernel execution time: 0.046181 ms -default mttkrp time: 0.459706 - -kernel execution time: 0.076311 ms -ryan mttkrp workspace time: 1.1076 - -kernel execution time: 0.06528 ms -GeMM time: 0.307835 - -kernel execution time: 0.230713 ms -reference asymptotic blowup time: 0.942012 - -kernel execution time: 0.081741 ms -fused mttkrp+gemm time: 0.885412 - -mttkrp-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/tns/flickr-3d.tns -A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m) -B1_dimension: 319686, B2_dimension: 28153045, B3_dimension: 319686, vals: 112890310 -C1_dimension: 28153045, C2_dimension: 32, vals: 900897440 -D1_dimension: 1607191, D2_dimension: 32, vals: 51430112 -E1_dimension: 32, E2_dimension: 64, vals: 2048 - - -kernel execution time: 233.898 ms -default mttkrp time: 234.426 - -kernel execution time: 293.46 ms -ryan mttkrp workspace time: 294.21 - -kernel execution time: 23.4947 ms -GeMM time: 24.009 - -kernel execution time: 2753.37 ms -reference asymptotic blowup time: 2754.12 - -kernel execution time: 287.939 ms -fused mttkrp+gemm time: 288.576 - -mttkrp-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/tns/nell-2.tns -A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m) -B1_dimension: 12092, B2_dimension: 9184, B3_dimension: 12092, vals: 76879419 -C1_dimension: 9184, C2_dimension: 32, vals: 293888 -D1_dimension: 28818, D2_dimension: 32, vals: 922176 -E1_dimension: 32, E2_dimension: 64, vals: 2048 - - -kernel execution time: 140.989 ms -default mttkrp time: 141.517 - -kernel execution time: 36.4285 ms -ryan mttkrp workspace time: 37.0544 - -kernel execution time: 1.06091 ms -GeMM time: 1.6425 - -kernel execution time: 3142.38 ms -reference asymptotic blowup time: 3143.28 - -kernel execution time: 43.1867 ms -fused mttkrp+gemm time: 43.8393 - -mttkrp-spmm execution - 
------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/tns/nell-1.tns -A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m) -B1_dimension: 2902330, B2_dimension: 2143368, B3_dimension: 2902330, vals: 143599552 -C1_dimension: 2143368, C2_dimension: 32, vals: 68587776 -D1_dimension: 25495389, D2_dimension: 32, vals: 815852448 -E1_dimension: 32, E2_dimension: 64, vals: 2048 - - -kernel execution time: 2635.67 ms -default mttkrp time: 2636.7 - -kernel execution time: 913.661 ms -ryan mttkrp workspace time: 914.435 - -kernel execution time: 166.615 ms -GeMM time: 167.532 - -kernel execution time: 39080.1 ms -reference asymptotic blowup time: 39080.8 - -kernel execution time: 1141.77 ms -fused mttkrp+gemm time: 1142.88 - -mttkrp-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/tns/vast-2015-mc1-3d.tns -A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m) -B1_dimension: 165427, B2_dimension: 11374, B3_dimension: 165427, vals: 26021854 -C1_dimension: 11374, C2_dimension: 32, vals: 363968 -D1_dimension: 2, D2_dimension: 32, vals: 64 -E1_dimension: 32, E2_dimension: 64, vals: 2048 - - -kernel execution time: 38.5561 ms -default mttkrp time: 39.0876 - -kernel execution time: 18.0733 ms -ryan mttkrp workspace time: 18.6685 - -kernel execution time: 9.91856 ms -GeMM time: 10.4003 - -kernel execution time: 663.996 ms -reference asymptotic blowup time: 664.529 - -kernel execution time: 15.476 ms -fused mttkrp+gemm time: 16.1515 - -mttkrp-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/tns/darpa1998.tns -A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m) -B1_dimension: 22476, B2_dimension: 22476, B3_dimension: 22476, vals: 28421307 -C1_dimension: 22476, C2_dimension: 32, vals: 719232 -D1_dimension: 23776223, D2_dimension: 32, vals: 760839136 -E1_dimension: 32, E2_dimension: 64, vals: 2048 - - -kernel execution time: 893.657 ms -default mttkrp time: 894.664 - -kernel 
execution time: 228.227 ms -ryan mttkrp workspace time: 228.852 - -kernel execution time: 1.81839 ms -GeMM time: 2.27454 - -kernel execution time: 13301.8 ms -reference asymptotic blowup time: 13302.7 - -kernel execution time: 238.142 ms -fused mttkrp+gemm time: 238.778 - -mttkrp-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/tns/delicious-3d.tns -A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m) -B1_dimension: 958893, B2_dimension: 55584242, B3_dimension: 958893, vals: 140126164 -C1_dimension: 55584242, C2_dimension: 32, vals: 1778695744 -D1_dimension: 2480308, D2_dimension: 32, vals: 79369856 -E1_dimension: 32, E2_dimension: 64, vals: 2048 - - -mttkrp-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/tns/flickr-3d.tns -A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m) -B1_dimension: 319686, B2_dimension: 28153045, B3_dimension: 319686, vals: 112890310 -C1_dimension: 28153045, C2_dimension: 32, vals: 900897440 -D1_dimension: 1607191, D2_dimension: 32, vals: 51430112 -E1_dimension: 32, E2_dimension: 64, vals: 2048 - - -kernel execution time: 6303 ms -default mttkrp time: 6303.86 - -kernel execution time: 4378.98 ms -ryan mttkrp workspace time: 4380.07 - -kernel execution time: 449.512 ms -GeMM time: 450.037 - -kernel execution time: 116274 ms -reference asymptotic blowup time: 116275 - -kernel execution time: 4299.26 ms -fused mttkrp+gemm time: 4300.33 - -mttkrp-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/tns/nell-2.tns -A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m) -B1_dimension: 12092, B2_dimension: 9184, B3_dimension: 12092, vals: 76879419 -C1_dimension: 9184, C2_dimension: 32, vals: 293888 -D1_dimension: 28818, D2_dimension: 32, vals: 922176 -E1_dimension: 32, E2_dimension: 64, vals: 2048 - - -kernel execution time: 2606.24 ms -default mttkrp time: 2607.1 - -kernel execution time: 878.486 ms -ryan mttkrp 
workspace time: 879.009 - -kernel execution time: 17.5967 ms -GeMM time: 18.0274 - -kernel execution time: 93762.9 ms -reference asymptotic blowup time: 93763.7 - -kernel execution time: 1052.15 ms -fused mttkrp+gemm time: 1052.76 - -mttkrp-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/tns/nell-1.tns -A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m) -B1_dimension: 2902330, B2_dimension: 2143368, B3_dimension: 2902330, vals: 143599552 -C1_dimension: 2143368, C2_dimension: 32, vals: 68587776 -D1_dimension: 25495389, D2_dimension: 32, vals: 815852448 -E1_dimension: 32, E2_dimension: 64, vals: 2048 - - -kernel execution time: 36869.4 ms -default mttkrp time: 36870.3 - -kernel execution time: 17566.6 ms -ryan mttkrp workspace time: 17567.6 - -kernel execution time: 4060.98 ms -GeMM time: 4061.93 - -kernel execution time: 720483 ms -reference asymptotic blowup time: 720484 - -kernel execution time: 17354.7 ms -fused mttkrp+gemm time: 17355.9 - -mttkrp-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/tns/vast-2015-mc1-3d.tns -A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m) -B1_dimension: 165427, B2_dimension: 11374, B3_dimension: 165427, vals: 26021854 -C1_dimension: 11374, C2_dimension: 32, vals: 363968 -D1_dimension: 2, D2_dimension: 32, vals: 64 -E1_dimension: 32, E2_dimension: 64, vals: 2048 - - -kernel execution time: 1680.94 ms -default mttkrp time: 1681.8 - -kernel execution time: 615.002 ms -ryan mttkrp workspace time: 615.585 - -kernel execution time: 231.923 ms -GeMM time: 232.453 - -kernel execution time: 28415.3 ms -reference asymptotic blowup time: 28416.4 - -kernel execution time: 453.141 ms -fused mttkrp+gemm time: 453.827 - -mttkrp-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/tns/darpa1998.tns -A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m) -B1_dimension: 22476, B2_dimension: 22476, B3_dimension: 
22476, vals: 28421307 -C1_dimension: 22476, C2_dimension: 32, vals: 719232 -D1_dimension: 23776223, D2_dimension: 32, vals: 760839136 -E1_dimension: 32, E2_dimension: 64, vals: 2048 - - -kernel execution time: 4430.77 ms -default mttkrp time: 4431.71 - -kernel execution time: 1465.2 ms -ryan mttkrp workspace time: 1465.77 - -kernel execution time: 32.1871 ms -GeMM time: 32.6436 - -kernel execution time: 71199.8 ms -reference asymptotic blowup time: 71200.9 - -kernel execution time: 1570.11 ms -fused mttkrp+gemm time: 1570.76 - -mttkrp-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/tns/darpa1998.tns -A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m) -B1_dimension: 22476, B2_dimension: 22476, B3_dimension: 22476, vals: 28421307 -C1_dimension: 22476, C2_dimension: 32, vals: 719232 -D1_dimension: 23776223, D2_dimension: 32, vals: 760839136 -E1_dimension: 32, E2_dimension: 64, vals: 2048 - - -kernel execution time: 882.674 ms -default mttkrp time: 883.69 - -kernel execution time: 231.925 ms -ryan mttkrp workspace time: 232.94 - -kernel execution time: 1.87878 ms -GeMM time: 2.38818 - -kernel execution time: 13018.7 ms -reference asymptotic blowup time: 13019.7 - -kernel execution time: 227.495 ms -fused mttkrp+gemm time: 228.182 - -mttkrp-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/tns/darpa1998.tns -A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m) -B1_dimension: 22476, B2_dimension: 22476, B3_dimension: 22476, vals: 28421307 -C1_dimension: 22476, C2_dimension: 32, vals: 719232 -D1_dimension: 23776223, D2_dimension: 32, vals: 760839136 -E1_dimension: 32, E2_dimension: 64, vals: 2048 - - -kernel execution time: 874.742 ms -default mttkrp time: 875.218 - -kernel execution time: 231.556 ms -ryan mttkrp workspace time: 232.223 - -kernel execution time: 1.7427 ms -GeMM time: 2.19512 - -kernel execution time: 13047.8 ms -reference asymptotic blowup time: 13048.7 - -kernel 
execution time: 232.174 ms -fused mttkrp+gemm time: 232.85 - -mttkrp-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/tns/vast-2015-mc1-3d.tns -A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m) -B1_dimension: 165427, B2_dimension: 11374, B3_dimension: 165427, vals: 26021854 -C1_dimension: 11374, C2_dimension: 32, vals: 363968 -D1_dimension: 2, D2_dimension: 32, vals: 64 -E1_dimension: 32, E2_dimension: 64, vals: 2048 - - -kernel execution time: 40.9013 ms -default mttkrp time: 41.4712 - -kernel execution time: 18.9468 ms -ryan mttkrp workspace time: 19.5875 - -kernel execution time: 10.8838 ms -GeMM time: 11.3865 - -kernel execution time: 700.825 ms -reference asymptotic blowup time: 701.445 - -kernel execution time: 15.8743 ms -fused mttkrp+gemm time: 16.5313 - -mttkrp-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/tns/matmul_5-5-5.tns -A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m) -B1_dimension: 25, B2_dimension: 25, B3_dimension: 25, vals: 125 -C1_dimension: 25, C2_dimension: 32, vals: 800 -D1_dimension: 25, D2_dimension: 32, vals: 800 -E1_dimension: 32, E2_dimension: 64, vals: 2048 - - -kernel execution time: 0.02019 ms -default mttkrp time: 3.8105 - -kernel execution time: 0.01628 ms -ryan mttkrp workspace time: 0.602618 - -kernel execution time: 0.075521 ms -GeMM time: 0.491146 - -kernel execution time: 0.254864 ms -reference asymptotic blowup time: 0.897372 - -kernel execution time: 0.038201 ms -fused mttkrp+gemm time: 4.54224 - -mttkrp-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/tns/matmul_5-5-5.tns -A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m) -B1_dimension: 25, B2_dimension: 25, B3_dimension: 25, vals: 125 -C1_dimension: 25, C2_dimension: 32, vals: 800 -D1_dimension: 25, D2_dimension: 32, vals: 800 -E1_dimension: 32, E2_dimension: 64, vals: 2048 - - -kernel execution time: 0.02015 ms -default 
mttkrp time: 3.93207 - -kernel execution time: 0.015561 ms -ryan mttkrp workspace time: 0.559818 - -kernel execution time: 0.074741 ms -GeMM time: 0.880342 - -kernel execution time: 0.250803 ms -reference asymptotic blowup time: 0.892052 - -kernel execution time: 0.038071 ms -fused mttkrp+gemm time: 3.0867 - -mttkrp-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/tns/matmul_5-5-5.tns -A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m) -B1_dimension: 25, B2_dimension: 25, B3_dimension: 25, vals: 125 -C1_dimension: 25, C2_dimension: 32, vals: 800 -D1_dimension: 25, D2_dimension: 32, vals: 800 -E1_dimension: 32, E2_dimension: 64, vals: 2048 - - -kernel execution time: 0.02689 ms -default mttkrp time: 0.73934 - -kernel execution time: 0.02205 ms -ryan mttkrp workspace time: 0.863852 - -kernel execution time: 0.081811 ms -GeMM time: 0.527658 - -kernel execution time: 0.259993 ms -reference asymptotic blowup time: 0.923212 - -kernel execution time: 0.042261 ms -fused mttkrp+gemm time: 0.703349 - -mttkrp-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/tns/nell-2.tns -A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m) -B1_dimension: 12092, B2_dimension: 9184, B3_dimension: 12092, vals: 76879419 -C1_dimension: 9184, C2_dimension: 32, vals: 293888 -D1_dimension: 28818, D2_dimension: 32, vals: 922176 -E1_dimension: 32, E2_dimension: 64, vals: 2048 - - -kernel execution time: 141.637 ms -default mttkrp time: 142.17 - -kernel execution time: 41.1194 ms -ryan mttkrp workspace time: 41.7838 - -kernel execution time: 1.06942 ms -GeMM time: 1.50588 - -kernel execution time: 3218.72 ms -reference asymptotic blowup time: 3219.51 - -kernel execution time: 145.235 ms -fused mttkrp+gemm time: 145.866 - -mttkrp-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/tns/nell-2.tns -A(i,m) = B(i,k,l) * D(l,j) * C(k,j) * E(j, m) -B1_dimension: 12092, 
B2_dimension: 9184, B3_dimension: 12092, vals: 76879419 -C1_dimension: 9184, C2_dimension: 32, vals: 293888 -D1_dimension: 28818, D2_dimension: 32, vals: 922176 -E1_dimension: 32, E2_dimension: 64, vals: 2048 - - -kernel execution time: 148.092 ms -default mttkrp time: 148.691 - -kernel execution time: 41.3947 ms -ryan mttkrp workspace time: 42.046 - -kernel execution time: 1.03445 ms -GeMM time: 1.45556 - -kernel execution time: 3211.6 ms -reference asymptotic blowup time: 3212.43 - -kernel execution time: 45.5971 ms -fused mttkrp+gemm time: 46.2057 diff --git a/test/stats/sddmm-spmm-gemm.txt b/test/stats/sddmm-spmm-gemm.txt deleted file mode 100644 index 02665478f..000000000 --- a/test/stats/sddmm-spmm-gemm.txt +++ /dev/null @@ -1,1471 +0,0 @@ - -sddmm-spmm-gemm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx -ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); -B1_dimension: 10974, B2_dimension: 10974, vals: 428650 -C1_dimension: 10974, C2_dimension: 64, vals: 702336 -D1_dimension: 10974, D2_dimension: 64, vals: 702336 -E1_dimension: 10974, E2_dimension: 64, vals: 702336 -G1_dimension: 10974, G2_dimension: 64, vals: 4096 - - -kernel execution time: 2.51139 ms -fused time: 3.49403 - -kernel execution time: 3.80634 ms -sddmm time: 4.13132 - -kernel execution time: 0.75853 ms -sddmm ryan time: 1.07946 - -kernel execution time: 0.968473 ms -spmm ryan time: 1.2051 - -kernel execution time: 1.39879 ms -gemm time: 1.6602 - -kernel execution time: 1070.79 ms -taco reference time: 1071.2 - -sddmm-spmm-gemm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx -ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); -B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 -C1_dimension: 36417, C2_dimension: 64, vals: 2330688 -D1_dimension: 36417, D2_dimension: 64, vals: 2330688 -E1_dimension: 36417, E2_dimension: 64, vals: 2330688 
-G1_dimension: 36417, G2_dimension: 64, vals: 4096 - - -kernel execution time: 8.43361 ms -fused time: 9.03941 - -kernel execution time: 13.3195 ms -sddmm time: 13.9487 - -kernel execution time: 4.73639 ms -sddmm ryan time: 5.32202 - -kernel execution time: 4.735 ms -spmm ryan time: 5.22103 - -kernel execution time: 3.66798 ms -gemm time: 4.15167 - -kernel execution time: 10658.4 ms -taco reference time: 10659.3 - -sddmm-spmm-gemm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/rma10/rma10.mtx -ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); -B1_dimension: 46835, B2_dimension: 46835, vals: 2374001 -C1_dimension: 46835, C2_dimension: 64, vals: 2997440 -D1_dimension: 46835, D2_dimension: 64, vals: 2997440 -E1_dimension: 46835, E2_dimension: 64, vals: 2997440 -G1_dimension: 46835, G2_dimension: 64, vals: 4096 - - -kernel execution time: 7.54896 ms -fused time: 8.15687 - -kernel execution time: 15.1277 ms -sddmm time: 15.796 - -kernel execution time: 3.51464 ms -sddmm ryan time: 4.10653 - -kernel execution time: 4.21975 ms -spmm ryan time: 4.6923 - -kernel execution time: 4.74088 ms -gemm time: 5.2156 - -kernel execution time: 5949.54 ms -taco reference time: 5950.52 - -sddmm-spmm-gemm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/cant/cant.mtx -ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); -B1_dimension: 62451, B2_dimension: 62451, vals: 4007383 -C1_dimension: 62451, C2_dimension: 64, vals: 3996864 -D1_dimension: 62451, D2_dimension: 64, vals: 3996864 -E1_dimension: 62451, E2_dimension: 64, vals: 3996864 -G1_dimension: 62451, G2_dimension: 64, vals: 4096 - - -kernel execution time: 11.7188 ms -fused time: 12.3427 - -kernel execution time: 18.5962 ms -sddmm time: 19.2831 - -kernel execution time: 6.5821 ms -sddmm ryan time: 7.20737 - -kernel execution time: 6.6327 ms -spmm ryan time: 7.20703 - -kernel execution time: 6.06003 ms -gemm time: 6.61794 - 
-kernel execution time: 9765.93 ms -taco reference time: 9766.85 - -sddmm-spmm-gemm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/consph/consph.mtx -ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); -B1_dimension: 83334, B2_dimension: 83334, vals: 6010480 -C1_dimension: 83334, C2_dimension: 64, vals: 5333376 -D1_dimension: 83334, D2_dimension: 64, vals: 5333376 -E1_dimension: 83334, E2_dimension: 64, vals: 5333376 -G1_dimension: 83334, G2_dimension: 64, vals: 4096 - - -kernel execution time: 16.3022 ms -fused time: 16.877 - -kernel execution time: 26.4065 ms -sddmm time: 26.9999 - -kernel execution time: 9.6103 ms -sddmm ryan time: 10.1859 - -kernel execution time: 9.5796 ms -spmm ryan time: 10.139 - -kernel execution time: 7.75909 ms -gemm time: 8.27337 - -kernel execution time: 14674.3 ms -taco reference time: 14675.2 - -sddmm-spmm-gemm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx -ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); -B1_dimension: 121192, B2_dimension: 121192, vals: 2624331 -C1_dimension: 121192, C2_dimension: 64, vals: 7756288 -D1_dimension: 121192, D2_dimension: 64, vals: 7756288 -E1_dimension: 121192, E2_dimension: 64, vals: 7756288 -G1_dimension: 121192, G2_dimension: 64, vals: 4096 - - -kernel execution time: 28.3919 ms -fused time: 29.022 - -kernel execution time: 28.7666 ms -sddmm time: 29.4282 - -kernel execution time: 10.9353 ms -sddmm ryan time: 11.5639 - -kernel execution time: 12.2792 ms -spmm ryan time: 12.86 - -kernel execution time: 12.0463 ms -gemm time: 12.6219 - -kernel execution time: 6496.16 ms -taco reference time: 6497.16 - -sddmm-spmm-gemm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/shipsec1/shipsec1.mtx -ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); -B1_dimension: 140874, B2_dimension: 140874, vals: 7813404 -C1_dimension: 
140874, C2_dimension: 64, vals: 9015936 -D1_dimension: 140874, D2_dimension: 64, vals: 9015936 -E1_dimension: 140874, E2_dimension: 64, vals: 9015936 -G1_dimension: 140874, G2_dimension: 64, vals: 4096 - - -kernel execution time: 23.8673 ms -fused time: 24.4851 - -kernel execution time: 38.4245 ms -sddmm time: 39.0808 - -kernel execution time: 13.3169 ms -sddmm ryan time: 13.9402 - -kernel execution time: 13.8214 ms -spmm ryan time: 14.3969 - -kernel execution time: 13.3955 ms -gemm time: 14.0084 - -kernel execution time: 19010.9 ms -taco reference time: 19012 - -sddmm-spmm-gemm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/scircuit/scircuit.mtx -ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); -B1_dimension: 170998, B2_dimension: 170998, vals: 958936 -C1_dimension: 170998, C2_dimension: 64, vals: 10943872 -D1_dimension: 170998, D2_dimension: 64, vals: 10943872 -E1_dimension: 170998, E2_dimension: 64, vals: 10943872 -G1_dimension: 170998, G2_dimension: 64, vals: 4096 - - -kernel execution time: 19.1593 ms -fused time: 19.7496 - -kernel execution time: 31.0395 ms -sddmm time: 31.6882 - -kernel execution time: 7.35776 ms -sddmm ryan time: 7.96434 - -kernel execution time: 9.33589 ms -spmm ryan time: 9.89731 - -kernel execution time: 16.4733 ms -gemm time: 17.0352 - -kernel execution time: 2397 ms -taco reference time: 2397.64 - -sddmm-spmm-gemm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx -ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); -B1_dimension: 1000005, B2_dimension: 1000005, vals: 3105536 -C1_dimension: 1000005, C2_dimension: 64, vals: 64000320 -D1_dimension: 1000005, D2_dimension: 64, vals: 64000320 -E1_dimension: 1000005, E2_dimension: 64, vals: 64000320 -G1_dimension: 1000005, G2_dimension: 64, vals: 4096 - - -kernel execution time: 66.7468 ms -fused time: 67.289 - -kernel execution time: 69.5837 ms -sddmm time: 70.1602 - 
-kernel execution time: 23.2899 ms -sddmm ryan time: 23.8277 - -kernel execution time: 41.9566 ms -spmm ryan time: 42.5095 - -kernel execution time: 93.8383 ms -gemm time: 94.3738 - -kernel execution time: 7587.7 ms -taco reference time: 7588.87 - -sddmm-spmm-gemm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx -ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); -B1_dimension: 5558326, B2_dimension: 5558326, vals: 59524291 -C1_dimension: 5558326, C2_dimension: 64, vals: 355732864 -D1_dimension: 5558326, D2_dimension: 64, vals: 355732864 -E1_dimension: 5558326, E2_dimension: 64, vals: 355732864 -G1_dimension: 5558326, G2_dimension: 64, vals: 4096 - - -kernel execution time: 688.492 ms -fused time: 689.478 - -kernel execution time: 979.86 ms -sddmm time: 980.45 - -kernel execution time: 318.248 ms -sddmm ryan time: 318.831 - -kernel execution time: 449.669 ms -spmm ryan time: 450.215 - -kernel execution time: 503.695 ms -gemm time: 504.291 - -kernel execution time: 326798 ms -taco reference time: 326799 - -sddmm-spmm-gemm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx -ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); -B1_dimension: 5558326, B2_dimension: 5558326, vals: 59524291 -C1_dimension: 5558326, C2_dimension: 64, vals: 355732864 -D1_dimension: 5558326, D2_dimension: 64, vals: 355732864 -E1_dimension: 5558326, E2_dimension: 64, vals: 355732864 -G1_dimension: 5558326, G2_dimension: 64, vals: 4096 - - -kernel execution time: 9624.7 ms -fused time: 9625.73 - -kernel execution time: 1635.76 ms -sddmm time: 1636.3 - -kernel execution time: 1636.41 ms -sddmm ryan time: 1636.96 - -kernel execution time: 2930.01 ms -spmm ryan time: 2930.5 - -kernel execution time: 15204.2 ms -gemm time: 15205.2 - -sddmm-spmm-gemm execution - ------------------------------------------ 
-/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx -ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); -B1_dimension: 10974, B2_dimension: 10974, vals: 428650 -C1_dimension: 10974, C2_dimension: 64, vals: 702336 -D1_dimension: 10974, D2_dimension: 64, vals: 702336 -E1_dimension: 10974, E2_dimension: 64, vals: 702336 -G1_dimension: 10974, G2_dimension: 64, vals: 4096 - - -kernel execution time: 31.0958 ms -fused time: 31.6403 - -kernel execution time: 9.52362 ms -sddmm time: 10.0411 - -kernel execution time: 9.50283 ms -sddmm ryan time: 9.98181 - -kernel execution time: 9.9883 ms -spmm ryan time: 10.3927 - -kernel execution time: 30.6724 ms -gemm time: 31.0956 - -kernel execution time: 50903.4 ms -taco reference time: 50904.4 - -sddmm-spmm-gemm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx -ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); -B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 -C1_dimension: 36417, C2_dimension: 64, vals: 2330688 -D1_dimension: 36417, D2_dimension: 64, vals: 2330688 -E1_dimension: 36417, E2_dimension: 64, vals: 2330688 -G1_dimension: 36417, G2_dimension: 64, vals: 4096 - - -kernel execution time: 221.251 ms -fused time: 223.31 - -kernel execution time: 90.6291 ms -sddmm time: 91.9017 - -kernel execution time: 92.6299 ms -sddmm ryan time: 93.1693 - -kernel execution time: 70.0109 ms -spmm ryan time: 70.4884 - -kernel execution time: 103.984 ms -gemm time: 105.217 - -kernel execution time: 441848 ms -taco reference time: 441849 - -sddmm-spmm-gemm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/rma10/rma10.mtx -ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); -B1_dimension: 46835, B2_dimension: 46835, vals: 2374001 -C1_dimension: 46835, C2_dimension: 64, vals: 2997440 -D1_dimension: 46835, D2_dimension: 64, vals: 2997440 -E1_dimension: 46835, E2_dimension: 64, vals: 2997440 
-G1_dimension: 46835, G2_dimension: 64, vals: 4096 - - -kernel execution time: 156.706 ms -fused time: 158.878 - -kernel execution time: 53.3541 ms -sddmm time: 53.8804 - -kernel execution time: 53.6128 ms -sddmm ryan time: 54.7942 - -kernel execution time: 51.5253 ms -spmm ryan time: 52.5961 - -kernel execution time: 130.147 ms -gemm time: 131.306 - -kernel execution time: 243737 ms -taco reference time: 243739 -/home/min/a/kadhitha/ispc-examples/data/suitesparse/cant/cant.mtx -ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); -B1_dimension: 62451, B2_dimension: 62451, vals: 4007383 -C1_dimension: 62451, C2_dimension: 64, vals: 3996864 -D1_dimension: 62451, D2_dimension: 64, vals: 3996864 -E1_dimension: 62451, E2_dimension: 64, vals: 3996864 -G1_dimension: 62451, G2_dimension: 64, vals: 4096 - - -kernel execution time: 238.619 ms -fused time: 240.152 - -kernel execution time: 84.8828 ms -sddmm time: 85.4286 - -kernel execution time: 80.7058 ms -sddmm ryan time: 81.2588 - -kernel execution time: 75.2549 ms -spmm ryan time: 75.7338 - -kernel execution time: 174.145 ms -gemm time: 174.654 - -kernel execution time: 412699 ms -taco reference time: 412701 -/home/min/a/kadhitha/ispc-examples/data/suitesparse/consph/consph.mtx -ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); -B1_dimension: 83334, B2_dimension: 83334, vals: 6010480 -C1_dimension: 83334, C2_dimension: 64, vals: 5333376 -D1_dimension: 83334, D2_dimension: 64, vals: 5333376 -E1_dimension: 83334, E2_dimension: 64, vals: 5333376 -G1_dimension: 83334, G2_dimension: 64, vals: 4096 - - -kernel execution time: 350.004 ms -fused time: 351.319 - -kernel execution time: 123.574 ms -sddmm time: 124.101 - -kernel execution time: 126.113 ms -sddmm ryan time: 127.971 - -kernel execution time: 113.146 ms -spmm ryan time: 113.615 - -kernel execution time: 234.287 ms -gemm time: 235.546 - -kernel execution time: 619783 ms -taco reference time: 619784 -/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx 
-ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); -B1_dimension: 121192, B2_dimension: 121192, vals: 2624331 -C1_dimension: 121192, C2_dimension: 64, vals: 7756288 -D1_dimension: 121192, D2_dimension: 64, vals: 7756288 -E1_dimension: 121192, E2_dimension: 64, vals: 7756288 -G1_dimension: 121192, G2_dimension: 64, vals: 4096 - - -kernel execution time: 335.548 ms -fused time: 337.292 - -kernel execution time: 90.8795 ms -sddmm time: 91.3981 - -kernel execution time: 87.7678 ms -sddmm ryan time: 88.2879 - -kernel execution time: 111.725 ms -spmm ryan time: 113.063 - -kernel execution time: 338.451 ms -gemm time: 340.2 - -kernel execution time: 268303 ms -taco reference time: 268304 -/home/min/a/kadhitha/ispc-examples/data/suitesparse/shipsec1/shipsec1.mtx -ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); -B1_dimension: 140874, B2_dimension: 140874, vals: 7813404 -C1_dimension: 140874, C2_dimension: 64, vals: 9015936 -D1_dimension: 140874, D2_dimension: 64, vals: 9015936 -E1_dimension: 140874, E2_dimension: 64, vals: 9015936 -G1_dimension: 140874, G2_dimension: 64, vals: 4096 - - -kernel execution time: 488.065 ms -fused time: 489.312 - -kernel execution time: 161.434 ms -sddmm time: 163.199 - -kernel execution time: 164.295 ms -sddmm ryan time: 165.567 - -kernel execution time: 154.131 ms -spmm ryan time: 154.61 - -kernel execution time: 391.972 ms -gemm time: 393.242 - -kernel execution time: 798245 ms -taco reference time: 798247 -/home/min/a/kadhitha/ispc-examples/data/suitesparse/scircuit/scircuit.mtx -ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); -B1_dimension: 170998, B2_dimension: 170998, vals: 958936 -C1_dimension: 170998, C2_dimension: 64, vals: 10943872 -D1_dimension: 170998, D2_dimension: 64, vals: 10943872 -E1_dimension: 170998, E2_dimension: 64, vals: 10943872 -G1_dimension: 170998, G2_dimension: 64, vals: 4096 - - -kernel execution time: 279.308 ms -fused time: 280.422 - -kernel execution time: 41.2598 ms -sddmm time: 41.7727 - -kernel execution time: 40.3132 ms 
-sddmm ryan time: 40.882 - -kernel execution time: 72.4795 ms -spmm ryan time: 73.6321 - -kernel execution time: 473.298 ms -gemm time: 474.582 - -kernel execution time: 98095.7 ms -taco reference time: 98098.4 -/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx -ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); -B1_dimension: 206500, B2_dimension: 206500, vals: 1273389 -C1_dimension: 206500, C2_dimension: 64, vals: 13216000 -D1_dimension: 206500, D2_dimension: 64, vals: 13216000 -E1_dimension: 206500, E2_dimension: 64, vals: 13216000 -G1_dimension: 206500, G2_dimension: 64, vals: 4096 - - -kernel execution time: 321.827 ms -fused time: 322.725 - -kernel execution time: 43.7794 ms -sddmm time: 44.8964 - -kernel execution time: 42.531 ms -sddmm ryan time: 43.7502 - -kernel execution time: 83.5305 ms -spmm ryan time: 84.0178 - -kernel execution time: 567.368 ms -gemm time: 567.876 - -kernel execution time: 130204 ms -taco reference time: 130207 -/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx -ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); -B1_dimension: 1000005, B2_dimension: 1000005, vals: 3105536 -C1_dimension: 1000005, C2_dimension: 64, vals: 64000320 -D1_dimension: 1000005, D2_dimension: 64, vals: 64000320 -E1_dimension: 1000005, E2_dimension: 64, vals: 64000320 -G1_dimension: 1000005, G2_dimension: 64, vals: 4096 - - -kernel execution time: 1355.72 ms -fused time: 1357.14 - -kernel execution time: 98.94 ms -sddmm time: 101.488 - -kernel execution time: 97.8972 ms -sddmm ryan time: 98.4423 - -kernel execution time: 218.188 ms -spmm ryan time: 219.39 - -kernel execution time: 2744.38 ms -gemm time: 2744.89 - -kernel execution time: 320035 ms -taco reference time: 320037 -/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx -ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); -B1_dimension: 5558326, B2_dimension: 5558326, vals: 59524291 -C1_dimension: 5558326, C2_dimension: 64, vals: 355732864 
-D1_dimension: 5558326, D2_dimension: 64, vals: 355732864 -E1_dimension: 5558326, E2_dimension: 64, vals: 355732864 -G1_dimension: 5558326, G2_dimension: 64, vals: 4096 - - -kernel execution time: 9682.48 ms -fused time: 9684.45 - -kernel execution time: 1640.01 ms -sddmm time: 1641.3 - -kernel execution time: 1626.66 ms -sddmm ryan time: 1628.12 - -kernel execution time: 2908.47 ms -spmm ryan time: 2908.94 - -kernel execution time: 15252.4 ms -gemm time: 15253.4 - -kernel execution time: 6.11703e+06 ms -taco reference time: 6.11703e+06 - -sddmm-spmm-gemm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/rma10/rma10.mtx -ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); -B1_dimension: 46835, B2_dimension: 46835, vals: 2374001 -C1_dimension: 46835, C2_dimension: 64, vals: 2997440 -D1_dimension: 46835, D2_dimension: 64, vals: 2997440 -E1_dimension: 46835, E2_dimension: 64, vals: 2997440 -G1_dimension: 46835, G2_dimension: 64, vals: 4096 - - -kernel execution time: 7.90719 ms -fused time: 12.4475 - -kernel execution time: 15.0235 ms -sddmm time: 18.4078 - -kernel execution time: 3.60187 ms -sddmm ryan time: 7.64096 - -kernel execution time: 4.26585 ms -spmm ryan time: 7.23736 - -kernel execution time: 5.51232 ms -gemm time: 8.94274 - -kernel execution time: 5900.92 ms -taco reference time: 5901.77 -/home/min/a/kadhitha/ispc-examples/data/suitesparse/cant/cant.mtx -ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); -B1_dimension: 62451, B2_dimension: 62451, vals: 4007383 -C1_dimension: 62451, C2_dimension: 64, vals: 3996864 -D1_dimension: 62451, D2_dimension: 64, vals: 3996864 -E1_dimension: 62451, E2_dimension: 64, vals: 3996864 -G1_dimension: 62451, G2_dimension: 64, vals: 4096 - - -kernel execution time: 11.9944 ms -fused time: 15.5065 - -kernel execution time: 17.5788 ms -sddmm time: 18.2088 - -kernel execution time: 6.90362 ms -sddmm ryan time: 9.18146 - -kernel execution time: 6.52502 ms -spmm ryan time: 7.08577 - 
-kernel execution time: 5.70869 ms -gemm time: 6.23327 - -kernel execution time: 9752.35 ms -taco reference time: 9753.37 -/home/min/a/kadhitha/ispc-examples/data/suitesparse/consph/consph.mtx -ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); -B1_dimension: 83334, B2_dimension: 83334, vals: 6010480 -C1_dimension: 83334, C2_dimension: 64, vals: 5333376 -D1_dimension: 83334, D2_dimension: 64, vals: 5333376 -E1_dimension: 83334, E2_dimension: 64, vals: 5333376 -G1_dimension: 83334, G2_dimension: 64, vals: 4096 - - -kernel execution time: 16.1703 ms -fused time: 19.9224 - -kernel execution time: 26.3346 ms -sddmm time: 30.1538 - -kernel execution time: 9.47197 ms -sddmm ryan time: 12.7137 - -kernel execution time: 9.14926 ms -spmm ryan time: 9.78178 - -kernel execution time: 8.06171 ms -gemm time: 8.592 - -kernel execution time: 14612.6 ms -taco reference time: 14617.7 -/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx -ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); -B1_dimension: 121192, B2_dimension: 121192, vals: 2624331 -C1_dimension: 121192, C2_dimension: 64, vals: 7756288 -D1_dimension: 121192, D2_dimension: 64, vals: 7756288 -E1_dimension: 121192, E2_dimension: 64, vals: 7756288 -G1_dimension: 121192, G2_dimension: 64, vals: 4096 - - -kernel execution time: 28.2581 ms -fused time: 32.7167 - -kernel execution time: 30.162 ms -sddmm time: 33.8587 - -kernel execution time: 11.0142 ms -sddmm ryan time: 15.2742 - -kernel execution time: 12.1744 ms -spmm ryan time: 15.0065 - -kernel execution time: 11.4579 ms -gemm time: 14.5527 - -kernel execution time: 6379.22 ms -taco reference time: 6380.3 -/home/min/a/kadhitha/ispc-examples/data/suitesparse/shipsec1/shipsec1.mtx -ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); -B1_dimension: 140874, B2_dimension: 140874, vals: 7813404 -C1_dimension: 140874, C2_dimension: 64, vals: 9015936 -D1_dimension: 140874, D2_dimension: 64, vals: 9015936 -E1_dimension: 140874, E2_dimension: 64, vals: 9015936 -G1_dimension: 
140874, G2_dimension: 64, vals: 4096 - - -kernel execution time: 24.3937 ms -fused time: 28.6422 - -kernel execution time: 37.2457 ms -sddmm time: 41.311 - -kernel execution time: 13.8503 ms -sddmm ryan time: 17.9583 - -kernel execution time: 14.2713 ms -spmm ryan time: 17.1402 - -kernel execution time: 13.6024 ms -gemm time: 16.6078 - -kernel execution time: 18993.5 ms -taco reference time: 18994.5 -/home/min/a/kadhitha/ispc-examples/data/suitesparse/scircuit/scircuit.mtx -ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); -B1_dimension: 170998, B2_dimension: 170998, vals: 958936 -C1_dimension: 170998, C2_dimension: 64, vals: 10943872 -D1_dimension: 170998, D2_dimension: 64, vals: 10943872 -E1_dimension: 170998, E2_dimension: 64, vals: 10943872 -G1_dimension: 170998, G2_dimension: 64, vals: 4096 - - -kernel execution time: 18.4645 ms -fused time: 22.0711 - -kernel execution time: 31.6844 ms -sddmm time: 34.9774 - -kernel execution time: 7.19931 ms -sddmm ryan time: 11.584 - -kernel execution time: 9.40139 ms -spmm ryan time: 10.002 - -kernel execution time: 16.3933 ms -gemm time: 19.0699 - -kernel execution time: 2325.51 ms -taco reference time: 2326.19 -/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx -ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); -B1_dimension: 206500, B2_dimension: 206500, vals: 1273389 -C1_dimension: 206500, C2_dimension: 64, vals: 13216000 -D1_dimension: 206500, D2_dimension: 64, vals: 13216000 -E1_dimension: 206500, E2_dimension: 64, vals: 13216000 -G1_dimension: 206500, G2_dimension: 64, vals: 4096 - - -kernel execution time: 25.9398 ms -fused time: 30.7713 - -kernel execution time: 43.1619 ms -sddmm time: 47.1566 - -kernel execution time: 9.47076 ms -sddmm ryan time: 12.9736 - -kernel execution time: 12.1315 ms -spmm ryan time: 12.7125 - -kernel execution time: 19.8795 ms -gemm time: 23.9233 - -kernel execution time: 3085.34 ms -taco reference time: 3087.4 
-/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx -ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); -B1_dimension: 1000005, B2_dimension: 1000005, vals: 3105536 -C1_dimension: 1000005, C2_dimension: 64, vals: 64000320 -D1_dimension: 1000005, D2_dimension: 64, vals: 64000320 -E1_dimension: 1000005, E2_dimension: 64, vals: 64000320 -G1_dimension: 1000005, G2_dimension: 64, vals: 4096 - - -kernel execution time: 68.9391 ms -fused time: 73.2143 - -kernel execution time: 68.0597 ms -sddmm time: 71.8136 - -kernel execution time: 23.658 ms -sddmm ryan time: 27.2015 - -kernel execution time: 42.2166 ms -spmm ryan time: 45.3816 - -kernel execution time: 91.7085 ms -gemm time: 94.965 - -kernel execution time: 7504.53 ms -taco reference time: 7510.21 -/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx -ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); -B1_dimension: 5558326, B2_dimension: 5558326, vals: 59524291 -C1_dimension: 5558326, C2_dimension: 64, vals: 355732864 -D1_dimension: 5558326, D2_dimension: 64, vals: 355732864 -E1_dimension: 5558326, E2_dimension: 64, vals: 355732864 -G1_dimension: 5558326, G2_dimension: 64, vals: 4096 - - -kernel execution time: 685.25 ms -fused time: 691.004 - -kernel execution time: 978.107 ms -sddmm time: 982.105 - -kernel execution time: 314.889 ms -sddmm ryan time: 319.437 - -kernel execution time: 451.321 ms -spmm ryan time: 454.339 - -kernel execution time: 511.771 ms -gemm time: 516.049 - -kernel execution time: 324954 ms -taco reference time: 324960 - -sddmm-spmm-gemm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx -ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); -B1_dimension: 10974, B2_dimension: 10974, vals: 428650 -C1_dimension: 10974, C2_dimension: 64, vals: 702336 -D1_dimension: 10974, D2_dimension: 64, vals: 702336 -E1_dimension: 10974, E2_dimension: 64, vals: 702336 -G1_dimension: 10974, G2_dimension: 64, vals: 
4096 - - -kernel execution time: 2.03017 ms -fused time: 6.89988 - -kernel execution time: 4.23176 ms -sddmm time: 4.56628 - -kernel execution time: 1.07066 ms -sddmm ryan time: 1.60331 - -kernel execution time: 1.04047 ms -spmm ryan time: 1.84411 - -kernel execution time: 1.58419 ms -gemm time: 3.49011 - -kernel execution time: 1168.5 ms -taco reference time: 1172.82 -/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx -ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); -B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 -C1_dimension: 36417, C2_dimension: 64, vals: 2330688 -D1_dimension: 36417, D2_dimension: 64, vals: 2330688 -E1_dimension: 36417, E2_dimension: 64, vals: 2330688 -G1_dimension: 36417, G2_dimension: 64, vals: 4096 - - -kernel execution time: 8.02954 ms -fused time: 12.4005 - -kernel execution time: 12.7753 ms -sddmm time: 15.6047 - -kernel execution time: 4.73627 ms -sddmm ryan time: 8.24994 - -kernel execution time: 4.90489 ms -spmm ryan time: 5.40766 - -kernel execution time: 2.99487 ms -gemm time: 3.53289 - -kernel execution time: 10658.1 ms -taco reference time: 10661.2 -/home/min/a/kadhitha/ispc-examples/data/suitesparse/rma10/rma10.mtx -ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); -B1_dimension: 46835, B2_dimension: 46835, vals: 2374001 -C1_dimension: 46835, C2_dimension: 64, vals: 2997440 -D1_dimension: 46835, D2_dimension: 64, vals: 2997440 -E1_dimension: 46835, E2_dimension: 64, vals: 2997440 -G1_dimension: 46835, G2_dimension: 64, vals: 4096 - - -kernel execution time: 7.15818 ms -fused time: 11.6143 - -kernel execution time: 15.0391 ms -sddmm time: 18.5456 - -kernel execution time: 3.33442 ms -sddmm ryan time: 6.94621 - -kernel execution time: 4.13895 ms -spmm ryan time: 7.49526 - -kernel execution time: 3.79939 ms -gemm time: 4.19085 - -kernel execution time: 5801.87 ms -taco reference time: 5803.1 -/home/min/a/kadhitha/ispc-examples/data/suitesparse/cant/cant.mtx -ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); 
-B1_dimension: 62451, B2_dimension: 62451, vals: 4007383 -C1_dimension: 62451, C2_dimension: 64, vals: 3996864 -D1_dimension: 62451, D2_dimension: 64, vals: 3996864 -E1_dimension: 62451, E2_dimension: 64, vals: 3996864 -G1_dimension: 62451, G2_dimension: 64, vals: 4096 - - -kernel execution time: 12.0771 ms -fused time: 16.6939 - -kernel execution time: 17.5697 ms -sddmm time: 18.7919 - -kernel execution time: 6.94731 ms -sddmm ryan time: 11.0254 - -kernel execution time: 7.03752 ms -spmm ryan time: 8.55729 - -kernel execution time: 5.18056 ms -gemm time: 8.22984 - -kernel execution time: 9735.41 ms -taco reference time: 9737.5 -/home/min/a/kadhitha/ispc-examples/data/suitesparse/consph/consph.mtx -ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); -B1_dimension: 83334, B2_dimension: 83334, vals: 6010480 -C1_dimension: 83334, C2_dimension: 64, vals: 5333376 -D1_dimension: 83334, D2_dimension: 64, vals: 5333376 -E1_dimension: 83334, E2_dimension: 64, vals: 5333376 -G1_dimension: 83334, G2_dimension: 64, vals: 4096 - - -kernel execution time: 16.2173 ms -fused time: 20.4628 - -kernel execution time: 26.5883 ms -sddmm time: 30.2732 - -kernel execution time: 9.67928 ms -sddmm ryan time: 13.4002 - -kernel execution time: 9.46597 ms -spmm ryan time: 12.3215 - -kernel execution time: 6.14851 ms -gemm time: 6.79689 - -kernel execution time: 14647.4 ms -taco reference time: 14648.9 -/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx -ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); -B1_dimension: 121192, B2_dimension: 121192, vals: 2624331 -C1_dimension: 121192, C2_dimension: 64, vals: 7756288 -D1_dimension: 121192, D2_dimension: 64, vals: 7756288 -E1_dimension: 121192, E2_dimension: 64, vals: 7756288 -G1_dimension: 121192, G2_dimension: 64, vals: 4096 - - -kernel execution time: 28.0895 ms -fused time: 33.0632 - -kernel execution time: 29.4447 ms -sddmm time: 33.2669 - -kernel execution time: 10.992 ms -sddmm ryan time: 15.1462 - -kernel execution time: 
12.2197 ms -spmm ryan time: 14.8823 - -kernel execution time: 9.1576 ms -gemm time: 12.476 - -kernel execution time: 6388.6 ms -taco reference time: 6389.71 -/home/min/a/kadhitha/ispc-examples/data/suitesparse/shipsec1/shipsec1.mtx -ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); -B1_dimension: 140874, B2_dimension: 140874, vals: 7813404 -C1_dimension: 140874, C2_dimension: 64, vals: 9015936 -D1_dimension: 140874, D2_dimension: 64, vals: 9015936 -E1_dimension: 140874, E2_dimension: 64, vals: 9015936 -G1_dimension: 140874, G2_dimension: 64, vals: 4096 - - -kernel execution time: 24.4023 ms -fused time: 28.7813 - -kernel execution time: 37.3163 ms -sddmm time: 41.2616 - -kernel execution time: 13.8084 ms -sddmm ryan time: 17.1208 - -kernel execution time: 14.1626 ms -spmm ryan time: 17.3487 - -kernel execution time: 10.2461 ms -gemm time: 10.8026 - -kernel execution time: 19008 ms -taco reference time: 19013 -/home/min/a/kadhitha/ispc-examples/data/suitesparse/scircuit/scircuit.mtx -ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); -B1_dimension: 170998, B2_dimension: 170998, vals: 958936 -C1_dimension: 170998, C2_dimension: 64, vals: 10943872 -D1_dimension: 170998, D2_dimension: 64, vals: 10943872 -E1_dimension: 170998, E2_dimension: 64, vals: 10943872 -G1_dimension: 170998, G2_dimension: 64, vals: 4096 - - -kernel execution time: 18.5328 ms -fused time: 21.8578 - -kernel execution time: 29.8727 ms -sddmm time: 32.6967 - -kernel execution time: 7.1244 ms -sddmm ryan time: 10.2857 - -kernel execution time: 8.9243 ms -spmm ryan time: 9.54503 - -kernel execution time: 12.6159 ms -gemm time: 13.2038 - -kernel execution time: 2326 ms -taco reference time: 2326.66 -/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx -ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); -B1_dimension: 206500, B2_dimension: 206500, vals: 1273389 -C1_dimension: 206500, C2_dimension: 64, vals: 13216000 -D1_dimension: 206500, D2_dimension: 64, vals: 13216000 -E1_dimension: 
206500, E2_dimension: 64, vals: 13216000 -G1_dimension: 206500, G2_dimension: 64, vals: 4096 - - -kernel execution time: 25.7525 ms -fused time: 27.0427 - -kernel execution time: 40.701 ms -sddmm time: 44.8629 - -kernel execution time: 9.61808 ms -sddmm ryan time: 13.4076 - -kernel execution time: 12.4322 ms -spmm ryan time: 15.2811 - -kernel execution time: 15.1033 ms -gemm time: 17.9102 - -kernel execution time: 3091.33 ms -taco reference time: 3092.53 -/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx -ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); -B1_dimension: 1000005, B2_dimension: 1000005, vals: 3105536 -C1_dimension: 1000005, C2_dimension: 64, vals: 64000320 -D1_dimension: 1000005, D2_dimension: 64, vals: 64000320 -E1_dimension: 1000005, E2_dimension: 64, vals: 64000320 -G1_dimension: 1000005, G2_dimension: 64, vals: 4096 - - -kernel execution time: 68.4469 ms -fused time: 72.7982 - -kernel execution time: 52.1276 ms -sddmm time: 56.0577 - -kernel execution time: 23.4796 ms -sddmm ryan time: 27.0851 - -kernel execution time: 42.2008 ms -spmm ryan time: 45.2618 - -kernel execution time: 74.1167 ms -gemm time: 78.5888 - -kernel execution time: 7502.71 ms -taco reference time: 7508.45 -/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx -ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); -B1_dimension: 5558326, B2_dimension: 5558326, vals: 59524291 -C1_dimension: 5558326, C2_dimension: 64, vals: 355732864 -D1_dimension: 5558326, D2_dimension: 64, vals: 355732864 -E1_dimension: 5558326, E2_dimension: 64, vals: 355732864 -G1_dimension: 5558326, G2_dimension: 64, vals: 4096 - - -kernel execution time: 684.483 ms -fused time: 689.124 - -kernel execution time: 889.925 ms -sddmm time: 894.03 - -kernel execution time: 315.322 ms -sddmm ryan time: 319.629 - -kernel execution time: 449.91 ms -spmm ryan time: 453.686 - -kernel execution time: 417.449 ms -gemm time: 421.26 - -kernel execution time: 326305 ms -taco reference time: 
326311 - -sddmm-spmm-gemm execution - ------------------------------------------ -/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx -ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); -B1_dimension: 2708, B2_dimension: 2708, vals: 5429 -C1_dimension: 2708, C2_dimension: 64, vals: 173312 -D1_dimension: 2708, D2_dimension: 64, vals: 173312 -E1_dimension: 2708, E2_dimension: 64, vals: 173312 -G1_dimension: 2708, G2_dimension: 64, vals: 4096 - - -kernel execution time: 5.08607 ms -fused time: 5.61989 - -kernel execution time: 0.557608 ms -sddmm time: 0.871642 - -kernel execution time: 0.465526 ms -sddmm ryan time: 0.7713 - -kernel execution time: 0.498686 ms -spmm ryan time: 0.739309 - -kernel execution time: 0.7957 ms -gemm time: 1.05919 - -kernel execution time: 42.447 ms -taco reference time: 42.885 -/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/amazon.mtx -ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); -B1_dimension: 548551, B2_dimension: 548551, vals: 1851744 -C1_dimension: 548551, C2_dimension: 64, vals: 35107264 -D1_dimension: 548551, D2_dimension: 64, vals: 35107264 -E1_dimension: 548551, E2_dimension: 64, vals: 35107264 -G1_dimension: 548551, G2_dimension: 64, vals: 4096 - - -kernel execution time: 89.9099 ms -fused time: 90.5117 - -kernel execution time: 29.9086 ms -sddmm time: 30.4936 - -kernel execution time: 29.1529 ms -sddmm ryan time: 29.7063 - -kernel execution time: 34.6318 ms -spmm ryan time: 35.1535 - -kernel execution time: 66.4663 ms -gemm time: 67.0316 - -kernel execution time: 6272.25 ms -taco reference time: 6273.42 - -sddmm-spmm-gemm execution - ------------------------------------------ -/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx -ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); -B1_dimension: 2708, B2_dimension: 2708, vals: 5429 -C1_dimension: 2708, C2_dimension: 64, vals: 173312 -D1_dimension: 2708, D2_dimension: 64, vals: 173312 -E1_dimension: 2708, E2_dimension: 64, vals: 173312 -G1_dimension: 
2708, G2_dimension: 64, vals: 4096 - - -kernel execution time: 3.72391 ms -fused time: 4.19698 - -kernel execution time: 0.585647 ms -sddmm time: 0.893112 - -kernel execution time: 0.483056 ms -sddmm ryan time: 0.79108 - -kernel execution time: 0.567518 ms -spmm ryan time: 0.808711 - -kernel execution time: 0.929183 ms -gemm time: 1.32543 - -kernel execution time: 35.7066 ms -taco reference time: 36.3331 -/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/amazon.mtx -ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); -B1_dimension: 548551, B2_dimension: 548551, vals: 1851744 -C1_dimension: 548551, C2_dimension: 64, vals: 35107264 -D1_dimension: 548551, D2_dimension: 64, vals: 35107264 -E1_dimension: 548551, E2_dimension: 64, vals: 35107264 -G1_dimension: 548551, G2_dimension: 64, vals: 4096 - - -kernel execution time: 94.9377 ms -fused time: 95.7687 - -kernel execution time: 32.2051 ms -sddmm time: 32.7881 - -kernel execution time: 30.3982 ms -sddmm ryan time: 30.95 - -kernel execution time: 34.4172 ms -spmm ryan time: 34.9049 - -kernel execution time: 67.2709 ms -gemm time: 67.8035 - -kernel execution time: 6215.08 ms -taco reference time: 6216.26 - -sddmm-spmm-gemm execution - ------------------------------------------ -/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx -ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); -B1_dimension: 2708, B2_dimension: 2708, vals: 5429 -C1_dimension: 2708, C2_dimension: 64, vals: 173312 -D1_dimension: 2708, D2_dimension: 64, vals: 173312 -E1_dimension: 2708, E2_dimension: 64, vals: 173312 -G1_dimension: 2708, G2_dimension: 64, vals: 4096 - - -kernel execution time: 6.99173 ms -fused time: 7.86448 - -kernel execution time: 0.78061 ms -sddmm time: 1.28867 - -kernel execution time: 0.554227 ms -sddmm ryan time: 0.837111 - -kernel execution time: 0.909912 ms -spmm ryan time: 1.12908 - -kernel execution time: 7.60724 ms -gemm time: 7.85047 - -kernel execution time: 652.888 ms -taco reference time: 653.271 
-/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/amazon.mtx -ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); -B1_dimension: 548551, B2_dimension: 548551, vals: 1851744 -C1_dimension: 548551, C2_dimension: 64, vals: 35107264 -D1_dimension: 548551, D2_dimension: 64, vals: 35107264 -E1_dimension: 548551, E2_dimension: 64, vals: 35107264 -G1_dimension: 548551, G2_dimension: 64, vals: 4096 - - -kernel execution time: 1236.33 ms -fused time: 1236.87 - -kernel execution time: 249.805 ms -sddmm time: 250.356 - -kernel execution time: 247.195 ms -sddmm ryan time: 247.729 - -kernel execution time: 285.764 ms -spmm ryan time: 286.235 - -kernel execution time: 1529.34 ms -gemm time: 1529.83 - -kernel execution time: 190620 ms -taco reference time: 190621 - -sddmm-spmm-gemm execution - ------------------------------------------ -/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx -ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); -B1_dimension: 2708, B2_dimension: 2708, vals: 5429 -C1_dimension: 2708, C2_dimension: 64, vals: 173312 -D1_dimension: 2708, D2_dimension: 64, vals: 173312 -E1_dimension: 2708, E2_dimension: 64, vals: 173312 -G1_dimension: 2708, G2_dimension: 64, vals: 4096 - - -kernel execution time: 1.86163 ms -fused time: 2.34746 - -kernel execution time: 0.542927 ms -sddmm time: 1.05528 - -kernel execution time: 0.541998 ms -sddmm ryan time: 1.07672 - -kernel execution time: 0.524767 ms -spmm ryan time: 0.944293 - -kernel execution time: 0.75947 ms -gemm time: 1.2162 - -kernel execution time: 36.3755 ms -taco reference time: 37.0989 - -sddmm-spmm-gemm execution - ------------------------------------------ -/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx -ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); -B1_dimension: 2708, B2_dimension: 2708, vals: 5429 -C1_dimension: 2708, C2_dimension: 64, vals: 173312 -D1_dimension: 2708, D2_dimension: 64, vals: 173312 -E1_dimension: 2708, E2_dimension: 64, vals: 173312 -G1_dimension: 2708, 
G2_dimension: 64, vals: 4096 - - -kernel execution time: 1.97375 ms -fused time: 2.84436 - -kernel execution time: 0.881212 ms -sddmm time: 1.38907 - -kernel execution time: 0.545557 ms -sddmm ryan time: 1.0807 - -kernel execution time: 0.548488 ms -spmm ryan time: 0.978813 - -kernel execution time: 0.72955 ms -gemm time: 1.2023 - -kernel execution time: 34.867 ms -taco reference time: 35.5819 - -sddmm-spmm-gemm execution - ------------------------------------------ -/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx -ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); -B1_dimension: 2708, B2_dimension: 2708, vals: 5429 -C1_dimension: 2708, C2_dimension: 64, vals: 173312 -D1_dimension: 2708, D2_dimension: 64, vals: 173312 -E1_dimension: 2708, E2_dimension: 64, vals: 173312 -G1_dimension: 2708, G2_dimension: 64, vals: 4096 - - -kernel execution time: 1.69165 ms -fused time: 2.2114 - -kernel execution time: 0.908102 ms -sddmm time: 1.19792 - -kernel execution time: 0.513137 ms -sddmm ryan time: 0.807571 - -kernel execution time: 0.510327 ms -spmm ryan time: 0.76134 - -kernel execution time: 0.803101 ms -gemm time: 1.0684 - -kernel execution time: 45.9784 ms -taco reference time: 46.3901 - -sddmm-spmm-gemm execution - ------------------------------------------ -/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx -ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); -B1_dimension: 2708, B2_dimension: 2708, vals: 5429 -C1_dimension: 2708, C2_dimension: 64, vals: 173312 -D1_dimension: 2708, D2_dimension: 64, vals: 173312 -E1_dimension: 2708, E2_dimension: 64, vals: 173312 -G1_dimension: 2708, G2_dimension: 64, vals: 4096 - - -kernel execution time: 1.82354 ms -fused time: 2.81223 - -kernel execution time: 0.926052 ms -sddmm time: 1.48292 - -kernel execution time: 0.564157 ms -sddmm ryan time: 1.14611 - -kernel execution time: 0.512447 ms -spmm ryan time: 0.925102 - -kernel execution time: 0.689109 ms -gemm time: 1.08196 - -kernel execution time: 34.7847 
ms -taco reference time: 35.4182 - -sddmm-spmm-gemm execution - ------------------------------------------ -/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx -ref(i,m)=B(i,j)*C(i,k)*D(j,k)*F(j,l)*G(l,m); -B1_dimension: 2708, B2_dimension: 2708, vals: 5429 -C1_dimension: 2708, C2_dimension: 64, vals: 173312 -D1_dimension: 2708, D2_dimension: 64, vals: 173312 -E1_dimension: 2708, E2_dimension: 64, vals: 173312 -G1_dimension: 2708, G2_dimension: 64, vals: 4096 - - -kernel execution time: 6.8174 ms -fused time: 7.69061 - -kernel execution time: 0.935843 ms -sddmm time: 1.46847 - -kernel execution time: 0.612468 ms -sddmm ryan time: 0.880662 - -kernel execution time: 0.831351 ms -spmm ryan time: 1.05745 - -kernel execution time: 7.58342 ms -gemm time: 7.82297 - -kernel execution time: 566.881 ms -taco reference time: 567.264 diff --git a/test/stats/sddmm-spmm.txt b/test/stats/sddmm-spmm.txt deleted file mode 100644 index df8d924b8..000000000 --- a/test/stats/sddmm-spmm.txt +++ /dev/null @@ -1,5995 +0,0 @@ - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/synthetic/synthetic.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 102, B2_dimension: 103, vals: 3149 -C1_dimension: 102, C2_dimension: 64, vals: 6528 -D1_dimension: 103, D2_dimension: 64, vals: 6592 -E1_dimension: 103, E2_dimension: 48, vals: 4944 - - -kernel execution time: 6223.98 ms -fused time: 6225.14 - -kernel execution time: 3659.4 ms -sddmm time: 3660.83 - -kernel execution time: 3145.85 ms -spmm time: 3146.77 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 10974, B2_dimension: 10974, vals: 428650 -C1_dimension: 10974, C2_dimension: 64, vals: 702336 -D1_dimension: 10974, D2_dimension: 64, vals: 702336 -E1_dimension: 10974, E2_dimension: 64, vals: 702336 - - -kernel 
execution time: 17.1703 ms -fused time: 17.6378 - -kernel execution time: 8.23135 ms -sddmm time: 8.77073 - -kernel execution time: 19.3034 ms -spmm time: 19.7426 - -kernel execution time: 514.133 ms -taco reference time: 514.662 - -mtx dim1 dim2 nnz fused sddmm spmm taco-original -bcsstk17 10974 10974 428650 17.1703 8.23135 19.3034 514.662 - - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 -C1_dimension: 36417, C2_dimension: 64, vals: 2330688 -D1_dimension: 36417, D2_dimension: 64, vals: 2330688 -E1_dimension: 36417, E2_dimension: 64, vals: 2330688 - - -kernel execution time: 163.616 ms -fused time: 164.099 - -kernel execution time: 81.2672 ms -sddmm time: 81.8014 - -kernel execution time: 294.454 ms -spmm time: 294.968 - -kernel execution time: 5149.58 ms -taco reference time: 5150.58 - -mtx dim1 dim2 nnz fused sddmm spmm taco-original -bcsstk17 10974 10974 428650 17.1703 8.23135 19.3034 514.662 -pdb1HYS 36417 36417 4344765 163.616 81.2672 294.454 5149.58 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/rma10/rma10.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 46835, B2_dimension: 46835, vals: 2374001 -C1_dimension: 46835, C2_dimension: 64, vals: 2997440 -D1_dimension: 46835, D2_dimension: 64, vals: 2997440 -E1_dimension: 46835, E2_dimension: 64, vals: 2997440 - - -kernel execution time: 92.8319 ms -fused time: 93.3139 - -kernel execution time: 45.3221 ms -sddmm time: 45.8599 - -kernel execution time: 136.693 ms -spmm time: 137.198 - -kernel execution time: 2824.95 ms -taco reference time: 2825.53 - -mtx dim1 dim2 nnz fused sddmm spmm taco-original -bcsstk17 10974 10974 428650 17.1703 8.23135 19.3034 514.662 -pdb1HYS 36417 36417 4344765 163.616 81.2672 294.454 5149.58 -rma10 46835 46835 2374001 
92.8319 45.3221 136.693 2824.95 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/cant/cant.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 62451, B2_dimension: 62451, vals: 4007383 -C1_dimension: 62451, C2_dimension: 64, vals: 3996864 -D1_dimension: 62451, D2_dimension: 64, vals: 3996864 -E1_dimension: 62451, E2_dimension: 64, vals: 3996864 - - -kernel execution time: 153.867 ms -fused time: 154.368 - -kernel execution time: 74.9071 ms -sddmm time: 75.4719 - -kernel execution time: 258.678 ms -spmm time: 259.209 - -kernel execution time: 4786.95 ms -taco reference time: 4788.05 - -mtx dim1 dim2 nnz fused sddmm spmm taco-original -bcsstk17 10974 10974 428650 17.1703 8.23135 19.3034 514.662 -pdb1HYS 36417 36417 4344765 163.616 81.2672 294.454 5149.58 -rma10 46835 46835 2374001 92.8319 45.3221 136.693 2824.95 -cant 62451 62451 4007383 153.867 74.9071 258.678 4786.95 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/consph/consph.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 83334, B2_dimension: 83334, vals: 6010480 -C1_dimension: 83334, C2_dimension: 64, vals: 5333376 -D1_dimension: 83334, D2_dimension: 64, vals: 5333376 -E1_dimension: 83334, E2_dimension: 64, vals: 5333376 - - -kernel execution time: 231.253 ms -fused time: 231.75 - -kernel execution time: 112.863 ms -sddmm time: 113.405 - -kernel execution time: 417.749 ms -spmm time: 418.285 - -kernel execution time: 7133.75 ms -taco reference time: 7134.88 - -consph 83334 83334 6010480 231.253 112.863 417.749 7133.75 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 121192, B2_dimension: 121192, vals: 2624331 -C1_dimension: 121192, C2_dimension: 64, vals: 7756288 -D1_dimension: 121192, D2_dimension: 64, 
vals: 7756288 -E1_dimension: 121192, E2_dimension: 64, vals: 7756288 - - -kernel execution time: 192.743 ms -fused time: 193.23 - -kernel execution time: 85.0563 ms -sddmm time: 85.6227 - -kernel execution time: 150.367 ms -spmm time: 150.908 - -kernel execution time: 3285.24 ms -taco reference time: 3286.37 - -cop20k_A 121192 121192 2624331 192.743 85.0563 150.367 3285.24 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/shipsec1/shipsec1.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 140874, B2_dimension: 140874, vals: 7813404 -C1_dimension: 140874, C2_dimension: 64, vals: 9015936 -D1_dimension: 140874, D2_dimension: 64, vals: 9015936 -E1_dimension: 140874, E2_dimension: 64, vals: 9015936 - - -kernel execution time: 307.481 ms -fused time: 307.98 - -kernel execution time: 150.621 ms -sddmm time: 151.15 - -kernel execution time: 451.195 ms -spmm time: 451.689 - -kernel execution time: 9393.95 ms -taco reference time: 9395.02 - -shipsec1 140874 140874 7813404 307.481 150.621 451.195 9393.95 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/scircuit/scircuit.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 170998, B2_dimension: 170998, vals: 958936 -C1_dimension: 170998, C2_dimension: 64, vals: 10943872 -D1_dimension: 170998, D2_dimension: 64, vals: 10943872 -E1_dimension: 170998, E2_dimension: 64, vals: 10943872 - - -kernel execution time: 85.4659 ms -fused time: 85.9614 - -kernel execution time: 34.7139 ms -sddmm time: 35.2946 - -kernel execution time: 71.0646 ms -spmm time: 71.6139 - -kernel execution time: 1234.06 ms -taco reference time: 1234.68 - -scircuit 170998 170998 958936 85.4659 34.7139 71.0646 1234.06 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx 
-ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 206500, B2_dimension: 206500, vals: 1273389 -C1_dimension: 206500, C2_dimension: 64, vals: 13216000 -D1_dimension: 206500, D2_dimension: 64, vals: 13216000 -E1_dimension: 206500, E2_dimension: 64, vals: 13216000 - - -kernel execution time: 88.3959 ms -fused time: 88.8687 - -kernel execution time: 36.7565 ms -sddmm time: 37.3021 - -kernel execution time: 80.2217 ms -spmm time: 80.7621 - -kernel execution time: 1588.94 ms -taco reference time: 1589.58 - -mac_econ_fwd500 206500 206500 1273389 88.3959 36.7565 80.2217 1588.94 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 1000005, B2_dimension: 1000005, vals: 3105536 -C1_dimension: 1000005, C2_dimension: 64, vals: 64000320 -D1_dimension: 1000005, D2_dimension: 64, vals: 64000320 -E1_dimension: 1000005, E2_dimension: 64, vals: 64000320 - - -kernel execution time: 244.992 ms -fused time: 245.482 - -kernel execution time: 86.8711 ms -sddmm time: 87.4084 - -kernel execution time: 245.054 ms -spmm time: 245.552 - -kernel execution time: 3952.47 ms -taco reference time: 3953.57 - -webbase-1M 1000005 1000005 3105536 244.992 86.8711 245.054 3952.47 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 5558326, B2_dimension: 5558326, vals: 59524291 -C1_dimension: 5558326, C2_dimension: 64, vals: 355732864 -D1_dimension: 5558326, D2_dimension: 64, vals: 355732864 -E1_dimension: 5558326, E2_dimension: 64, vals: 355732864 - - -kernel execution time: 3275.48 ms -fused time: 3276.44 - -kernel execution time: 1522.51 ms -sddmm time: 1523.05 - -kernel execution time: 7164.88 ms -spmm time: 7165.87 - -kernel execution time: 84078.7 ms -taco reference time: 84079.8 - -circuit5M 5558326 5558326 
59524291 3275.48 1522.51 7164.88 84078.7 -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 121192, B2_dimension: 121192, vals: 2624331 -C1_dimension: 121192, C2_dimension: 64, vals: 7756288 -D1_dimension: 121192, D2_dimension: 64, vals: 7756288 -E1_dimension: 121192, E2_dimension: 64, vals: 7756288 - - -kernel execution time: 62.8847 ms -fused time: 63.418 - -kernel execution time: 561.815 ms -sddmm time: 562.479 - -kernel execution time: 62.7688 ms -spmm time: 63.4747 - -kernel execution time: 727.65 ms -taco reference time: 728.755 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 121192, B2_dimension: 121192, vals: 2624331 -C1_dimension: 121192, C2_dimension: 64, vals: 7756288 -D1_dimension: 121192, D2_dimension: 64, vals: 7756288 -E1_dimension: 121192, E2_dimension: 64, vals: 7756288 - - -kernel execution time: 1121.74 ms -fused time: 1122.26 - -kernel execution time: 524.494 ms -sddmm time: 525.084 - -kernel execution time: 602.517 ms -spmm time: 603.056 - -kernel execution time: 38095.2 ms -taco reference time: 38096.3 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 121192, B2_dimension: 121192, vals: 2624331 -C1_dimension: 121192, C2_dimension: 64, vals: 7756288 -D1_dimension: 121192, D2_dimension: 64, vals: 7756288 -E1_dimension: 121192, E2_dimension: 64, vals: 7756288 - - -kernel execution time: 1129.96 ms -fused time: 1130.47 - -kernel execution time: 528.571 ms -sddmm time: 529.152 - -kernel execution time: 611.108 ms -spmm time: 611.643 - -kernel execution time: 38230.1 ms -taco reference time: 38231.1 - -sddmm-spmm 
execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 121192, B2_dimension: 121192, vals: 2624331 -C1_dimension: 121192, C2_dimension: 64, vals: 7756288 -D1_dimension: 121192, D2_dimension: 64, vals: 7756288 -E1_dimension: 121192, E2_dimension: 64, vals: 7756288 - - -kernel execution time: 63.6404 ms -fused time: 64.1428 - -kernel execution time: 562.966 ms -sddmm time: 563.609 - -kernel execution time: 62.5981 ms -spmm time: 63.1044 - -kernel execution time: 728.068 ms -taco reference time: 729.005 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 121192, B2_dimension: 121192, vals: 2624331 -C1_dimension: 121192, C2_dimension: 64, vals: 7756288 -D1_dimension: 121192, D2_dimension: 64, vals: 7756288 -E1_dimension: 121192, E2_dimension: 64, vals: 7756288 - - -kernel execution time: 62.7795 ms -fused time: 63.2831 - -kernel execution time: 564.376 ms -sddmm time: 565.025 - -kernel execution time: 62.8883 ms -spmm time: 63.4116 - -kernel execution time: 727.567 ms -taco reference time: 728.511 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 121192, B2_dimension: 121192, vals: 2624331 -C1_dimension: 121192, C2_dimension: 64, vals: 7756288 -D1_dimension: 121192, D2_dimension: 64, vals: 7756288 -E1_dimension: 121192, E2_dimension: 64, vals: 7756288 - - -kernel execution time: 68.4674 ms -fused time: 68.9896 - -kernel execution time: 563.596 ms -sddmm time: 564.267 - -kernel execution time: 62.5779 ms -spmm time: 63.0812 - -kernel execution time: 730.226 ms -taco reference time: 731.124 - -sddmm-spmm execution - 
------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 121192, B2_dimension: 121192, vals: 2624331 -C1_dimension: 121192, C2_dimension: 64, vals: 7756288 -D1_dimension: 121192, D2_dimension: 64, vals: 7756288 -E1_dimension: 121192, E2_dimension: 64, vals: 7756288 - - -kernel execution time: 56.5639 ms -fused time: 57.0618 - -kernel execution time: 562.554 ms -sddmm time: 563.193 - -kernel execution time: 62.6038 ms -spmm time: 63.1209 - -kernel execution time: 730.018 ms -taco reference time: 730.906 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 121192, B2_dimension: 121192, vals: 2624331 -C1_dimension: 121192, C2_dimension: 64, vals: 7756288 -D1_dimension: 121192, D2_dimension: 64, vals: 7756288 -E1_dimension: 121192, E2_dimension: 64, vals: 7756288 - - -kernel execution time: 66.7636 ms -fused time: 67.2669 - -kernel execution time: 564.075 ms -sddmm time: 564.809 - -kernel execution time: 62.9335 ms -spmm time: 63.4347 - -kernel execution time: 727.588 ms -taco reference time: 728.484 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 121192, B2_dimension: 121192, vals: 2624331 -C1_dimension: 121192, C2_dimension: 64, vals: 7756288 -D1_dimension: 121192, D2_dimension: 64, vals: 7756288 -E1_dimension: 121192, E2_dimension: 64, vals: 7756288 - - -kernel execution time: 55.1612 ms -fused time: 55.6765 - -kernel execution time: 574.602 ms -sddmm time: 575.262 - -kernel execution time: 62.2801 ms -spmm time: 62.7918 - -kernel execution time: 738.027 ms -taco reference time: 738.739 - -sddmm-spmm execution - ------------------------------------------ 
-/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 121192, B2_dimension: 121192, vals: 2624331 -C1_dimension: 121192, C2_dimension: 64, vals: 7756288 -D1_dimension: 121192, D2_dimension: 64, vals: 7756288 -E1_dimension: 121192, E2_dimension: 64, vals: 7756288 - - -kernel execution time: 864.868 ms -fused time: 865.374 - -kernel execution time: 544.426 ms -sddmm time: 545.045 - -kernel execution time: 377.977 ms -spmm time: 378.522 - -kernel execution time: 19947 ms -taco reference time: 19948.1 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 121192, B2_dimension: 121192, vals: 2624331 -C1_dimension: 121192, C2_dimension: 64, vals: 7756288 -D1_dimension: 121192, D2_dimension: 64, vals: 7756288 -E1_dimension: 121192, E2_dimension: 64, vals: 7756288 - - -kernel execution time: 71.685 ms -fused time: 72.1905 - -kernel execution time: 548.984 ms -sddmm time: 549.581 - -kernel execution time: 51.9969 ms -spmm time: 52.562 - -kernel execution time: 969.838 ms -taco reference time: 970.48 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 121192, B2_dimension: 121192, vals: 2624331 -C1_dimension: 121192, C2_dimension: 64, vals: 7756288 -D1_dimension: 121192, D2_dimension: 64, vals: 7756288 -E1_dimension: 121192, E2_dimension: 64, vals: 7756288 - - -kernel execution time: 56.1268 ms -fused time: 56.6263 - -kernel execution time: 566.523 ms -sddmm time: 567.123 - -kernel execution time: 60.4097 ms -spmm time: 60.9402 - -kernel execution time: 757.174 ms -taco reference time: 757.82 - -sddmm-spmm execution - ------------------------------------------ 
-/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 121192, B2_dimension: 121192, vals: 2624331 -C1_dimension: 121192, C2_dimension: 64, vals: 7756288 -D1_dimension: 121192, D2_dimension: 64, vals: 7756288 -E1_dimension: 121192, E2_dimension: 64, vals: 7756288 - -24 threads - -kernel execution time: 119.302 ms -fused time: 119.817 - -kernel execution time: 550.24 ms -sddmm time: 550.791 - -kernel execution time: 49.3294 ms -spmm time: 49.8462 - -kernel execution time: 1710.98 ms -taco reference time: 1711.58 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 121192, B2_dimension: 121192, vals: 2624331 -C1_dimension: 121192, C2_dimension: 64, vals: 7756288 -D1_dimension: 121192, D2_dimension: 64, vals: 7756288 -E1_dimension: 121192, E2_dimension: 64, vals: 7756288 - -2 threads - -kernel execution time: 832.831 ms -fused time: 833.337 - -kernel execution time: 543.518 ms -sddmm time: 544.133 - -kernel execution time: 372.721 ms -spmm time: 373.277 - -kernel execution time: 19871.7 ms -taco reference time: 19873 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 121192, B2_dimension: 121192, vals: 2624331 -C1_dimension: 121192, C2_dimension: 64, vals: 7756288 -D1_dimension: 121192, D2_dimension: 64, vals: 7756288 -E1_dimension: 121192, E2_dimension: 64, vals: 7756288 - - -kernel execution time: 1177.5 ms -fused time: 1178 - -kernel execution time: 547.532 ms -sddmm time: 548.083 - -kernel execution time: 618.83 ms -spmm time: 619.38 - -kernel execution time: 39590.7 ms -taco reference time: 39591.8 - - - ----------- 24 threads - - -sddmm-spmm execution - ------------------------------------------ 
-/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 10974, B2_dimension: 10974, vals: 428650 -C1_dimension: 10974, C2_dimension: 64, vals: 702336 -D1_dimension: 10974, D2_dimension: 64, vals: 702336 -E1_dimension: 10974, E2_dimension: 64, vals: 702336 - - -kernel execution time: 18.2194 ms -fused time: 18.6902 - -kernel execution time: 80.3278 ms -sddmm time: 80.7347 - -kernel execution time: 5.17506 ms -spmm time: 5.64137 - -kernel execution time: 275.571 ms -taco reference time: 275.978 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 -C1_dimension: 36417, C2_dimension: 64, vals: 2330688 -D1_dimension: 36417, D2_dimension: 64, vals: 2330688 -E1_dimension: 36417, E2_dimension: 64, vals: 2330688 - - -kernel execution time: 159.53 ms -fused time: 160.016 - -kernel execution time: 814.453 ms -sddmm time: 814.988 - -kernel execution time: 41.9148 ms -spmm time: 42.4142 - -kernel execution time: 2782.76 ms -taco reference time: 2783.34 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/rma10/rma10.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 46835, B2_dimension: 46835, vals: 2374001 -C1_dimension: 46835, C2_dimension: 64, vals: 2997440 -D1_dimension: 46835, D2_dimension: 64, vals: 2997440 -E1_dimension: 46835, E2_dimension: 64, vals: 2997440 - - -kernel execution time: 80.1703 ms -fused time: 80.65 - -kernel execution time: 442.648 ms -sddmm time: 443.191 - -kernel execution time: 27.375 ms -spmm time: 27.8981 - -kernel execution time: 1518.49 ms -taco reference time: 1519.1 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/cant/cant.mtx 
-ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 62451, B2_dimension: 62451, vals: 4007383 -C1_dimension: 62451, C2_dimension: 64, vals: 3996864 -D1_dimension: 62451, D2_dimension: 64, vals: 3996864 -E1_dimension: 62451, E2_dimension: 64, vals: 3996864 - - -kernel execution time: 147.378 ms -fused time: 147.862 - -kernel execution time: 746.182 ms -sddmm time: 746.722 - -kernel execution time: 43.521 ms -spmm time: 44.0217 - -kernel execution time: 2560.78 ms -taco reference time: 2561.36 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/consph/consph.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 83334, B2_dimension: 83334, vals: 6010480 -C1_dimension: 83334, C2_dimension: 64, vals: 5333376 -D1_dimension: 83334, D2_dimension: 64, vals: 5333376 -E1_dimension: 83334, E2_dimension: 64, vals: 5333376 - - -kernel execution time: 220.568 ms -fused time: 221.066 - -kernel execution time: 1121.47 ms -sddmm time: 1122.03 - -kernel execution time: 61.8518 ms -spmm time: 62.3779 - -kernel execution time: 3844.87 ms -taco reference time: 3845.8 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 121192, B2_dimension: 121192, vals: 2624331 -C1_dimension: 121192, C2_dimension: 64, vals: 7756288 -D1_dimension: 121192, D2_dimension: 64, vals: 7756288 -E1_dimension: 121192, E2_dimension: 64, vals: 7756288 - - -kernel execution time: 118.211 ms -fused time: 118.715 - -kernel execution time: 552.77 ms -sddmm time: 553.326 - -kernel execution time: 49.2278 ms -spmm time: 49.7369 - -kernel execution time: 1713.01 ms -taco reference time: 1713.63 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/shipsec1/shipsec1.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 140874, 
B2_dimension: 140874, vals: 7813404 -C1_dimension: 140874, C2_dimension: 64, vals: 9015936 -D1_dimension: 140874, D2_dimension: 64, vals: 9015936 -E1_dimension: 140874, E2_dimension: 64, vals: 9015936 - - -kernel execution time: 300.972 ms -fused time: 301.471 - -kernel execution time: 1461.86 ms -sddmm time: 1462.45 - -kernel execution time: 89.5313 ms -spmm time: 90.0418 - -kernel execution time: 5010.7 ms -taco reference time: 5011.67 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/scircuit/scircuit.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 170998, B2_dimension: 170998, vals: 958936 -C1_dimension: 170998, C2_dimension: 64, vals: 10943872 -D1_dimension: 170998, D2_dimension: 64, vals: 10943872 -E1_dimension: 170998, E2_dimension: 64, vals: 10943872 - - -kernel execution time: 52.5196 ms -fused time: 53.0296 - -kernel execution time: 210.075 ms -sddmm time: 210.666 - -kernel execution time: 67.487 ms -spmm time: 68.0293 - -kernel execution time: 632.81 ms -taco reference time: 633.445 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 206500, B2_dimension: 206500, vals: 1273389 -C1_dimension: 206500, C2_dimension: 64, vals: 13216000 -D1_dimension: 206500, D2_dimension: 64, vals: 13216000 -E1_dimension: 206500, E2_dimension: 64, vals: 13216000 - - -kernel execution time: 60.3333 ms -fused time: 60.8277 - -kernel execution time: 261.834 ms -sddmm time: 262.379 - -kernel execution time: 82.326 ms -spmm time: 82.838 - -kernel execution time: 836.401 ms -taco reference time: 837.023 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 1000005, B2_dimension: 1000005, vals: 3105536 
-C1_dimension: 1000005, C2_dimension: 64, vals: 64000320 -D1_dimension: 1000005, D2_dimension: 64, vals: 64000320 -E1_dimension: 1000005, E2_dimension: 64, vals: 64000320 - - -kernel execution time: 187.296 ms -fused time: 187.792 - -kernel execution time: 616.026 ms -sddmm time: 616.601 - -kernel execution time: 382.801 ms -spmm time: 383.307 - -kernel execution time: 2082.34 ms -taco reference time: 2082.95 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 5558326, B2_dimension: 5558326, vals: 59524291 -C1_dimension: 5558326, C2_dimension: 64, vals: 355732864 -D1_dimension: 5558326, D2_dimension: 64, vals: 355732864 -E1_dimension: 5558326, E2_dimension: 64, vals: 355732864 - - -kernel execution time: 2499.56 ms -fused time: 2500.39 - -kernel execution time: 11463.5 ms -sddmm time: 11464.5 - -kernel execution time: 2581.49 ms -spmm time: 2582.04 - -kernel execution time: 39683.3 ms -taco reference time: 39684.4 - - - - - --------------------- ---------------------- - - - - - - - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 10974, B2_dimension: 10974, vals: 428650 -C1_dimension: 10974, C2_dimension: 64, vals: 702336 -D1_dimension: 10974, D2_dimension: 64, vals: 702336 -E1_dimension: 10974, E2_dimension: 64, vals: 702336 - - -kernel execution time: 163.669 ms -fused time: 164.155 - -kernel execution time: 79.1673 ms -sddmm time: 79.7118 - -kernel execution time: 88.6347 ms -spmm time: 89.0784 - -kernel execution time: 6143.97 ms -taco reference time: 6144.94 - - -163.669 79.1673 88.6347 6144.94 - - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx 
-ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 10974, B2_dimension: 10974, vals: 428650 -C1_dimension: 10974, C2_dimension: 64, vals: 702336 -D1_dimension: 10974, D2_dimension: 64, vals: 702336 -E1_dimension: 10974, E2_dimension: 64, vals: 702336 - - -kernel execution time: 17.2275 ms -fused time: 17.6988 - -kernel execution time: 8.26223 ms -sddmm time: 8.8233 - -kernel execution time: 19.3989 ms -spmm time: 19.8422 - -kernel execution time: 519.537 ms -taco reference time: 520.073 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 10974, B2_dimension: 10974, vals: 428650 -C1_dimension: 10974, C2_dimension: 64, vals: 702336 -D1_dimension: 10974, D2_dimension: 64, vals: 702336 -E1_dimension: 10974, E2_dimension: 64, vals: 702336 - - -kernel execution time: 3.03999 ms -fused time: 3.51084 - -kernel execution time: 8.19604 ms -sddmm time: 8.67702 - -kernel execution time: 5.63342 ms -spmm time: 6.05327 - -kernel execution time: 25.6437 ms -taco reference time: 26.0382 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 -C1_dimension: 36417, C2_dimension: 64, vals: 2330688 -D1_dimension: 36417, D2_dimension: 64, vals: 2330688 -E1_dimension: 36417, E2_dimension: 64, vals: 2330688 - - -kernel execution time: 41.03 ms -fused time: 41.5262 - -kernel execution time: 82.5401 ms -sddmm time: 83.1745 - -kernel execution time: 15.9687 ms -spmm time: 16.5644 - -kernel execution time: 244.774 ms -taco reference time: 245.387 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/rma10/rma10.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 46835, B2_dimension: 46835, 
vals: 2374001 -C1_dimension: 46835, C2_dimension: 64, vals: 2997440 -D1_dimension: 46835, D2_dimension: 64, vals: 2997440 -E1_dimension: 46835, E2_dimension: 64, vals: 2997440 - - -kernel execution time: 27.5081 ms -fused time: 28.0034 - -kernel execution time: 45.9865 ms -sddmm time: 46.5649 - -kernel execution time: 20.0912 ms -spmm time: 20.6288 - -kernel execution time: 138.544 ms -taco reference time: 139.148 - - ----------- ------------ - - - - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 10974, B2_dimension: 10974, vals: 428650 -C1_dimension: 10974, C2_dimension: 64, vals: 702336 -D1_dimension: 10974, D2_dimension: 64, vals: 702336 -E1_dimension: 10974, E2_dimension: 64, vals: 702336 - - -kernel execution time: 3.25222 ms -fused time: 3.71775 - -kernel execution time: 8.13173 ms -sddmm time: 8.56798 - -kernel execution time: 5.42295 ms -spmm time: 5.85093 - -kernel execution time: 25.1419 ms -taco reference time: 25.5332 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 -C1_dimension: 36417, C2_dimension: 64, vals: 2330688 -D1_dimension: 36417, D2_dimension: 64, vals: 2330688 -E1_dimension: 36417, E2_dimension: 64, vals: 2330688 - - -kernel execution time: 40.046 ms -fused time: 40.5327 - -kernel execution time: 82.7374 ms -sddmm time: 83.308 - -kernel execution time: 17.148 ms -spmm time: 17.6723 - -kernel execution time: 244.434 ms -taco reference time: 245.084 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/rma10/rma10.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 46835, B2_dimension: 46835, vals: 2374001 -C1_dimension: 46835, 
C2_dimension: 64, vals: 2997440 -D1_dimension: 46835, D2_dimension: 64, vals: 2997440 -E1_dimension: 46835, E2_dimension: 64, vals: 2997440 - - -kernel execution time: 27.3917 ms -fused time: 27.8878 - -kernel execution time: 46.1218 ms -sddmm time: 46.7015 - -kernel execution time: 19.567 ms -spmm time: 20.0877 - -kernel execution time: 136.269 ms -taco reference time: 136.877 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/cant/cant.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 62451, B2_dimension: 62451, vals: 4007383 -C1_dimension: 62451, C2_dimension: 64, vals: 3996864 -D1_dimension: 62451, D2_dimension: 64, vals: 3996864 -E1_dimension: 62451, E2_dimension: 64, vals: 3996864 - - -kernel execution time: 42.3074 ms -fused time: 42.8144 - -kernel execution time: 75.8411 ms -sddmm time: 76.427 - -kernel execution time: 25.5141 ms -spmm time: 26.0647 - -kernel execution time: 229.9 ms -taco reference time: 230.514 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/consph/consph.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 83334, B2_dimension: 83334, vals: 6010480 -C1_dimension: 83334, C2_dimension: 64, vals: 5333376 -D1_dimension: 83334, D2_dimension: 64, vals: 5333376 -E1_dimension: 83334, E2_dimension: 64, vals: 5333376 - - -kernel execution time: 57.3193 ms -fused time: 57.8292 - -kernel execution time: 115.953 ms -sddmm time: 116.536 - -kernel execution time: 31.4256 ms -spmm time: 31.9698 - -kernel execution time: 344.97 ms -taco reference time: 345.594 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 121192, B2_dimension: 121192, vals: 2624331 -C1_dimension: 121192, C2_dimension: 64, vals: 7756288 -D1_dimension: 121192, D2_dimension: 64, vals: 
7756288 -E1_dimension: 121192, E2_dimension: 64, vals: 7756288 - - -kernel execution time: 58.8731 ms -fused time: 59.371 - -kernel execution time: 96.3746 ms -sddmm time: 96.9431 - -kernel execution time: 52.3502 ms -spmm time: 52.8781 - -kernel execution time: 176.858 ms -taco reference time: 177.482 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/shipsec1/shipsec1.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 140874, B2_dimension: 140874, vals: 7813404 -C1_dimension: 140874, C2_dimension: 64, vals: 9015936 -D1_dimension: 140874, D2_dimension: 64, vals: 9015936 -E1_dimension: 140874, E2_dimension: 64, vals: 9015936 - - -kernel execution time: 97.3646 ms -fused time: 97.869 - -kernel execution time: 154.708 ms -sddmm time: 155.284 - -kernel execution time: 61.8392 ms -spmm time: 62.3666 - -kernel execution time: 455.127 ms -taco reference time: 455.719 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/scircuit/scircuit.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 170998, B2_dimension: 170998, vals: 958936 -C1_dimension: 170998, C2_dimension: 64, vals: 10943872 -D1_dimension: 170998, D2_dimension: 64, vals: 10943872 -E1_dimension: 170998, E2_dimension: 64, vals: 10943872 - - -kernel execution time: 30.2488 ms -fused time: 30.744 - -kernel execution time: 39.9852 ms -sddmm time: 40.5654 - -kernel execution time: 67.5062 ms -spmm time: 68.0413 - -kernel execution time: 74.4023 ms -taco reference time: 75.0271 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 206500, B2_dimension: 206500, vals: 1273389 -C1_dimension: 206500, C2_dimension: 64, vals: 13216000 -D1_dimension: 206500, D2_dimension: 64, vals: 13216000 -E1_dimension: 206500, 
E2_dimension: 64, vals: 13216000 - - -kernel execution time: 34.9737 ms -fused time: 35.4724 - -kernel execution time: 39.6662 ms -sddmm time: 40.2179 - -kernel execution time: 82.4413 ms -spmm time: 82.9627 - -kernel execution time: 91.1415 ms -taco reference time: 91.8035 - -sddmm-spmm execution - ------------------------------------------ - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 1000005, B2_dimension: 1000005, vals: 3105536 -C1_dimension: 1000005, C2_dimension: 64, vals: 64000320 -D1_dimension: 1000005, D2_dimension: 64, vals: 64000320 -E1_dimension: 1000005, E2_dimension: 64, vals: 64000320 - - -kernel execution time: 118.92 ms -fused time: 119.4 - -kernel execution time: 90.6065 ms -sddmm time: 91.1522 - -kernel execution time: 390.342 ms -spmm time: 390.863 - -kernel execution time: 423.16 ms -taco reference time: 423.757 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 5558326, B2_dimension: 5558326, vals: 59524291 -C1_dimension: 5558326, C2_dimension: 64, vals: 355732864 -D1_dimension: 5558326, D2_dimension: 64, vals: 355732864 -E1_dimension: 5558326, E2_dimension: 64, vals: 355732864 - - -kernel execution time: 1158.96 ms -fused time: 1159.93 - -kernel execution time: 1561.31 ms -sddmm time: 1561.87 - -kernel execution time: 2533.87 ms -spmm time: 2534.43 - -kernel execution time: 6529.81 ms -taco reference time: 6530.95 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 10974, B2_dimension: 10974, vals: 428650 -C1_dimension: 10974, C2_dimension: 64, vals: 702336 -D1_dimension: 10974, D2_dimension: 64, vals: 
702336 -E1_dimension: 10974, E2_dimension: 64, vals: 702336 - - -kernel execution time: 3.12799 ms -fused time: 3.5888 - -kernel execution time: 8.20063 ms -sddmm time: 8.64883 - -kernel execution time: 5.23889 ms -spmm time: 5.67244 - -kernel execution time: 25.0758 ms -taco reference time: 25.4671 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 -C1_dimension: 36417, C2_dimension: 64, vals: 2330688 -D1_dimension: 36417, D2_dimension: 64, vals: 2330688 -E1_dimension: 36417, E2_dimension: 64, vals: 2330688 - - -kernel execution time: 39.3104 ms -fused time: 39.7945 - -kernel execution time: 82.5126 ms -sddmm time: 83.0785 - -kernel execution time: 15.6324 ms -spmm time: 16.1739 - -kernel execution time: 245.768 ms -taco reference time: 246.406 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/synthetic/synthetic.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 102, B2_dimension: 103, vals: 3149 -C1_dimension: 102, C2_dimension: 64, vals: 6528 -D1_dimension: 103, D2_dimension: 64, vals: 6592 -E1_dimension: 103, E2_dimension: 64, vals: 6592 - - -kernel execution time: 0.160132 ms -fused time: 0.567098 - -kernel execution time: 0.065981 ms -sddmm time: 0.853092 - -kernel execution time: 0.081641 ms -spmm time: 0.331655 - -kernel execution time: 0.336385 ms -taco reference time: 1.05356 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 5, B2_dimension: 5, vals: 19 -C1_dimension: 5, C2_dimension: 64, vals: 320 -D1_dimension: 5, D2_dimension: 64, vals: 320 -E1_dimension: 5, E2_dimension: 64, vals: 320 - - -kernel execution time: 0.0165 ms -fused time: 0.78845 - 
-kernel execution time: 0.011641 ms -sddmm time: 0.873231 - -kernel execution time: 0.011011 ms -spmm time: 0.486977 - -kernel execution time: 0.059631 ms -taco reference time: 0.958413 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 5, B2_dimension: 5, vals: 19 -C1_dimension: 5, C2_dimension: 64, vals: 320 -D1_dimension: 5, D2_dimension: 64, vals: 320 -E1_dimension: 5, E2_dimension: 64, vals: 320 - - -kernel execution time: 0.01989 ms -fused time: 0.813381 - -kernel execution time: 0.01392 ms -sddmm time: 0.976913 - -kernel execution time: 0.013151 ms -spmm time: 0.497287 - -kernel execution time: 0.058 ms -taco reference time: 0.974083 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 5, B2_dimension: 5, vals: 19 -C1_dimension: 5, C2_dimension: 64, vals: 320 -D1_dimension: 5, D2_dimension: 64, vals: 320 -E1_dimension: 5, E2_dimension: 64, vals: 320 - - -kernel execution time: 0.0192 ms -fused time: 0.8019 - -kernel execution time: 0.012991 ms -sddmm time: 0.990253 - -kernel execution time: 0.01291 ms -spmm time: 0.490396 - -kernel execution time: 0.057891 ms -taco reference time: 0.929332 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 5, B2_dimension: 5, vals: 19 -C1_dimension: 5, C2_dimension: 64, vals: 320 -D1_dimension: 5, D2_dimension: 64, vals: 320 -E1_dimension: 5, E2_dimension: 64, vals: 320 - - -kernel execution time: 0.01797 ms -fused time: 0.779061 - -kernel execution time: 0.013 ms -sddmm time: 0.7717 - -kernel execution time: 0.01429 ms -spmm time: 0.487296 - -kernel execution time: 0.05764 ms -taco reference time: 0.72862 - 
-sddmm-spmm execution - - - -sddmm with parallel execution ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 5, B2_dimension: 5, vals: 19 -C1_dimension: 5, C2_dimension: 64, vals: 320 -D1_dimension: 5, D2_dimension: 64, vals: 320 -E1_dimension: 5, E2_dimension: 64, vals: 320 - - -kernel execution time: 0.02088 ms -fused time: 0.912153 - -kernel execution time: 0.01161 ms -sddmm time: 0.944402 - -kernel execution time: 0.01292 ms -spmm time: 0.562267 - -kernel execution time: 0.067781 ms -taco reference time: 1.10908 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 -C1_dimension: 36417, C2_dimension: 64, vals: 2330688 -D1_dimension: 36417, D2_dimension: 64, vals: 2330688 -E1_dimension: 36417, E2_dimension: 64, vals: 2330688 - - -kernel execution time: 166.429 ms -fused time: 166.938 - -kernel execution time: 83.0174 ms -sddmm time: 83.5946 - -kernel execution time: 303.7 ms -spmm time: 304.246 - -kernel execution time: 5227.75 ms -taco reference time: 5228.77 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 -C1_dimension: 36417, C2_dimension: 64, vals: 2330688 -D1_dimension: 36417, D2_dimension: 64, vals: 2330688 -E1_dimension: 36417, E2_dimension: 64, vals: 2330688 - - -kernel execution time: 166.755 ms -fused time: 167.262 - -kernel execution time: 83.1762 ms -sddmm time: 83.7333 - -kernel execution time: 303.525 ms -spmm time: 304.051 - -kernel execution time: 5232.78 ms -taco reference time: 5233.91 - -sddmm-spmm execution - ------------------------------------------ 
-/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 -C1_dimension: 36417, C2_dimension: 64, vals: 2330688 -D1_dimension: 36417, D2_dimension: 64, vals: 2330688 -E1_dimension: 36417, E2_dimension: 64, vals: 2330688 - - -kernel execution time: 27.2912 ms -fused time: 27.7968 - -kernel execution time: 84.1751 ms -sddmm time: 84.7569 - -kernel execution time: 12.6781 ms -spmm time: 13.1881 - -kernel execution time: 134.209 ms -taco reference time: 134.846 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 -C1_dimension: 36417, C2_dimension: 64, vals: 2330688 -D1_dimension: 36417, D2_dimension: 64, vals: 2330688 -E1_dimension: 36417, E2_dimension: 64, vals: 2330688 - - -kernel execution time: 26.6207 ms -fused time: 27.1299 - -kernel execution time: 86.3046 ms -sddmm time: 86.9394 - -kernel execution time: 12.7749 ms -spmm time: 13.2807 - -kernel execution time: 130.582 ms -taco reference time: 131.278 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 -C1_dimension: 36417, C2_dimension: 64, vals: 2330688 -D1_dimension: 36417, D2_dimension: 64, vals: 2330688 -E1_dimension: 36417, E2_dimension: 64, vals: 2330688 - - -kernel execution time: 101.848 ms -fused time: 102.362 - -kernel execution time: 83.9029 ms -sddmm time: 84.4969 - -kernel execution time: 42.5674 ms -spmm time: 43.1242 - -kernel execution time: 708.807 ms -taco reference time: 709.518 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx 
-ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 -C1_dimension: 36417, C2_dimension: 64, vals: 2330688 -D1_dimension: 36417, D2_dimension: 64, vals: 2330688 -E1_dimension: 36417, E2_dimension: 64, vals: 2330688 - - -kernel execution time: 107.29 ms -fused time: 107.797 - -kernel execution time: 83.8499 ms -sddmm time: 84.3953 - -kernel execution time: 43.5065 ms -spmm time: 44.0135 - -kernel execution time: 705.909 ms -taco reference time: 706.511 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 -C1_dimension: 36417, C2_dimension: 64, vals: 2330688 -D1_dimension: 36417, D2_dimension: 64, vals: 2330688 -E1_dimension: 36417, E2_dimension: 64, vals: 2330688 - - -kernel execution time: 26.2026 ms -fused time: 26.7322 - -kernel execution time: 86.809 ms -sddmm time: 87.4374 - -kernel execution time: 12.6681 ms -spmm time: 13.1758 - -kernel execution time: 130.015 ms -taco reference time: 130.717 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 10974, B2_dimension: 10974, vals: 428650 -C1_dimension: 10974, C2_dimension: 64, vals: 702336 -D1_dimension: 10974, D2_dimension: 64, vals: 702336 -E1_dimension: 10974, E2_dimension: 64, vals: 702336 - - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 10974, B2_dimension: 10974, vals: 428650 -C1_dimension: 10974, C2_dimension: 64, vals: 702336 -D1_dimension: 10974, D2_dimension: 64, vals: 702336 -E1_dimension: 10974, E2_dimension: 64, vals: 702336 - - -sddmm-spmm execution - 
------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 10974, B2_dimension: 10974, vals: 428650 -C1_dimension: 10974, C2_dimension: 64, vals: 702336 -D1_dimension: 10974, D2_dimension: 64, vals: 702336 -E1_dimension: 10974, E2_dimension: 64, vals: 702336 - - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 10974, B2_dimension: 10974, vals: 428650 -C1_dimension: 10974, C2_dimension: 64, vals: 702336 -D1_dimension: 10974, D2_dimension: 64, vals: 702336 -E1_dimension: 10974, E2_dimension: 64, vals: 702336 - - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 10974, B2_dimension: 10974, vals: 428650 -C1_dimension: 10974, C2_dimension: 64, vals: 702336 -D1_dimension: 10974, D2_dimension: 64, vals: 702336 -E1_dimension: 10974, E2_dimension: 64, vals: 702336 - - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 10974, B2_dimension: 10974, vals: 428650 -C1_dimension: 10974, C2_dimension: 64, vals: 702336 -D1_dimension: 10974, D2_dimension: 64, vals: 702336 -E1_dimension: 10974, E2_dimension: 64, vals: 702336 - - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 10974, B2_dimension: 10974, vals: 428650 -C1_dimension: 10974, C2_dimension: 64, vals: 702336 -D1_dimension: 10974, D2_dimension: 64, vals: 702336 -E1_dimension: 10974, E2_dimension: 64, vals: 702336 - - -kernel execution time: 
4.9002 ms -fused time: 5.40296 - -kernel execution time: 9.21483 ms -sddmm time: 9.69115 - -kernel execution time: 5.35955 ms -spmm time: 5.79675 - -kernel execution time: 14.9148 ms -taco reference time: 15.4012 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 10974, B2_dimension: 10974, vals: 428650 -C1_dimension: 10974, C2_dimension: 64, vals: 702336 -D1_dimension: 10974, D2_dimension: 64, vals: 702336 -E1_dimension: 10974, E2_dimension: 64, vals: 702336 - - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 10974, B2_dimension: 10974, vals: 428650 -C1_dimension: 10974, C2_dimension: 64, vals: 702336 -D1_dimension: 10974, D2_dimension: 64, vals: 702336 -E1_dimension: 10974, E2_dimension: 64, vals: 702336 - - -kernel execution time: 2.39607 ms -fused time: 2.86927 - -kernel execution time: 8.62899 ms -sddmm time: 8.97544 - -kernel execution time: 5.41841 ms -spmm time: 5.83089 - -kernel execution time: 14.2058 ms -taco reference time: 14.5956 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 10974, B2_dimension: 10974, vals: 428650 -C1_dimension: 10974, C2_dimension: 64, vals: 702336 -D1_dimension: 10974, D2_dimension: 64, vals: 702336 -E1_dimension: 10974, E2_dimension: 64, vals: 702336 - - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 10974, B2_dimension: 10974, vals: 428650 -C1_dimension: 10974, C2_dimension: 64, vals: 702336 -D1_dimension: 10974, D2_dimension: 64, vals: 702336 
-E1_dimension: 10974, E2_dimension: 64, vals: 702336 - - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 10974, B2_dimension: 10974, vals: 428650 -C1_dimension: 10974, C2_dimension: 64, vals: 702336 -D1_dimension: 10974, D2_dimension: 64, vals: 702336 -E1_dimension: 10974, E2_dimension: 64, vals: 702336 - - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 10974, B2_dimension: 10974, vals: 428650 -C1_dimension: 10974, C2_dimension: 64, vals: 702336 -D1_dimension: 10974, D2_dimension: 64, vals: 702336 -E1_dimension: 10974, E2_dimension: 64, vals: 702336 - - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 10974, B2_dimension: 10974, vals: 428650 -C1_dimension: 10974, C2_dimension: 64, vals: 702336 -D1_dimension: 10974, D2_dimension: 64, vals: 702336 -E1_dimension: 10974, E2_dimension: 64, vals: 702336 - - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 10974, B2_dimension: 10974, vals: 428650 -C1_dimension: 10974, C2_dimension: 64, vals: 702336 -D1_dimension: 10974, D2_dimension: 64, vals: 702336 -E1_dimension: 10974, E2_dimension: 64, vals: 702336 - - -kernel execution time: 1.85339 ms -fused time: 2.66762 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 10974, B2_dimension: 10974, vals: 428650 -C1_dimension: 10974, C2_dimension: 64, vals: 
702336 -D1_dimension: 10974, D2_dimension: 64, vals: 702336 -E1_dimension: 10974, E2_dimension: 64, vals: 702336 - - -kernel execution time: 4.94195 ms -fused time: 6.0647 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 10974, B2_dimension: 10974, vals: 428650 -C1_dimension: 10974, C2_dimension: 64, vals: 702336 -D1_dimension: 10974, D2_dimension: 64, vals: 702336 -E1_dimension: 10974, E2_dimension: 64, vals: 702336 - - -kernel execution time: 5.09918 ms -fused time: 6.23075 - -kernel execution time: 14.2105 ms -sddmm time: 15.026 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 10974, B2_dimension: 10974, vals: 428650 -C1_dimension: 10974, C2_dimension: 64, vals: 702336 -D1_dimension: 10974, D2_dimension: 64, vals: 702336 -E1_dimension: 10974, E2_dimension: 64, vals: 702336 - - -kernel execution time: 4.93573 ms -fused time: 5.42636 - -kernel execution time: 8.35333 ms -sddmm time: 8.77215 - -kernel execution time: 5.35189 ms -spmm time: 5.7874 - -kernel execution time: 15.4744 ms -taco reference time: 15.8619 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 10974, B2_dimension: 10974, vals: 428650 -C1_dimension: 10974, C2_dimension: 64, vals: 702336 -D1_dimension: 10974, D2_dimension: 64, vals: 702336 -E1_dimension: 10974, E2_dimension: 64, vals: 702336 - - -kernel execution time: 1.72938 ms -fused time: 2.19226 - -kernel execution time: 8.38474 ms -sddmm time: 8.70208 - -kernel execution time: 5.55896 ms -spmm time: 5.96847 - -kernel execution time: 13.8271 ms -taco reference time: 14.2228 - -sddmm-spmm execution - 
------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 10974, B2_dimension: 10974, vals: 428650 -C1_dimension: 10974, C2_dimension: 64, vals: 702336 -D1_dimension: 10974, D2_dimension: 64, vals: 702336 -E1_dimension: 10974, E2_dimension: 64, vals: 702336 - - -kernel execution time: 1.99224 ms -fused time: 2.45758 - -kernel execution time: 8.4613 ms -sddmm time: 8.79168 - -kernel execution time: 5.51595 ms -spmm time: 5.95761 - -kernel execution time: 13.5919 ms -taco reference time: 13.973 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 10974, B2_dimension: 10974, vals: 428650 -C1_dimension: 10974, C2_dimension: 64, vals: 702336 -D1_dimension: 10974, D2_dimension: 64, vals: 702336 -E1_dimension: 10974, E2_dimension: 64, vals: 702336 - - -kernel execution time: 2.17974 ms -fused time: 2.64915 - -kernel execution time: 9.49553 ms -sddmm time: 9.89178 - -kernel execution time: 5.3851 ms -spmm time: 5.80552 - -kernel execution time: 15.1854 ms -taco reference time: 15.6294 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 10974, B2_dimension: 10974, vals: 428650 -C1_dimension: 10974, C2_dimension: 64, vals: 702336 -D1_dimension: 10974, D2_dimension: 64, vals: 702336 -E1_dimension: 10974, E2_dimension: 64, vals: 702336 - - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 10974, B2_dimension: 10974, vals: 428650 -C1_dimension: 10974, C2_dimension: 64, vals: 702336 -D1_dimension: 10974, D2_dimension: 64, vals: 702336 
-E1_dimension: 10974, E2_dimension: 64, vals: 702336 - - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 10974, B2_dimension: 10974, vals: 428650 -C1_dimension: 10974, C2_dimension: 64, vals: 702336 -D1_dimension: 10974, D2_dimension: 64, vals: 702336 -E1_dimension: 10974, E2_dimension: 64, vals: 702336 - - -kernel execution time: 1.77985 ms -fused time: 2.24554 - -kernel execution time: 9.31643 ms -sddmm time: 9.66639 - -kernel execution time: 5.48351 ms -spmm time: 5.89775 - -kernel execution time: 15.1635 ms -taco reference time: 15.6173 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 10974, B2_dimension: 10974, vals: 428650 -C1_dimension: 10974, C2_dimension: 64, vals: 702336 -D1_dimension: 10974, D2_dimension: 64, vals: 702336 -E1_dimension: 10974, E2_dimension: 64, vals: 702336 - - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 10974, B2_dimension: 10974, vals: 428650 -C1_dimension: 10974, C2_dimension: 64, vals: 702336 -D1_dimension: 10974, D2_dimension: 64, vals: 702336 -E1_dimension: 10974, E2_dimension: 64, vals: 702336 - - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 10974, B2_dimension: 10974, vals: 428650 -C1_dimension: 10974, C2_dimension: 64, vals: 702336 -D1_dimension: 10974, D2_dimension: 64, vals: 702336 -E1_dimension: 10974, E2_dimension: 64, vals: 702336 - - -sddmm-spmm execution - ------------------------------------------ 
-/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 10974, B2_dimension: 10974, vals: 428650 -C1_dimension: 10974, C2_dimension: 64, vals: 702336 -D1_dimension: 10974, D2_dimension: 64, vals: 702336 -E1_dimension: 10974, E2_dimension: 64, vals: 702336 - - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 10974, B2_dimension: 10974, vals: 428650 -C1_dimension: 10974, C2_dimension: 64, vals: 702336 -D1_dimension: 10974, D2_dimension: 64, vals: 702336 -E1_dimension: 10974, E2_dimension: 64, vals: 702336 - - -kernel execution time: 2.09062 ms -fused time: 2.75986 - -kernel execution time: 8.53961 ms -sddmm time: 8.99868 - -kernel execution time: 5.43386 ms -spmm time: 5.86914 - -kernel execution time: 14.7848 ms -taco reference time: 15.2128 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 10974, B2_dimension: 10974, vals: 428650 -C1_dimension: 10974, C2_dimension: 64, vals: 702336 -D1_dimension: 10974, D2_dimension: 64, vals: 702336 -E1_dimension: 10974, E2_dimension: 64, vals: 702336 - - -kernel execution time: 1.99345 ms -fused time: 2.4639 - -kernel execution time: 10.0509 ms -sddmm time: 10.4945 - -kernel execution time: 5.37643 ms -spmm time: 5.82607 - -kernel execution time: 15.0911 ms -taco reference time: 15.5753 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 10974, B2_dimension: 10974, vals: 428650 -C1_dimension: 10974, C2_dimension: 64, vals: 702336 -D1_dimension: 10974, D2_dimension: 64, vals: 702336 -E1_dimension: 10974, E2_dimension: 64, 
vals: 702336 - - -kernel execution time: 2.14705 ms -fused time: 2.62359 - -kernel execution time: 9.35781 ms -sddmm time: 9.71116 - -kernel execution time: 6.0153 ms -spmm time: 6.42121 - -kernel execution time: 14.8814 ms -taco reference time: 15.3035 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 10974, B2_dimension: 10974, vals: 428650 -C1_dimension: 10974, C2_dimension: 64, vals: 702336 -D1_dimension: 10974, D2_dimension: 64, vals: 702336 -E1_dimension: 10974, E2_dimension: 64, vals: 702336 - - -kernel execution time: 3.85621 ms -fused time: 4.31728 - -kernel execution time: 8.49591 ms -sddmm time: 8.85325 - -kernel execution time: 4.55458 ms -spmm time: 5.00309 - -kernel execution time: 71.693 ms -taco reference time: 72.1249 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 10974, B2_dimension: 10974, vals: 428650 -C1_dimension: 10974, C2_dimension: 64, vals: 702336 -D1_dimension: 10974, D2_dimension: 64, vals: 702336 -E1_dimension: 10974, E2_dimension: 64, vals: 702336 - - -kernel execution time: 4.4083 ms -fused time: 4.87449 - -kernel execution time: 9.23609 ms -sddmm time: 9.68592 - -kernel execution time: 4.52337 ms -spmm time: 4.93316 - -kernel execution time: 75.7983 ms -taco reference time: 76.2419 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 10974, B2_dimension: 10974, vals: 428650 -C1_dimension: 10974, C2_dimension: 64, vals: 702336 -D1_dimension: 10974, D2_dimension: 64, vals: 702336 -E1_dimension: 10974, E2_dimension: 64, vals: 702336 - - -kernel execution time: 2.02675 ms -fused time: 2.47188 - -kernel 
execution time: 9.25498 ms -sddmm time: 9.67129 - -kernel execution time: 5.23325 ms -spmm time: 5.68302 - -kernel execution time: 14.8775 ms -taco reference time: 15.3813 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 10974, B2_dimension: 10974, vals: 428650 -C1_dimension: 10974, C2_dimension: 64, vals: 702336 -D1_dimension: 10974, D2_dimension: 64, vals: 702336 -E1_dimension: 10974, E2_dimension: 64, vals: 702336 - - -kernel execution time: 1.94846 ms -fused time: 2.40322 - -kernel execution time: 9.52502 ms -sddmm time: 9.90909 - -kernel execution time: 5.31443 ms -spmm time: 5.71988 - -kernel execution time: 15.7004 ms -taco reference time: 16.1456 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 10974, B2_dimension: 10974, vals: 428650 -C1_dimension: 10974, C2_dimension: 64, vals: 702336 -D1_dimension: 10974, D2_dimension: 64, vals: 702336 -E1_dimension: 10974, E2_dimension: 64, vals: 702336 - - -kernel execution time: 1.79798 ms -fused time: 2.25022 - -kernel execution time: 9.43793 ms -sddmm time: 9.82708 - -kernel execution time: 5.29275 ms -spmm time: 5.69457 - -kernel execution time: 14.9269 ms -taco reference time: 15.3874 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 10974, B2_dimension: 10974, vals: 428650 -C1_dimension: 10974, C2_dimension: 64, vals: 702336 -D1_dimension: 10974, D2_dimension: 64, vals: 702336 -E1_dimension: 10974, E2_dimension: 64, vals: 702336 - - -kernel execution time: 1.75935 ms -fused time: 2.20095 - -kernel execution time: 8.58506 ms -sddmm time: 8.92534 - -kernel execution time: 
5.5533 ms -spmm time: 5.93899 - -kernel execution time: 14.2327 ms -taco reference time: 14.5943 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 10974, B2_dimension: 10974, vals: 428650 -C1_dimension: 10974, C2_dimension: 64, vals: 702336 -D1_dimension: 10974, D2_dimension: 64, vals: 702336 -E1_dimension: 10974, E2_dimension: 64, vals: 702336 - - -kernel execution time: 2.04599 ms -fused time: 2.50059 - -kernel execution time: 9.39166 ms -sddmm time: 9.80431 - -kernel execution time: 5.3514 ms -spmm time: 5.75487 - -kernel execution time: 15.0619 ms -taco reference time: 15.497 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 10974, B2_dimension: 10974, vals: 428650 -C1_dimension: 10974, C2_dimension: 64, vals: 702336 -D1_dimension: 10974, D2_dimension: 64, vals: 702336 -E1_dimension: 10974, E2_dimension: 64, vals: 702336 - - -kernel execution time: 1.9781 ms -fused time: 2.41055 - -kernel execution time: 8.50024 ms -sddmm time: 8.81933 - -kernel execution time: 5.28711 ms -spmm time: 5.68452 - -kernel execution time: 13.5108 ms -taco reference time: 13.8766 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 5, B2_dimension: 5, vals: 19 -C1_dimension: 5, C2_dimension: 64, vals: 320 -D1_dimension: 5, D2_dimension: 64, vals: 320 -E1_dimension: 5, E2_dimension: 64, vals: 320 - - -kernel execution time: 11.5205 ms -fused time: 12.2496 - -kernel execution time: 0.00954 ms -sddmm time: 0.935822 - -kernel execution time: 0.02342 ms -spmm time: 0.324625 - -kernel execution time: 0.050091 ms -taco reference time: 0.727519 - -sddmm-spmm 
execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 5, B2_dimension: 5, vals: 19 -C1_dimension: 5, C2_dimension: 64, vals: 320 -D1_dimension: 5, D2_dimension: 64, vals: 320 -E1_dimension: 5, E2_dimension: 64, vals: 320 - - -kernel execution time: 0.235743 ms -fused time: 0.969273 - -kernel execution time: 0.01214 ms -sddmm time: 0.981613 - -kernel execution time: 0.03193 ms -spmm time: 0.521637 - -kernel execution time: 0.059391 ms -taco reference time: 0.945792 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 5, B2_dimension: 5, vals: 19 -C1_dimension: 5, C2_dimension: 64, vals: 320 -D1_dimension: 5, D2_dimension: 64, vals: 320 -E1_dimension: 5, E2_dimension: 64, vals: 320 - - -kernel execution time: 0.235003 ms -fused time: 0.964663 - -kernel execution time: 0.013771 ms -sddmm time: 1.23201 - -kernel execution time: 0.027521 ms -spmm time: 0.470876 - -kernel execution time: 0.043441 ms -taco reference time: 0.814271 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 5, B2_dimension: 5, vals: 19 -C1_dimension: 5, C2_dimension: 64, vals: 320 -D1_dimension: 5, D2_dimension: 64, vals: 320 -E1_dimension: 5, E2_dimension: 64, vals: 320 - - -kernel execution time: 0.242774 ms -fused time: 0.984063 - -kernel execution time: 0.01744 ms -sddmm time: 1.07782 - -kernel execution time: 0.03915 ms -spmm time: 0.602928 - -kernel execution time: 0.073381 ms -taco reference time: 0.858301 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 
5, B2_dimension: 5, vals: 19 -C1_dimension: 5, C2_dimension: 64, vals: 320 -D1_dimension: 5, D2_dimension: 64, vals: 320 -E1_dimension: 5, E2_dimension: 64, vals: 320 - - -kernel execution time: 0.199533 ms -fused time: 0.604928 - -kernel execution time: 0.00675 ms -sddmm time: 0.983573 - -kernel execution time: 0.02448 ms -spmm time: 0.300224 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 5, B2_dimension: 5, vals: 19 -C1_dimension: 5, C2_dimension: 64, vals: 320 -D1_dimension: 5, D2_dimension: 64, vals: 320 -E1_dimension: 5, E2_dimension: 64, vals: 320 - - -kernel execution time: 0.192703 ms -fused time: 0.575667 - -kernel execution time: 0.00622 ms -sddmm time: 0.863292 - -kernel execution time: 0.0221 ms -spmm time: 0.270204 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 5, B2_dimension: 5, vals: 19 -C1_dimension: 5, C2_dimension: 64, vals: 320 -D1_dimension: 5, D2_dimension: 64, vals: 320 -E1_dimension: 5, E2_dimension: 64, vals: 320 - - -kernel execution time: 0.195482 ms -fused time: 0.580768 - -kernel execution time: 0.00652 ms -sddmm time: 0.957703 - -kernel execution time: 0.025451 ms -spmm time: 0.313074 - -kernel execution time: 0.085611 ms -taco reference time: 0.970753 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 10974, B2_dimension: 10974, vals: 428650 -C1_dimension: 10974, C2_dimension: 64, vals: 702336 -D1_dimension: 10974, D2_dimension: 64, vals: 702336 -E1_dimension: 10974, E2_dimension: 64, vals: 702336 - - -kernel execution time: 2.00856 ms -fused time: 2.45147 - -kernel execution time: 8.5121 ms -sddmm time: 
8.95565 - -kernel execution time: 5.46083 ms -spmm time: 5.93676 - -kernel execution time: 14.1411 ms -taco reference time: 14.7397 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 -C1_dimension: 36417, C2_dimension: 64, vals: 2330688 -D1_dimension: 36417, D2_dimension: 64, vals: 2330688 -E1_dimension: 36417, E2_dimension: 64, vals: 2330688 - - -kernel execution time: 9.91597 ms -fused time: 10.4166 - -kernel execution time: 85.127 ms -sddmm time: 85.7297 - -kernel execution time: 12.8101 ms -spmm time: 13.3194 - -kernel execution time: 129.721 ms -taco reference time: 130.362 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 -C1_dimension: 36417, C2_dimension: 64, vals: 2330688 -D1_dimension: 36417, D2_dimension: 64, vals: 2330688 -E1_dimension: 36417, E2_dimension: 64, vals: 2330688 - - -kernel execution time: 9.9746 ms -fused time: 10.4536 - -kernel execution time: 85.6921 ms -sddmm time: 86.3192 - -kernel execution time: 12.752 ms -spmm time: 13.2448 - -kernel execution time: 135.682 ms -taco reference time: 136.351 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 -C1_dimension: 36417, C2_dimension: 64, vals: 2330688 -D1_dimension: 36417, D2_dimension: 64, vals: 2330688 -E1_dimension: 36417, E2_dimension: 64, vals: 2330688 - - -kernel execution time: 10.0998 ms -fused time: 10.5872 - -kernel execution time: 85.0064 ms -sddmm time: 85.6385 - -kernel execution time: 12.6128 ms -spmm time: 13.1169 - -kernel 
execution time: 134.629 ms -taco reference time: 135.323 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 -C1_dimension: 36417, C2_dimension: 64, vals: 2330688 -D1_dimension: 36417, D2_dimension: 64, vals: 2330688 -E1_dimension: 36417, E2_dimension: 64, vals: 2330688 - - -kernel execution time: 10.1006 ms -fused time: 10.5902 - -kernel execution time: 88.2603 ms -sddmm time: 88.897 - -kernel execution time: 12.5197 ms -spmm time: 13.0137 - -kernel execution time: 130.3 ms -taco reference time: 130.977 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 -C1_dimension: 36417, C2_dimension: 64, vals: 2330688 -D1_dimension: 36417, D2_dimension: 64, vals: 2330688 -E1_dimension: 36417, E2_dimension: 64, vals: 2330688 - - -kernel execution time: 27.6596 ms -fused time: 28.2096 - -kernel execution time: 85.6018 ms -sddmm time: 86.213 - -kernel execution time: 12.8244 ms -spmm time: 13.3343 - -kernel execution time: 131.089 ms -taco reference time: 131.789 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 -C1_dimension: 36417, C2_dimension: 64, vals: 2330688 -D1_dimension: 36417, D2_dimension: 64, vals: 2330688 -E1_dimension: 36417, E2_dimension: 64, vals: 2330688 - - -kernel execution time: 26.582 ms -fused time: 27.0673 - -kernel execution time: 87.6048 ms -sddmm time: 88.2462 - -kernel execution time: 12.5643 ms -spmm time: 13.0723 - -kernel execution time: 130.366 ms -taco reference time: 131.043 - -sddmm-spmm 
execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 -C1_dimension: 36417, C2_dimension: 64, vals: 2330688 -D1_dimension: 36417, D2_dimension: 64, vals: 2330688 -E1_dimension: 36417, E2_dimension: 64, vals: 2330688 - - -kernel execution time: 26.5615 ms -fused time: 27.0713 - -kernel execution time: 87.5473 ms -sddmm time: 88.1848 - -kernel execution time: 12.6726 ms -spmm time: 13.152 - -kernel execution time: 131.024 ms -taco reference time: 131.701 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 -C1_dimension: 36417, C2_dimension: 64, vals: 2330688 -D1_dimension: 36417, D2_dimension: 64, vals: 2330688 -E1_dimension: 36417, E2_dimension: 64, vals: 2330688 - - -kernel execution time: 26.3835 ms -fused time: 26.8768 - -kernel execution time: 84.7609 ms -sddmm time: 85.3584 - -kernel execution time: 12.8437 ms -spmm time: 13.346 - -kernel execution time: 132.548 ms -taco reference time: 133.168 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 -C1_dimension: 36417, C2_dimension: 64, vals: 2330688 -D1_dimension: 36417, D2_dimension: 64, vals: 2330688 -E1_dimension: 36417, E2_dimension: 64, vals: 2330688 - - -kernel execution time: 26.6808 ms -fused time: 27.1679 - -kernel execution time: 87.0948 ms -sddmm time: 87.7219 - -kernel execution time: 12.695 ms -spmm time: 13.1923 - -kernel execution time: 134.587 ms -taco reference time: 135.255 - -sddmm-spmm execution - ------------------------------------------ 
-/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 5, B2_dimension: 5, vals: 19 -C1_dimension: 5, C2_dimension: 64, vals: 320 -D1_dimension: 5, D2_dimension: 64, vals: 320 -E1_dimension: 5, E2_dimension: 64, vals: 320 - - -kernel execution time: 0.235254 ms -fused time: 1.04843 - -kernel execution time: 0.01102 ms -sddmm time: 0.989634 - -kernel execution time: 0.028701 ms -spmm time: 0.574108 - -kernel execution time: 0.04363 ms -taco reference time: 0.840431 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 10974, B2_dimension: 10974, vals: 428650 -C1_dimension: 10974, C2_dimension: 64, vals: 702336 -D1_dimension: 10974, D2_dimension: 64, vals: 702336 -E1_dimension: 10974, E2_dimension: 64, vals: 702336 - - -kernel execution time: 4.9177 ms -fused time: 5.37305 - -kernel execution time: 8.31608 ms -sddmm time: 8.76144 - -kernel execution time: 5.43042 ms -spmm time: 5.82157 - -kernel execution time: 15.0881 ms -taco reference time: 15.4618 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 -C1_dimension: 36417, C2_dimension: 64, vals: 2330688 -D1_dimension: 36417, D2_dimension: 64, vals: 2330688 -E1_dimension: 36417, E2_dimension: 64, vals: 2330688 - - -kernel execution time: 175.005 ms -fused time: 175.507 - -kernel execution time: 83.4127 ms -sddmm time: 83.9734 - -kernel execution time: 14.3027 ms -spmm time: 14.8133 - -kernel execution time: 5196.98 ms -taco reference time: 5198.39 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/rma10/rma10.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); 
-B1_dimension: 46835, B2_dimension: 46835, vals: 2374001 -C1_dimension: 46835, C2_dimension: 64, vals: 2997440 -D1_dimension: 46835, D2_dimension: 64, vals: 2997440 -E1_dimension: 46835, E2_dimension: 64, vals: 2997440 - - -kernel execution time: 96.7809 ms -fused time: 97.2629 - -kernel execution time: 46.666 ms -sddmm time: 47.229 - -kernel execution time: 23.9017 ms -spmm time: 24.4045 - -kernel execution time: 2871.87 ms -taco reference time: 2872.47 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/rma10/rma10.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 46835, B2_dimension: 46835, vals: 2374001 -C1_dimension: 46835, C2_dimension: 64, vals: 2997440 -D1_dimension: 46835, D2_dimension: 64, vals: 2997440 -E1_dimension: 46835, E2_dimension: 64, vals: 2997440 - - -kernel execution time: 98.4225 ms -fused time: 98.9062 - -kernel execution time: 46.8647 ms -sddmm time: 47.4013 - -kernel execution time: 22.9253 ms -spmm time: 23.4505 - -kernel execution time: 2873.94 ms -taco reference time: 2874.59 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 -C1_dimension: 36417, C2_dimension: 64, vals: 2330688 -D1_dimension: 36417, D2_dimension: 64, vals: 2330688 -E1_dimension: 36417, E2_dimension: 64, vals: 2330688 - - -kernel execution time: 174.126 ms -fused time: 174.616 - -kernel execution time: 83.7673 ms -sddmm time: 84.3199 - -kernel execution time: 13.0437 ms -spmm time: 13.5625 - -kernel execution time: 5227.23 ms -taco reference time: 5228.25 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 -C1_dimension: 36417, 
C2_dimension: 64, vals: 2330688 -D1_dimension: 36417, D2_dimension: 64, vals: 2330688 -E1_dimension: 36417, E2_dimension: 64, vals: 2330688 - - -kernel execution time: 27.6542 ms -fused time: 28.1392 - -kernel execution time: 85.8985 ms -sddmm time: 86.5293 - -kernel execution time: 12.6722 ms -spmm time: 13.1883 - -kernel execution time: 130.948 ms -taco reference time: 131.642 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 -C1_dimension: 36417, C2_dimension: 64, vals: 2330688 -D1_dimension: 36417, D2_dimension: 64, vals: 2330688 -E1_dimension: 36417, E2_dimension: 64, vals: 2330688 - - -kernel execution time: 102.4 ms -fused time: 102.884 - -kernel execution time: 83.5498 ms -sddmm time: 84.1386 - -kernel execution time: 42.5049 ms -spmm time: 43.0426 - -kernel execution time: 710.168 ms -taco reference time: 710.765 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 -C1_dimension: 36417, C2_dimension: 64, vals: 2330688 -D1_dimension: 36417, D2_dimension: 64, vals: 2330688 -E1_dimension: 36417, E2_dimension: 64, vals: 2330688 - - -kernel execution time: 43.9551 ms -fused time: 44.6972 - -kernel execution time: 87.6996 ms -sddmm time: 89.4613 - -kernel execution time: 18.2632 ms -spmm time: 18.7804 - -kernel execution time: 122.262 ms -taco reference time: 123.152 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 -C1_dimension: 36417, C2_dimension: 64, vals: 2330688 -D1_dimension: 36417, D2_dimension: 64, 
vals: 2330688 -E1_dimension: 36417, E2_dimension: 64, vals: 2330688 - - -kernel execution time: 47.9407 ms -fused time: 48.4339 - -kernel execution time: 89.2157 ms -sddmm time: 89.8924 - -kernel execution time: 18.2009 ms -spmm time: 18.7261 - -kernel execution time: 123.559 ms -taco reference time: 124.405 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 -C1_dimension: 36417, C2_dimension: 64, vals: 2330688 -D1_dimension: 36417, D2_dimension: 64, vals: 2330688 -E1_dimension: 36417, E2_dimension: 64, vals: 2330688 - - -kernel execution time: 43.2059 ms -fused time: 43.6957 - -kernel execution time: 90.4258 ms -sddmm time: 91.1259 - -kernel execution time: 18.2655 ms -spmm time: 18.7701 - -kernel execution time: 123.565 ms -taco reference time: 124.302 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 -C1_dimension: 36417, C2_dimension: 64, vals: 2330688 -D1_dimension: 36417, D2_dimension: 64, vals: 2330688 -E1_dimension: 36417, E2_dimension: 64, vals: 2330688 - - -kernel execution time: 48.4004 ms -fused time: 48.9337 - -kernel execution time: 85.0973 ms -sddmm time: 85.6769 - -kernel execution time: 18.1666 ms -spmm time: 18.6607 - -kernel execution time: 123.347 ms -taco reference time: 124.257 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 -C1_dimension: 36417, C2_dimension: 64, vals: 2330688 -D1_dimension: 36417, D2_dimension: 64, vals: 2330688 -E1_dimension: 36417, E2_dimension: 64, vals: 2330688 - - 
-kernel execution time: 25.3405 ms -fused time: 25.8282 - -kernel execution time: 87.1326 ms -sddmm time: 87.7761 - -kernel execution time: 12.9441 ms -spmm time: 13.4425 - -kernel execution time: 132.388 ms -taco reference time: 133.056 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 -C1_dimension: 36417, C2_dimension: 64, vals: 2330688 -D1_dimension: 36417, D2_dimension: 64, vals: 2330688 -E1_dimension: 36417, E2_dimension: 64, vals: 2330688 - - -kernel execution time: 26.5881 ms -fused time: 27.0669 - -kernel execution time: 85.9749 ms -sddmm time: 86.5764 - -kernel execution time: 12.5752 ms -spmm time: 13.1009 - -kernel execution time: 131.368 ms -taco reference time: 132.072 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 -C1_dimension: 36417, C2_dimension: 64, vals: 2330688 -D1_dimension: 36417, D2_dimension: 64, vals: 2330688 -E1_dimension: 36417, E2_dimension: 64, vals: 2330688 - - -kernel execution time: 177.141 ms -fused time: 177.635 - -kernel execution time: 83.6231 ms -sddmm time: 84.2074 - -kernel execution time: 303.927 ms -spmm time: 304.455 - -kernel execution time: 5553.72 ms -taco reference time: 5554.89 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 -C1_dimension: 36417, C2_dimension: 64, vals: 2330688 -D1_dimension: 36417, D2_dimension: 64, vals: 2330688 -E1_dimension: 36417, E2_dimension: 64, vals: 2330688 - - -kernel execution time: 177.24 ms -fused time: 177.718 - -kernel 
execution time: 83.5235 ms -sddmm time: 84.0624 - -kernel execution time: 299.135 ms -spmm time: 299.642 - -kernel execution time: 5568.94 ms -taco reference time: 5570.07 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 -C1_dimension: 36417, C2_dimension: 64, vals: 2330688 -D1_dimension: 36417, D2_dimension: 64, vals: 2330688 -E1_dimension: 36417, E2_dimension: 64, vals: 2330688 - - -kernel execution time: 177.334 ms -fused time: 177.831 - -kernel execution time: 83.7814 ms -sddmm time: 84.3619 - -kernel execution time: 302.13 ms -spmm time: 302.653 - -kernel execution time: 5535.64 ms -taco reference time: 5536.87 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 -C1_dimension: 36417, C2_dimension: 64, vals: 2330688 -D1_dimension: 36417, D2_dimension: 64, vals: 2330688 -E1_dimension: 36417, E2_dimension: 64, vals: 2330688 - - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 -C1_dimension: 36417, C2_dimension: 64, vals: 2330688 -D1_dimension: 36417, D2_dimension: 64, vals: 2330688 -E1_dimension: 36417, E2_dimension: 64, vals: 2330688 - - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 -C1_dimension: 36417, C2_dimension: 64, vals: 2330688 -D1_dimension: 36417, D2_dimension: 64, vals: 2330688 -E1_dimension: 36417, E2_dimension: 
64, vals: 2330688 - - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 -C1_dimension: 36417, C2_dimension: 64, vals: 2330688 -D1_dimension: 36417, D2_dimension: 64, vals: 2330688 -E1_dimension: 36417, E2_dimension: 64, vals: 2330688 - - -kernel execution time: 180.923 ms -fused time: 181.39 - -kernel execution time: 88.0592 ms -sddmm time: 88.6258 - -kernel execution time: 300.533 ms -spmm time: 301.047 - -kernel execution time: 5549.25 ms -taco reference time: 5550.45 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 -C1_dimension: 36417, C2_dimension: 64, vals: 2330688 -D1_dimension: 36417, D2_dimension: 64, vals: 2330688 -E1_dimension: 36417, E2_dimension: 64, vals: 2330688 - - -kernel execution time: 27.7589 ms -fused time: 28.2424 - -kernel execution time: 87.4027 ms -sddmm time: 88.0292 - -kernel execution time: 13.0621 ms -spmm time: 13.5896 - -kernel execution time: 131.501 ms -taco reference time: 132.191 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 -C1_dimension: 36417, C2_dimension: 64, vals: 2330688 -D1_dimension: 36417, D2_dimension: 64, vals: 2330688 -E1_dimension: 36417, E2_dimension: 64, vals: 2330688 - - -kernel execution time: 27.1159 ms -fused time: 27.6123 - -kernel execution time: 88.1805 ms -sddmm time: 88.8475 - -kernel execution time: 13.2301 ms -spmm time: 13.7512 - -kernel execution time: 130.96 ms -taco reference time: 131.633 - -sddmm-spmm execution - 
------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 -C1_dimension: 36417, C2_dimension: 64, vals: 2330688 -D1_dimension: 36417, D2_dimension: 64, vals: 2330688 -E1_dimension: 36417, E2_dimension: 64, vals: 2330688 - - -kernel execution time: 11.1791 ms -fused time: 11.6596 - -kernel execution time: 324.829 ms -sddmm time: 325.459 - -kernel execution time: 5.82413 ms -spmm time: 6.613 - -kernel execution time: 162.505 ms -taco reference time: 163.319 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 -C1_dimension: 36417, C2_dimension: 64, vals: 2330688 -D1_dimension: 36417, D2_dimension: 64, vals: 2330688 -E1_dimension: 36417, E2_dimension: 64, vals: 2330688 - - -kernel execution time: 167.093 ms -fused time: 167.577 - -kernel execution time: 264.158 ms -sddmm time: 264.712 - -kernel execution time: 68.6915 ms -spmm time: 69.2406 - -kernel execution time: 5581.71 ms -taco reference time: 5582.83 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 -C1_dimension: 36417, C2_dimension: 64, vals: 2330688 -D1_dimension: 36417, D2_dimension: 64, vals: 2330688 -E1_dimension: 36417, E2_dimension: 64, vals: 2330688 - - -kernel execution time: 170.702 ms -fused time: 171.176 - -kernel execution time: 88.5905 ms -sddmm time: 89.1447 - -kernel execution time: 68.5964 ms -spmm time: 69.1031 - -kernel execution time: 5551.85 ms -taco reference time: 5552.97 - -sddmm-spmm execution - ------------------------------------------ 
-/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 -C1_dimension: 36417, C2_dimension: 64, vals: 2330688 -D1_dimension: 36417, D2_dimension: 64, vals: 2330688 -E1_dimension: 36417, E2_dimension: 64, vals: 2330688 - - -kernel execution time: 10.8645 ms -fused time: 11.3531 - -kernel execution time: 9.04029 ms -sddmm time: 9.79108 - -kernel execution time: 5.63795 ms -spmm time: 6.23454 - -kernel execution time: 131.822 ms -taco reference time: 132.52 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/rma10/rma10.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 46835, B2_dimension: 46835, vals: 2374001 -C1_dimension: 46835, C2_dimension: 64, vals: 2997440 -D1_dimension: 46835, D2_dimension: 64, vals: 2997440 -E1_dimension: 46835, E2_dimension: 64, vals: 2997440 - - -kernel execution time: 9.65163 ms -fused time: 10.1436 - -kernel execution time: 9.70327 ms -sddmm time: 10.2929 - -kernel execution time: 4.85235 ms -spmm time: 5.40286 - -kernel execution time: 74.2349 ms -taco reference time: 74.8374 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/cant/cant.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 62451, B2_dimension: 62451, vals: 4007383 -C1_dimension: 62451, C2_dimension: 64, vals: 3996864 -D1_dimension: 62451, D2_dimension: 64, vals: 3996864 -E1_dimension: 62451, E2_dimension: 64, vals: 3996864 - - -kernel execution time: 15.2637 ms -fused time: 15.7881 - -kernel execution time: 12.0484 ms -sddmm time: 12.7139 - -kernel execution time: 7.9269 ms -spmm time: 8.5266 - -kernel execution time: 122.713 ms -taco reference time: 123.431 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx 
-ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 5558326, B2_dimension: 5558326, vals: 59524291 -C1_dimension: 5558326, C2_dimension: 64, vals: 355732864 -D1_dimension: 5558326, D2_dimension: 64, vals: 355732864 -E1_dimension: 5558326, E2_dimension: 64, vals: 355732864 - - -kernel execution time: 750.953 ms -fused time: 751.849 - -kernel execution time: 410.668 ms -sddmm time: 411.252 - -kernel execution time: 490.401 ms -spmm time: 490.993 - -kernel execution time: 7382.94 ms -taco reference time: 7384.02 - - - --------------------------------- - - - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 10974, B2_dimension: 10974, vals: 428650 -C1_dimension: 10974, C2_dimension: 64, vals: 702336 -D1_dimension: 10974, D2_dimension: 64, vals: 702336 -E1_dimension: 10974, E2_dimension: 64, vals: 702336 - - -kernel execution time: 2.89124 ms -fused time: 3.33064 - -kernel execution time: 2.48885 ms -sddmm time: 2.80581 - -kernel execution time: 1.25714 ms -sddmm time: 1.58645 - -kernel execution time: 1.82611 ms -spmm time: 2.10693 - -kernel execution time: 14.7536 ms -taco reference time: 15.1553 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 -C1_dimension: 36417, C2_dimension: 64, vals: 2330688 -D1_dimension: 36417, D2_dimension: 64, vals: 2330688 -E1_dimension: 36417, E2_dimension: 64, vals: 2330688 - - -kernel execution time: 10.4526 ms -fused time: 10.9812 - -kernel execution time: 9.28251 ms -sddmm time: 9.93109 - -kernel execution time: 5.36035 ms -sddmm time: 5.99358 - -kernel execution time: 5.29728 ms -spmm time: 5.86825 - -kernel execution time: 132.268 ms -taco reference time: 132.952 - -sddmm-spmm execution - 
------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/rma10/rma10.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 46835, B2_dimension: 46835, vals: 2374001 -C1_dimension: 46835, C2_dimension: 64, vals: 2997440 -D1_dimension: 46835, D2_dimension: 64, vals: 2997440 -E1_dimension: 46835, E2_dimension: 64, vals: 2997440 - - -kernel execution time: 9.78667 ms -fused time: 10.2677 - -kernel execution time: 9.62847 ms -sddmm time: 10.2355 - -kernel execution time: 3.92285 ms -sddmm time: 4.52461 - -kernel execution time: 4.91246 ms -spmm time: 5.38467 - -kernel execution time: 74.8226 ms -taco reference time: 75.4131 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/consph/consph.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 83334, B2_dimension: 83334, vals: 6010480 -C1_dimension: 83334, C2_dimension: 64, vals: 5333376 -D1_dimension: 83334, D2_dimension: 64, vals: 5333376 -E1_dimension: 83334, E2_dimension: 64, vals: 5333376 - - -kernel execution time: 19.7265 ms -fused time: 20.2664 - -kernel execution time: 17.1571 ms -sddmm time: 17.8366 - -kernel execution time: 10.5179 ms -sddmm time: 11.1615 - -kernel execution time: 10.7719 ms -spmm time: 11.4141 - -kernel execution time: 186.633 ms -taco reference time: 187.406 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 121192, B2_dimension: 121192, vals: 2624331 -C1_dimension: 121192, C2_dimension: 64, vals: 7756288 -D1_dimension: 121192, D2_dimension: 64, vals: 7756288 -E1_dimension: 121192, E2_dimension: 64, vals: 7756288 - - -kernel execution time: 28.3142 ms -fused time: 28.8151 - -kernel execution time: 20.3455 ms -sddmm time: 21.0059 - -kernel execution time: 12.2316 ms -sddmm time: 12.8542 - -kernel execution time: 13.8246 ms -spmm time: 
14.4268 - -kernel execution time: 100.583 ms -taco reference time: 101.304 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/scircuit/scircuit.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 170998, B2_dimension: 170998, vals: 958936 -C1_dimension: 170998, C2_dimension: 64, vals: 10943872 -D1_dimension: 170998, D2_dimension: 64, vals: 10943872 -E1_dimension: 170998, E2_dimension: 64, vals: 10943872 - - -kernel execution time: 20.038 ms -fused time: 20.555 - -kernel execution time: 11.3385 ms -sddmm time: 11.9822 - -kernel execution time: 8.08082 ms -sddmm time: 8.71341 - -kernel execution time: 10.9562 ms -spmm time: 11.5782 - -kernel execution time: 80.9289 ms -taco reference time: 81.6333 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 206500, B2_dimension: 206500, vals: 1273389 -C1_dimension: 206500, C2_dimension: 64, vals: 13216000 -D1_dimension: 206500, D2_dimension: 64, vals: 13216000 -E1_dimension: 206500, E2_dimension: 64, vals: 13216000 - - -kernel execution time: 25.3126 ms -fused time: 25.8254 - -kernel execution time: 15.9278 ms -sddmm time: 16.6406 - -kernel execution time: 10.5087 ms -sddmm time: 11.2503 - -kernel execution time: 14.3281 ms -spmm time: 14.9822 - -kernel execution time: 98.03 ms -taco reference time: 98.7014 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 1000005, B2_dimension: 1000005, vals: 3105536 -C1_dimension: 1000005, C2_dimension: 64, vals: 64000320 -D1_dimension: 1000005, D2_dimension: 64, vals: 64000320 -E1_dimension: 1000005, E2_dimension: 64, vals: 64000320 - - -kernel execution time: 77.5645 ms -fused time: 78.0892 - -kernel execution 
time: 31.7247 ms -sddmm time: 32.4147 - -kernel execution time: 26.0367 ms -sddmm time: 26.7311 - -kernel execution time: 47.1564 ms -spmm time: 47.8767 - -kernel execution time: 444.658 ms -taco reference time: 445.356 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 5558326, B2_dimension: 5558326, vals: 59524291 -C1_dimension: 5558326, C2_dimension: 64, vals: 355732864 -D1_dimension: 5558326, D2_dimension: 64, vals: 355732864 -E1_dimension: 5558326, E2_dimension: 64, vals: 355732864 - - -kernel execution time: 760.552 ms -fused time: 761.497 - -kernel execution time: 414.806 ms -sddmm time: 415.511 - -kernel execution time: 347.288 ms -sddmm time: 348.046 - -kernel execution time: 493.652 ms -spmm time: 494.215 - -kernel execution time: 7069.3 ms -taco reference time: 7070.64 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/cant/cant.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 62451, B2_dimension: 62451, vals: 4007383 -C1_dimension: 62451, C2_dimension: 64, vals: 3996864 -D1_dimension: 62451, D2_dimension: 64, vals: 3996864 -E1_dimension: 62451, E2_dimension: 64, vals: 3996864 - - -kernel execution time: 14.868 ms -fused time: 15.3593 - -kernel execution time: 12.1237 ms -sddmm time: 12.798 - -kernel execution time: 7.68559 ms -sddmm time: 8.34388 - -kernel execution time: 7.93647 ms -spmm time: 8.56812 - -kernel execution time: 122.125 ms -taco reference time: 122.846 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/shipsec1/shipsec1.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 140874, B2_dimension: 140874, vals: 7813404 -C1_dimension: 140874, C2_dimension: 64, vals: 9015936 -D1_dimension: 140874, D2_dimension: 64, vals: 9015936 -E1_dimension: 
140874, E2_dimension: 64, vals: 9015936 - - -kernel execution time: 28.6635 ms -fused time: 29.1538 - -kernel execution time: 24.0642 ms -sddmm time: 24.694 - -kernel execution time: 15.2 ms -sddmm time: 15.875 - -kernel execution time: 16.0406 ms -spmm time: 16.6827 - -kernel execution time: 242.63 ms -taco reference time: 243.336 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/shipsec1/shipsec1.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 140874, B2_dimension: 140874, vals: 7813404 -C1_dimension: 140874, C2_dimension: 128, vals: 18031872 -D1_dimension: 140874, D2_dimension: 128, vals: 18031872 -E1_dimension: 140874, E2_dimension: 128, vals: 18031872 - - -kernel execution time: 50.9773 ms -fused time: 51.4656 - -kernel execution time: 42.0404 ms -sddmm time: 42.7352 - -kernel execution time: 24.4547 ms -sddmm time: 25.1418 - -kernel execution time: 28.4623 ms -spmm time: 29.1722 - -kernel execution time: 903.853 ms -taco reference time: 904.701 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/shipsec1/shipsec1.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 140874, B2_dimension: 140874, vals: 7813404 -C1_dimension: 140874, C2_dimension: 250, vals: 35218500 -D1_dimension: 140874, D2_dimension: 250, vals: 35218500 -E1_dimension: 140874, E2_dimension: 250, vals: 35218500 - - -kernel execution time: 97.1385 ms -fused time: 97.6193 - -kernel execution time: 87.9795 ms -sddmm time: 88.6535 - -kernel execution time: 41.8878 ms -sddmm time: 42.5463 - -kernel execution time: 54.1433 ms -spmm time: 54.7894 - -kernel execution time: 3669.52 ms -taco reference time: 3670.78 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/shipsec1/shipsec1.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 140874, B2_dimension: 140874, vals: 
7813404 -C1_dimension: 140874, C2_dimension: 512, vals: 72127488 -D1_dimension: 140874, D2_dimension: 512, vals: 72127488 -E1_dimension: 140874, E2_dimension: 512, vals: 72127488 - - -kernel execution time: 200.849 ms -fused time: 201.329 - -kernel execution time: 208.737 ms -sddmm time: 209.393 - -kernel execution time: 81.0923 ms -sddmm time: 81.7181 - -kernel execution time: 106.669 ms -spmm time: 107.272 - -kernel execution time: 15631.7 ms -taco reference time: 15632.4 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/shipsec1/shipsec1.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 140874, B2_dimension: 140874, vals: 7813404 -C1_dimension: 140874, C2_dimension: 32, vals: 4507968 -D1_dimension: 140874, D2_dimension: 32, vals: 4507968 -E1_dimension: 140874, E2_dimension: 32, vals: 4507968 - - -kernel execution time: 16.5631 ms -fused time: 17.0602 - -kernel execution time: 15.2542 ms -sddmm time: 15.8919 - -kernel execution time: 9.9104 ms -sddmm time: 10.5671 - -kernel execution time: 9.61101 ms -spmm time: 10.2251 - -kernel execution time: 68.1735 ms -taco reference time: 68.8921 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/shipsec1/shipsec1.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 140874, B2_dimension: 140874, vals: 7813404 -C1_dimension: 140874, C2_dimension: 256, vals: 36063744 -D1_dimension: 140874, D2_dimension: 256, vals: 36063744 -E1_dimension: 140874, E2_dimension: 256, vals: 36063744 - - -kernel execution time: 98.882 ms -fused time: 99.3547 - -kernel execution time: 90.4755 ms -sddmm time: 91.136 - -kernel execution time: 42.7487 ms -sddmm time: 43.4726 - -kernel execution time: 55.0127 ms -spmm time: 55.731 - -kernel execution time: 3836.15 ms -taco reference time: 3837.42 - - - - - ---------- single threads - - -sddmm-spmm execution - 
------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 10974, B2_dimension: 10974, vals: 428650 -C1_dimension: 10974, C2_dimension: 64, vals: 702336 -D1_dimension: 10974, D2_dimension: 64, vals: 702336 -E1_dimension: 10974, E2_dimension: 64, vals: 702336 - - -kernel execution time: 22.3045 ms -fused time: 22.7793 - -kernel execution time: 8.91826 ms -sddmm time: 9.46409 - -kernel execution time: 9.62695 ms -sddmm time: 10.1105 - -kernel execution time: 10.8309 ms -spmm time: 11.2862 - -kernel execution time: 554.747 ms -taco reference time: 555.315 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 -C1_dimension: 36417, C2_dimension: 64, vals: 2330688 -D1_dimension: 36417, D2_dimension: 64, vals: 2330688 -E1_dimension: 36417, E2_dimension: 64, vals: 2330688 - - -kernel execution time: 166.569 ms -fused time: 167.058 - -kernel execution time: 83.9979 ms -sddmm time: 84.5309 - -kernel execution time: 88.9971 ms -sddmm time: 89.5559 - -kernel execution time: 68.5334 ms -spmm time: 69.0587 - -kernel execution time: 5562.04 ms -taco reference time: 5563.12 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/rma10/rma10.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 46835, B2_dimension: 46835, vals: 2374001 -C1_dimension: 46835, C2_dimension: 64, vals: 2997440 -D1_dimension: 46835, D2_dimension: 64, vals: 2997440 -E1_dimension: 46835, E2_dimension: 64, vals: 2997440 - - -kernel execution time: 94.7764 ms -fused time: 95.2526 - -kernel execution time: 47.3174 ms -sddmm time: 47.8674 - -kernel execution time: 49.7766 ms -sddmm time: 50.3372 - -kernel execution time: 51.3685 ms -spmm time: 
51.8719 - -kernel execution time: 3073.44 ms -taco reference time: 3074.55 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/cant/cant.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 62451, B2_dimension: 62451, vals: 4007383 -C1_dimension: 62451, C2_dimension: 64, vals: 3996864 -D1_dimension: 62451, D2_dimension: 64, vals: 3996864 -E1_dimension: 62451, E2_dimension: 64, vals: 3996864 - - -kernel execution time: 158.175 ms -fused time: 158.637 - -kernel execution time: 78.3163 ms -sddmm time: 78.8675 - -kernel execution time: 82.3237 ms -sddmm time: 82.8606 - -kernel execution time: 76.2056 ms -spmm time: 76.7067 - -kernel execution time: 5178.46 ms -taco reference time: 5179.53 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/consph/consph.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 83334, B2_dimension: 83334, vals: 6010480 -C1_dimension: 83334, C2_dimension: 64, vals: 5333376 -D1_dimension: 83334, D2_dimension: 64, vals: 5333376 -E1_dimension: 83334, E2_dimension: 64, vals: 5333376 - - -kernel execution time: 241.194 ms -fused time: 241.676 - -kernel execution time: 117.775 ms -sddmm time: 118.325 - -kernel execution time: 124.006 ms -sddmm time: 124.563 - -kernel execution time: 117.052 ms -spmm time: 117.594 - -kernel execution time: 7844.57 ms -taco reference time: 7845.69 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 121192, B2_dimension: 121192, vals: 2624331 -C1_dimension: 121192, C2_dimension: 64, vals: 7756288 -D1_dimension: 121192, D2_dimension: 64, vals: 7756288 -E1_dimension: 121192, E2_dimension: 64, vals: 7756288 - - -kernel execution time: 201.49 ms -fused time: 201.973 - -kernel execution time: 90.6759 ms -sddmm time: 91.2506 - 
-kernel execution time: 93.0462 ms -sddmm time: 93.6053 - -kernel execution time: 119.005 ms -spmm time: 119.547 - -kernel execution time: 3567.55 ms -taco reference time: 3568.67 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/shipsec1/shipsec1.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 140874, B2_dimension: 140874, vals: 7813404 -C1_dimension: 140874, C2_dimension: 64, vals: 9015936 -D1_dimension: 140874, D2_dimension: 64, vals: 9015936 -E1_dimension: 140874, E2_dimension: 64, vals: 9015936 - - -kernel execution time: 315.238 ms -fused time: 315.723 - -kernel execution time: 156.048 ms -sddmm time: 156.588 - -kernel execution time: 164.148 ms -sddmm time: 164.747 - -kernel execution time: 162.502 ms -spmm time: 163.021 - -kernel execution time: 10131.2 ms -taco reference time: 10132.3 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/scircuit/scircuit.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 170998, B2_dimension: 170998, vals: 958936 -C1_dimension: 170998, C2_dimension: 64, vals: 10943872 -D1_dimension: 170998, D2_dimension: 64, vals: 10943872 -E1_dimension: 170998, E2_dimension: 64, vals: 10943872 - - -kernel execution time: 87.9511 ms -fused time: 88.4267 - -kernel execution time: 37.6228 ms -sddmm time: 38.1792 - -kernel execution time: 37.8418 ms -sddmm time: 38.3903 - -kernel execution time: 84.4997 ms -spmm time: 85.037 - -kernel execution time: 1330.01 ms -taco reference time: 1330.63 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 206500, B2_dimension: 206500, vals: 1273389 -C1_dimension: 206500, C2_dimension: 64, vals: 13216000 -D1_dimension: 206500, D2_dimension: 64, vals: 13216000 -E1_dimension: 206500, E2_dimension: 
64, vals: 13216000 - - -kernel execution time: 92.8914 ms -fused time: 93.3697 - -kernel execution time: 39.7714 ms -sddmm time: 40.3051 - -kernel execution time: 40.1835 ms -sddmm time: 40.7458 - -kernel execution time: 98.0818 ms -spmm time: 98.5997 - -kernel execution time: 1721.01 ms -taco reference time: 1721.64 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 1000005, B2_dimension: 1000005, vals: 3105536 -C1_dimension: 1000005, C2_dimension: 64, vals: 64000320 -D1_dimension: 1000005, D2_dimension: 64, vals: 64000320 -E1_dimension: 1000005, E2_dimension: 64, vals: 64000320 - - -kernel execution time: 259.845 ms -fused time: 260.329 - -kernel execution time: 95.8311 ms -sddmm time: 96.3809 - -kernel execution time: 97.6925 ms -sddmm time: 98.2397 - -kernel execution time: 292.415 ms -spmm time: 292.952 - -kernel execution time: 4292.03 ms -taco reference time: 4293.1 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 5558326, B2_dimension: 5558326, vals: 59524291 -C1_dimension: 5558326, C2_dimension: 64, vals: 355732864 -D1_dimension: 5558326, D2_dimension: 64, vals: 355732864 -E1_dimension: 5558326, E2_dimension: 64, vals: 355732864 - - -kernel execution time: 3326.66 ms -fused time: 3327.64 - -kernel execution time: 1617.82 ms -sddmm time: 1618.36 - -kernel execution time: 1672.73 ms -sddmm time: 1673.27 - -kernel execution time: 3199.32 ms -spmm time: 3200.35 - -kernel execution time: 88682 ms -taco reference time: 88683.1 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 5558326, B2_dimension: 5558326, vals: 59524291 
-C1_dimension: 5558326, C2_dimension: 64, vals: 355732864 -D1_dimension: 5558326, D2_dimension: 64, vals: 355732864 -E1_dimension: 5558326, E2_dimension: 64, vals: 355732864 - - -kernel execution time: 722.484 ms -fused time: 723.506 - -kernel execution time: 613.844 ms -sddmm time: 614.401 - -kernel execution time: 331.43 ms -sddmm time: 331.978 - -kernel execution time: 463.752 ms -spmm time: 464.328 - -kernel execution time: 8864.13 ms -taco reference time: 8865.18 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/shipsec1/shipsec1.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 140874, B2_dimension: 140874, vals: 7813404 -C1_dimension: 140874, C2_dimension: 16, vals: 2253984 -D1_dimension: 140874, D2_dimension: 16, vals: 2253984 -E1_dimension: 140874, E2_dimension: 16, vals: 2253984 - - -kernel execution time: 10.0607 ms -fused time: 10.5457 - -kernel execution time: 8.70278 ms -sddmm time: 9.26539 - -kernel execution time: 6.88021 ms -sddmm time: 7.49853 - -kernel execution time: 5.91127 ms -spmm time: 6.50028 - -kernel execution time: 23.776 ms -taco reference time: 24.3947 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 1000005, B2_dimension: 1000005, vals: 3105536 -C1_dimension: 1000005, C2_dimension: 64, vals: 64000320 -D1_dimension: 1000005, D2_dimension: 64, vals: 64000320 -E1_dimension: 1000005, E2_dimension: 64, vals: 64000320 - - -kernel execution time: 179.752 ms -fused time: 180.214 - -kernel execution time: 170.678 ms -sddmm time: 171.224 - -kernel execution time: 67.5166 ms -sddmm time: 68.0688 - -kernel execution time: 168.557 ms -spmm time: 169.083 - -kernel execution time: 2452.7 ms -taco reference time: 2453.34 - -sddmm-spmm execution - ------------------------------------------ 
-/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 1000005, B2_dimension: 1000005, vals: 3105536 -C1_dimension: 1000005, C2_dimension: 64, vals: 64000320 -D1_dimension: 1000005, D2_dimension: 64, vals: 64000320 -E1_dimension: 1000005, E2_dimension: 64, vals: 64000320 - - -kernel execution time: 111.508 ms -fused time: 111.983 - -kernel execution time: 171.316 ms -sddmm time: 171.863 - -kernel execution time: 40.3219 ms -sddmm time: 40.8676 - -kernel execution time: 91.8855 ms -spmm time: 92.3888 - -kernel execution time: 1349.98 ms -taco reference time: 1350.57 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 1000005, B2_dimension: 1000005, vals: 3105536 -C1_dimension: 1000005, C2_dimension: 64, vals: 64000320 -D1_dimension: 1000005, D2_dimension: 64, vals: 64000320 -E1_dimension: 1000005, E2_dimension: 64, vals: 64000320 - - -kernel execution time: 84.4185 ms -fused time: 84.8803 - -kernel execution time: 131.898 ms -sddmm time: 132.465 - -kernel execution time: 27.6062 ms -sddmm time: 28.2117 - -kernel execution time: 59.0816 ms -spmm time: 59.6189 - -kernel execution time: 731.805 ms -taco reference time: 732.441 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 1000005, B2_dimension: 1000005, vals: 3105536 -C1_dimension: 1000005, C2_dimension: 64, vals: 64000320 -D1_dimension: 1000005, D2_dimension: 64, vals: 64000320 -E1_dimension: 1000005, E2_dimension: 64, vals: 64000320 - - -kernel execution time: 76.4489 ms -fused time: 76.9087 - -kernel execution time: 65.9875 ms -sddmm time: 66.5522 - -kernel execution time: 25.2905 ms -sddmm time: 25.8759 - -kernel execution time: 50.1563 ms -spmm time: 50.6842 - 
-kernel execution time: 397.479 ms -taco reference time: 398.109 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 1000005, B2_dimension: 1000005, vals: 3105536 -C1_dimension: 1000005, C2_dimension: 64, vals: 64000320 -D1_dimension: 1000005, D2_dimension: 64, vals: 64000320 -E1_dimension: 1000005, E2_dimension: 64, vals: 64000320 - - -kernel execution time: 74.0227 ms -fused time: 74.5259 - -kernel execution time: 40.2983 ms -sddmm time: 40.889 - -kernel execution time: 25.1349 ms -sddmm time: 25.7522 - -kernel execution time: 46.3853 ms -spmm time: 46.9556 - -kernel execution time: 418.693 ms -taco reference time: 419.345 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 5558326, B2_dimension: 5558326, vals: 59524291 -C1_dimension: 5558326, C2_dimension: 64, vals: 355732864 -D1_dimension: 5558326, D2_dimension: 64, vals: 355732864 -E1_dimension: 5558326, E2_dimension: 64, vals: 355732864 - - -kernel execution time: 1982.06 ms -fused time: 1982.93 - -kernel execution time: 1668.23 ms -sddmm time: 1668.77 - -kernel execution time: 962.046 ms -sddmm time: 962.591 - -kernel execution time: 1821.97 ms -spmm time: 1822.46 - -kernel execution time: 47772.2 ms -taco reference time: 47773.4 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 5558326, B2_dimension: 5558326, vals: 59524291 -C1_dimension: 5558326, C2_dimension: 64, vals: 355732864 -D1_dimension: 5558326, D2_dimension: 64, vals: 355732864 -E1_dimension: 5558326, E2_dimension: 64, vals: 355732864 - - -kernel execution time: 1143.12 ms -fused time: 1144.05 - -kernel execution 
time: 1254.57 ms -sddmm time: 1255.18 - -kernel execution time: 539.54 ms -sddmm time: 540.136 - -kernel execution time: 1005.14 ms -spmm time: 1005.69 - -kernel execution time: 25805.1 ms -taco reference time: 25806.1 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 5558326, B2_dimension: 5558326, vals: 59524291 -C1_dimension: 5558326, C2_dimension: 64, vals: 355732864 -D1_dimension: 5558326, D2_dimension: 64, vals: 355732864 -E1_dimension: 5558326, E2_dimension: 64, vals: 355732864 - - -kernel execution time: 782.496 ms -fused time: 783.574 - -kernel execution time: 872.793 ms -sddmm time: 873.351 - -kernel execution time: 353.256 ms -sddmm time: 353.8 - -kernel execution time: 606.511 ms -spmm time: 607.041 - -kernel execution time: 15198.9 ms -taco reference time: 15199.9 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 5558326, B2_dimension: 5558326, vals: 59524291 -C1_dimension: 5558326, C2_dimension: 64, vals: 355732864 -D1_dimension: 5558326, D2_dimension: 64, vals: 355732864 -E1_dimension: 5558326, E2_dimension: 64, vals: 355732864 - - -kernel execution time: 729.345 ms -fused time: 730.242 - -kernel execution time: 608.324 ms -sddmm time: 608.908 - -kernel execution time: 334.109 ms -sddmm time: 334.653 - -kernel execution time: 471.211 ms -spmm time: 471.77 - -kernel execution time: 8630.19 ms -taco reference time: 8631.29 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 5558326, B2_dimension: 5558326, vals: 59524291 -C1_dimension: 5558326, C2_dimension: 64, vals: 355732864 -D1_dimension: 5558326, 
D2_dimension: 64, vals: 355732864 -E1_dimension: 5558326, E2_dimension: 64, vals: 355732864 - - -kernel execution time: 736.326 ms -fused time: 737.203 - -kernel execution time: 482.639 ms -sddmm time: 483.19 - -kernel execution time: 333.58 ms -sddmm time: 334.131 - -kernel execution time: 478.49 ms -spmm time: 479.051 - -kernel execution time: 7244.99 ms -taco reference time: 7246.13 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/cant/cant.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 62451, B2_dimension: 62451, vals: 4007383 -C1_dimension: 62451, C2_dimension: 64, vals: 3996864 -D1_dimension: 62451, D2_dimension: 64, vals: 3996864 -E1_dimension: 62451, E2_dimension: 64, vals: 3996864 - - -kernel execution time: 13.4143 ms -fused time: 13.9143 - -kernel execution time: 11.2836 ms -sddmm time: 12.0149 - -kernel execution time: 7.35609 ms -sddmm time: 8.06588 - -kernel execution time: 7.36916 ms -spmm time: 7.93476 - -kernel execution time: 120.287 ms -taco reference time: 120.948 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/cant/cant.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 62451, B2_dimension: 62451, vals: 4007383 -C1_dimension: 62451, C2_dimension: 64, vals: 3996864 -D1_dimension: 62451, D2_dimension: 64, vals: 3996864 -E1_dimension: 62451, E2_dimension: 64, vals: 3996864 - - -kernel execution time: 156.322 ms -fused time: 156.802 - -kernel execution time: 77.0794 ms -sddmm time: 77.6574 - -kernel execution time: 81.2772 ms -sddmm time: 81.8141 - -kernel execution time: 74.4419 ms -spmm time: 74.9538 - -kernel execution time: 5091.25 ms -taco reference time: 5092.34 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/cant/cant.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 62451, B2_dimension: 62451, 
vals: 4007383 -C1_dimension: 62451, C2_dimension: 64, vals: 3996864 -D1_dimension: 62451, D2_dimension: 64, vals: 3996864 -E1_dimension: 62451, E2_dimension: 64, vals: 3996864 - - -kernel execution time: 160.868 ms -fused time: 161.347 - -kernel execution time: 78.1223 ms -sddmm time: 78.7031 - -kernel execution time: 82.4929 ms -sddmm time: 83.0729 - -kernel execution time: 77.24 ms -spmm time: 77.7896 - -kernel execution time: 5087.42 ms -taco reference time: 5088.53 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/cant/cant.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 62451, B2_dimension: 62451, vals: 4007383 -C1_dimension: 62451, C2_dimension: 64, vals: 3996864 -D1_dimension: 62451, D2_dimension: 64, vals: 3996864 -E1_dimension: 62451, E2_dimension: 64, vals: 3996864 - - -kernel execution time: 157.627 ms -fused time: 158.106 - -kernel execution time: 76.9497 ms -sddmm time: 77.5265 - -kernel execution time: 81.9491 ms -sddmm time: 82.4945 - -kernel execution time: 81.9841 ms -spmm time: 82.5149 - -kernel execution time: 5084.06 ms -taco reference time: 5085.15 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/cant/cant.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 62451, B2_dimension: 62451, vals: 4007383 -C1_dimension: 62451, C2_dimension: 64, vals: 3996864 -D1_dimension: 62451, D2_dimension: 64, vals: 3996864 -E1_dimension: 62451, E2_dimension: 64, vals: 3996864 - - -kernel execution time: 156.608 ms -fused time: 157.085 - -kernel execution time: 76.6969 ms -sddmm time: 77.2366 - -kernel execution time: 80.7238 ms -sddmm time: 81.2624 - -kernel execution time: 74.4498 ms -spmm time: 74.9694 - -kernel execution time: 5076.16 ms -taco reference time: 5077.28 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/cant/cant.mtx 
-ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 62451, B2_dimension: 62451, vals: 4007383 -C1_dimension: 62451, C2_dimension: 64, vals: 3996864 -D1_dimension: 62451, D2_dimension: 64, vals: 3996864 -E1_dimension: 62451, E2_dimension: 64, vals: 3996864 - - -kernel execution time: 156.489 ms -fused time: 156.996 - -kernel execution time: 77.2215 ms -sddmm time: 77.7763 - -kernel execution time: 81.2983 ms -sddmm time: 81.8357 - -kernel execution time: 75.4752 ms -spmm time: 76.0191 - -kernel execution time: 5087.37 ms -taco reference time: 5088.51 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/cant/cant.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 62451, B2_dimension: 62451, vals: 4007383 -C1_dimension: 62451, C2_dimension: 64, vals: 3996864 -D1_dimension: 62451, D2_dimension: 64, vals: 3996864 -E1_dimension: 62451, E2_dimension: 64, vals: 3996864 - - -kernel execution time: 156.515 ms -fused time: 156.991 - -kernel execution time: 76.9797 ms -sddmm time: 77.5298 - -kernel execution time: 81.4654 ms -sddmm time: 82.0017 - -kernel execution time: 76.1847 ms -spmm time: 76.693 - -kernel execution time: 5078.68 ms -taco reference time: 5079.85 - -sddmm-spmm execution - ------------------------------------------ -A(i) = B(i,j) * C(j,k) * v(k); -B1_dimension: 5, B2_dimension: 5, vals: 19 -C1_dimension: 5, C2_dimension: 5, vals: 19 -D1_dimension: 5, vals: 5 - - -sddmm-spmm execution - ------------------------------------------ -A(i) = B(i,j) * C(j,k) * v(k); -B1_dimension: 5, B2_dimension: 5, vals: 19 -C1_dimension: 5, C2_dimension: 5, vals: 19 -D1_dimension: 5, vals: 5 - - -sddmm-spmm execution - ------------------------------------------ -A(i) = B(i,j) * C(j,k) * v(k); -B1_dimension: 5, B2_dimension: 5, vals: 19 -C1_dimension: 5, C2_dimension: 5, vals: 19 -D1_dimension: 5, vals: 5 - - -sddmm-spmm execution - ------------------------------------------ -A(i) = B(i,j) * C(j,k) * 
v(k); -B1_dimension: 5, B2_dimension: 5, vals: 19 -C1_dimension: 5, C2_dimension: 5, vals: 19 -D1_dimension: 5, vals: 5 - - -sddmm-spmm execution - ------------------------------------------ -A(i) = B(i,j) * C(j,k) * v(k); -B1_dimension: 5, B2_dimension: 5, vals: 19 -C1_dimension: 5, C2_dimension: 5, vals: 19 -D1_dimension: 5, vals: 5 - - -sddmm-spmm execution - ------------------------------------------ -A(i) = B(i,j) * C(j,k) * v(k); -B1_dimension: 5, B2_dimension: 5, vals: 19 -C1_dimension: 5, C2_dimension: 5, vals: 19 -D1_dimension: 5, vals: 5 - - -sddmm-spmm execution - ------------------------------------------ -A(i) = B(i,j) * C(j,k) * v(k); -B1_dimension: 5, B2_dimension: 5, vals: 19 -C1_dimension: 5, C2_dimension: 5, vals: 19 -D1_dimension: 5, vals: 5 - - -sddmm-spmm execution - ------------------------------------------ -A(i) = B(i,j) * C(j,k) * v(k); -B1_dimension: 5, B2_dimension: 5, vals: 19 -C1_dimension: 5, C2_dimension: 5, vals: 19 -D1_dimension: 5, vals: 5 - - -sddmm-spmm execution - ------------------------------------------ -A(i) = B(i,j) * C(j,k) * v(k); -B1_dimension: 5, B2_dimension: 5, vals: 19 -C1_dimension: 5, C2_dimension: 5, vals: 19 -D1_dimension: 5, vals: 5 - - -sddmm-spmm execution - ------------------------------------------ -A(i) = B(i,j) * C(j,k) * v(k); -B1_dimension: 5, B2_dimension: 5, vals: 19 -C1_dimension: 5, C2_dimension: 5, vals: 19 -D1_dimension: 5, vals: 5 - - -sddmm-spmm execution - ------------------------------------------ -A(i) = B(i,j) * C(j,k) * v(k); -B1_dimension: 5, B2_dimension: 5, vals: 19 -C1_dimension: 5, C2_dimension: 5, vals: 19 -D1_dimension: 5, vals: 5 - - -sddmm-spmm execution - ------------------------------------------ -A(i) = B(i,j) * C(j,k) * v(k); -B1_dimension: 5, B2_dimension: 5, vals: 19 -C1_dimension: 5, C2_dimension: 5, vals: 19 -D1_dimension: 5, vals: 5 - - -sddmm-spmm execution - ------------------------------------------ -A(i) = B(i,j) * C(j,k) * v(k); -B1_dimension: 5, B2_dimension: 5, vals: 
19 -C1_dimension: 5, C2_dimension: 5, vals: 19 -D1_dimension: 5, vals: 5 - - -sddmm-spmm execution - ------------------------------------------ -A(i) = B(i,j) * C(j,k) * v(k); -B1_dimension: 5, B2_dimension: 5, vals: 19 -C1_dimension: 5, C2_dimension: 5, vals: 19 -D1_dimension: 5, vals: 5 - - -sddmm-spmm execution - ------------------------------------------ -A(i) = B(i,j) * C(j,k) * v(k); -B1_dimension: 5, B2_dimension: 5, vals: 19 -C1_dimension: 5, C2_dimension: 5, vals: 19 -D1_dimension: 5, vals: 5 - - -sddmm-spmm execution - ------------------------------------------ -A(i) = B(i,j) * C(j,k) * v(k); -B1_dimension: 5, B2_dimension: 5, vals: 19 -C1_dimension: 5, C2_dimension: 5, vals: 19 -D1_dimension: 5, vals: 5 - - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 5, B2_dimension: 5, vals: 19 -C1_dimension: 5, C2_dimension: 64, vals: 320 -D1_dimension: 5, D2_dimension: 64, vals: 320 -E1_dimension: 64, E2_dimension: 64, vals: 4096 - - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 5, B2_dimension: 5, vals: 19 -C1_dimension: 5, C2_dimension: 64, vals: 320 -D1_dimension: 5, D2_dimension: 64, vals: 320 -E1_dimension: 64, E2_dimension: 64, vals: 4096 - - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 -C1_dimension: 36417, C2_dimension: 64, vals: 2330688 -D1_dimension: 36417, D2_dimension: 64, vals: 2330688 -E1_dimension: 64, E2_dimension: 64, vals: 4096 - - -kernel 
execution time: 115.102 ms -fused time: 115.803 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/consph/consph.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 83334, B2_dimension: 83334, vals: 6010480 -C1_dimension: 83334, C2_dimension: 128, vals: 10666752 -D1_dimension: 83334, D2_dimension: 128, vals: 10666752 -E1_dimension: 83334, E2_dimension: 128, vals: 10666752 - - -kernel execution time: 30.977 ms -fused time: 35.4912 - -separate execution - -kernel execution time: 26.0898 ms -sddmm time: 26.6915 - -kernel execution time: 15.4341 ms -sddmm time: 16.0058 - -kernel execution time: 17.7466 ms -spmm time: 18.2995 - -reference execution - -kernel execution time: 694.171 ms -taco reference time: 694.888 -/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 121192, B2_dimension: 121192, vals: 2624331 -C1_dimension: 121192, C2_dimension: 128, vals: 15512576 -D1_dimension: 121192, D2_dimension: 128, vals: 15512576 -E1_dimension: 121192, E2_dimension: 128, vals: 15512576 - - -kernel execution time: 52.5109 ms -fused time: 56.6803 - -separate execution - -kernel execution time: 41.9638 ms -sddmm time: 42.5925 - -kernel execution time: 21.3537 ms -sddmm time: 21.9855 - -kernel execution time: 25.1185 ms -spmm time: 25.7047 - -reference execution - -kernel execution time: 323.01 ms -taco reference time: 323.699 -/home/min/a/kadhitha/ispc-examples/data/suitesparse/shipsec1/shipsec1.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 140874, B2_dimension: 140874, vals: 7813404 -C1_dimension: 140874, C2_dimension: 128, vals: 18031872 -D1_dimension: 140874, D2_dimension: 128, vals: 18031872 -E1_dimension: 140874, E2_dimension: 128, vals: 18031872 - - -kernel execution time: 45.3128 ms -fused time: 48.4929 - -separate execution - -kernel execution time: 39.7986 ms -sddmm time: 40.3901 - -kernel execution time: 
20.8296 ms -sddmm time: 21.432 - -kernel execution time: 25.0308 ms -spmm time: 25.5726 - -reference execution - -kernel execution time: 867.794 ms -taco reference time: 868.418 -/home/min/a/kadhitha/ispc-examples/data/suitesparse/scircuit/scircuit.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 170998, B2_dimension: 170998, vals: 958936 -C1_dimension: 170998, C2_dimension: 128, vals: 21887744 -D1_dimension: 170998, D2_dimension: 128, vals: 21887744 -E1_dimension: 170998, E2_dimension: 128, vals: 21887744 - - -kernel execution time: 34.2915 ms -fused time: 38.221 - -separate execution - -kernel execution time: 18.8777 ms -sddmm time: 19.4859 - -kernel execution time: 12.8794 ms -sddmm time: 16.5695 - -kernel execution time: 19.7876 ms -spmm time: 23.5933 - -reference execution - -kernel execution time: 114.374 ms -taco reference time: 115.03 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/consph/consph.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 83334, B2_dimension: 83334, vals: 6010480 -C1_dimension: 83334, C2_dimension: 128, vals: 10666752 -D1_dimension: 83334, D2_dimension: 128, vals: 10666752 -E1_dimension: 83334, E2_dimension: 128, vals: 10666752 - - -kernel execution time: 77.2194 ms -fused time: 78.1408 - -separate execution - -kernel execution time: 28.0545 ms -sddmm time: 28.625 - -kernel execution time: 15.7941 ms -sddmm time: 16.3986 - -kernel execution time: 18.1167 ms -spmm time: 18.7055 - -reference execution - -kernel execution time: 652.088 ms -taco reference time: 652.794 -/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 121192, B2_dimension: 121192, vals: 2624331 -C1_dimension: 121192, C2_dimension: 128, vals: 15512576 -D1_dimension: 121192, D2_dimension: 128, vals: 15512576 -E1_dimension: 121192, E2_dimension: 128, vals: 15512576 - - -kernel execution time: 100.999 ms 
-fused time: 104.98 - -separate execution - -kernel execution time: 42.4345 ms -sddmm time: 43.0804 - -kernel execution time: 21.5005 ms -sddmm time: 22.1326 - -kernel execution time: 25.1479 ms -spmm time: 25.7284 - -reference execution - -kernel execution time: 303.541 ms -taco reference time: 304.249 -/home/min/a/kadhitha/ispc-examples/data/suitesparse/shipsec1/shipsec1.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 140874, B2_dimension: 140874, vals: 7813404 -C1_dimension: 140874, C2_dimension: 128, vals: 18031872 -D1_dimension: 140874, D2_dimension: 128, vals: 18031872 -E1_dimension: 140874, E2_dimension: 128, vals: 18031872 - - -kernel execution time: 121.702 ms -fused time: 122.44 - -separate execution - -kernel execution time: 41.1645 ms -sddmm time: 41.7679 - -kernel execution time: 21.4454 ms -sddmm time: 22.062 - -kernel execution time: 25.7274 ms -spmm time: 26.3069 - -reference execution - -kernel execution time: 838.679 ms -taco reference time: 839.358 -/home/min/a/kadhitha/ispc-examples/data/suitesparse/scircuit/scircuit.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 170998, B2_dimension: 170998, vals: 958936 -C1_dimension: 170998, C2_dimension: 128, vals: 21887744 -D1_dimension: 170998, D2_dimension: 128, vals: 21887744 -E1_dimension: 170998, E2_dimension: 128, vals: 21887744 - - -kernel execution time: 49.6789 ms -fused time: 53.8345 - -separate execution - -kernel execution time: 19.3289 ms -sddmm time: 19.9476 - -kernel execution time: 12.9298 ms -sddmm time: 16.5522 - -kernel execution time: 19.7859 ms -spmm time: 23.3756 - -reference execution - -kernel execution time: 114.935 ms -taco reference time: 115.594 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/consph/consph.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 83334, B2_dimension: 83334, vals: 6010480 -C1_dimension: 83334, C2_dimension: 128, vals: 10666752 -D1_dimension: 83334, 
D2_dimension: 128, vals: 10666752 -E1_dimension: 83334, E2_dimension: 128, vals: 10666752 - - -kernel execution time: 29.3495 ms -fused time: 32.2304 - -separate execution - -kernel execution time: 23.942 ms -sddmm time: 24.54 - -kernel execution time: 14.4886 ms -sddmm time: 16.5358 - -kernel execution time: 16.8516 ms -spmm time: 20.2626 - -reference execution - -kernel execution time: 709.96 ms -taco reference time: 710.774 -/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 121192, B2_dimension: 121192, vals: 2624331 -C1_dimension: 121192, C2_dimension: 128, vals: 15512576 -D1_dimension: 121192, D2_dimension: 128, vals: 15512576 -E1_dimension: 121192, E2_dimension: 128, vals: 15512576 - - -kernel execution time: 58.2762 ms -fused time: 62.5278 - -separate execution - -kernel execution time: 42.1594 ms -sddmm time: 42.7262 - -kernel execution time: 22.1442 ms -sddmm time: 23.0064 - -kernel execution time: 25.7924 ms -spmm time: 26.3623 - -reference execution - -kernel execution time: 329.572 ms -taco reference time: 330.27 -/home/min/a/kadhitha/ispc-examples/data/suitesparse/shipsec1/shipsec1.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 140874, B2_dimension: 140874, vals: 7813404 -C1_dimension: 140874, C2_dimension: 128, vals: 18031872 -D1_dimension: 140874, D2_dimension: 128, vals: 18031872 -E1_dimension: 140874, E2_dimension: 128, vals: 18031872 - - -kernel execution time: 46.007 ms -fused time: 50.2274 - -separate execution - -kernel execution time: 41.4699 ms -sddmm time: 42.0415 - -kernel execution time: 21.559 ms -sddmm time: 22.136 - -kernel execution time: 25.525 ms -spmm time: 26.0801 - -reference execution - -kernel execution time: 869.823 ms -taco reference time: 873.823 -/home/min/a/kadhitha/ispc-examples/data/suitesparse/scircuit/scircuit.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 170998, B2_dimension: 170998, vals: 958936 -C1_dimension: 170998, 
C2_dimension: 128, vals: 21887744 -D1_dimension: 170998, D2_dimension: 128, vals: 21887744 -E1_dimension: 170998, E2_dimension: 128, vals: 21887744 - - -kernel execution time: 33.3907 ms -fused time: 37.2851 - -separate execution - -kernel execution time: 19.369 ms -sddmm time: 19.9378 - -kernel execution time: 12.956 ms -sddmm time: 15.1889 - -kernel execution time: 19.8054 ms -spmm time: 23.5126 - -reference execution - -kernel execution time: 115.104 ms -taco reference time: 115.684 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 206500, B2_dimension: 206500, vals: 1273389 -C1_dimension: 206500, C2_dimension: 128, vals: 26432000 -D1_dimension: 206500, D2_dimension: 128, vals: 26432000 -E1_dimension: 206500, E2_dimension: 128, vals: 26432000 - - -kernel execution time: 45.2869 ms -fused time: 49.074 - -separate execution - -kernel execution time: 20.8037 ms -sddmm time: 21.3769 - -kernel execution time: 18.6117 ms -sddmm time: 19.1765 - -kernel execution time: 27.6368 ms -spmm time: 28.2194 - -reference execution - -kernel execution time: 157.83 ms -taco reference time: 158.458 -/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 1000005, B2_dimension: 1000005, vals: 3105536 -C1_dimension: 1000005, C2_dimension: 128, vals: 128000640 -D1_dimension: 1000005, D2_dimension: 128, vals: 128000640 -E1_dimension: 1000005, E2_dimension: 128, vals: 128000640 - - -kernel execution time: 133.416 ms -fused time: 137.603 - -separate execution - -kernel execution time: 50.8463 ms -sddmm time: 51.4255 - -kernel execution time: 41.2442 ms -sddmm time: 41.8788 - -kernel execution time: 83.4032 ms -spmm time: 84.052 - -reference execution - -kernel execution time: 569.216 ms -taco reference time: 570.035 
-/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 5558326, B2_dimension: 5558326, vals: 59524291 -C1_dimension: 5558326, C2_dimension: 128, vals: 711465728 -D1_dimension: 5558326, D2_dimension: 128, vals: 711465728 -E1_dimension: 5558326, E2_dimension: 128, vals: 711465728 - - -kernel execution time: 1282.76 ms -fused time: 1287.59 - -separate execution - -kernel execution time: 606.985 ms -sddmm time: 607.616 - -kernel execution time: 561.224 ms -sddmm time: 561.958 - -kernel execution time: 874.527 ms -spmm time: 875.232 - -reference execution - -kernel execution time: 21707 ms -taco reference time: 21710.6 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 5, B2_dimension: 5, vals: 19 -C1_dimension: 5, C2_dimension: 128, vals: 640 -D1_dimension: 5, D2_dimension: 128, vals: 640 -E1_dimension: 5, E2_dimension: 128, vals: 640 - - -kernel execution time: 3.43602 ms -fused time: 27.8707 - -separate execution - -kernel execution time: 4107.02 ms -sddmm time: 4122.77 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 5, B2_dimension: 5, vals: 19 -C1_dimension: 5, C2_dimension: 128, vals: 640 -D1_dimension: 5, D2_dimension: 128, vals: 640 -E1_dimension: 5, E2_dimension: 128, vals: 640 - - -kernel execution time: 0.115981 ms -fused time: 0.499507 - -separate execution - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 5, B2_dimension: 
5, vals: 19 -C1_dimension: 5, C2_dimension: 128, vals: 640 -D1_dimension: 5, D2_dimension: 128, vals: 640 -E1_dimension: 5, E2_dimension: 128, vals: 640 - - -kernel execution time: 0.133052 ms -fused time: 3.69599 - -separate execution - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 2700, B2_dimension: 2700, vals: 5400 -C1_dimension: 2700, C2_dimension: 128, vals: 345600 -D1_dimension: 2700, D2_dimension: 128, vals: 345600 -E1_dimension: 2700, E2_dimension: 128, vals: 345600 - - -kernel execution time: 0.606469 ms -fused time: 4.32552 - -separate execution - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 2700, B2_dimension: 2700, vals: 5400 -C1_dimension: 2700, C2_dimension: 128, vals: 345600 -D1_dimension: 2700, D2_dimension: 128, vals: 345600 -E1_dimension: 2700, E2_dimension: 128, vals: 345600 - - -kernel execution time: 0.650529 ms -fused time: 1.40893 - -separate execution - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 2708, B2_dimension: 2708, vals: 5400 -C1_dimension: 2708, C2_dimension: 128, vals: 346624 -D1_dimension: 2708, D2_dimension: 128, vals: 346624 -E1_dimension: 2708, E2_dimension: 128, vals: 346624 - - -kernel execution time: 0.620999 ms -fused time: 1.38301 - -separate execution - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 2708, B2_dimension: 2708, vals: 5400 -C1_dimension: 2708, C2_dimension: 128, vals: 346624 -D1_dimension: 2708, D2_dimension: 128, vals: 346624 -E1_dimension: 
2708, E2_dimension: 128, vals: 346624 - - -kernel execution time: 0.652959 ms -fused time: 3.94184 - -separate execution - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 2708, B2_dimension: 2708, vals: 5429 -C1_dimension: 2708, C2_dimension: 128, vals: 346624 -D1_dimension: 2708, D2_dimension: 128, vals: 346624 -E1_dimension: 2708, E2_dimension: 128, vals: 346624 - - -kernel execution time: 0.597158 ms -fused time: 4.27836 - -separate execution - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 2708, B2_dimension: 2708, vals: 5429 -C1_dimension: 2708, C2_dimension: 128, vals: 346624 -D1_dimension: 2708, D2_dimension: 128, vals: 346624 -E1_dimension: 2708, E2_dimension: 128, vals: 346624 - - -kernel execution time: 0.659809 ms -fused time: 4.6484 - -separate execution - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 2708, B2_dimension: 2708, vals: 5429 -C1_dimension: 2708, C2_dimension: 128, vals: 346624 -D1_dimension: 2708, D2_dimension: 128, vals: 346624 -E1_dimension: 2708, E2_dimension: 128, vals: 346624 - - -kernel execution time: 0.591018 ms -fused time: 2.44084 - -separate execution - -kernel execution time: 0.607388 ms -sddmm time: 0.891202 - -kernel execution time: 0.857981 ms -sddmm time: 1.16087 - -kernel execution time: 0.922992 ms -spmm time: 1.60378 - -reference execution - -kernel execution time: 4.47191 ms -taco reference time: 5.26226 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 2708, 
B2_dimension: 2708, vals: 5429 -C1_dimension: 2708, C2_dimension: 128, vals: 346624 -D1_dimension: 2708, D2_dimension: 128, vals: 346624 -E1_dimension: 2708, E2_dimension: 128, vals: 346624 - - -kernel execution time: 0.658879 ms -fused time: 4.15402 - -separate execution - -kernel execution time: 0.70888 ms -sddmm time: 1.21343 - -kernel execution time: 0.531398 ms -sddmm time: 1.30729 - -kernel execution time: 0.965464 ms -spmm time: 2.35378 - -reference execution - -kernel execution time: 3.48771 ms -taco reference time: 7.55141 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 2708, B2_dimension: 2708, vals: 5429 -C1_dimension: 2708, C2_dimension: 128, vals: 346624 -D1_dimension: 2708, D2_dimension: 128, vals: 346624 -E1_dimension: 2708, E2_dimension: 128, vals: 346624 - - -kernel execution time: 0.616739 ms -fused time: 4.4146 - -separate execution - -kernel execution time: 0.556318 ms -sddmm time: 3.03196 - -kernel execution time: 0.945623 ms -sddmm time: 1.89019 - -kernel execution time: 0.777471 ms -spmm time: 3.57728 - -reference execution - -kernel execution time: 3.22827 ms -taco reference time: 7.39799 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 2708, B2_dimension: 2708, vals: 5429 -C1_dimension: 2708, C2_dimension: 128, vals: 346624 -D1_dimension: 2708, D2_dimension: 128, vals: 346624 -E1_dimension: 2708, E2_dimension: 128, vals: 346624 - - -kernel execution time: 0.65531 ms -fused time: 4.08374 - -separate execution - -kernel execution time: 0.666219 ms -sddmm time: 1.20641 - -kernel execution time: 0.941573 ms -sddmm time: 1.73185 - -kernel execution time: 1.01493 ms -spmm time: 1.75608 - -reference execution - -kernel execution time: 5.25507 ms -taco reference 
time: 6.04624 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 2708, B2_dimension: 2708, vals: 5429 -C1_dimension: 2708, C2_dimension: 128, vals: 346624 -D1_dimension: 2708, D2_dimension: 128, vals: 346624 -E1_dimension: 2708, E2_dimension: 128, vals: 346624 - - -kernel execution time: 0.670959 ms -fused time: 1.50328 - -separate execution - -kernel execution time: 0.600268 ms -sddmm time: 1.32833 - -kernel execution time: 0.476237 ms -sddmm time: 0.792151 - -kernel execution time: 0.781091 ms -spmm time: 1.10271 - -reference execution - -kernel execution time: 3.07623 ms -taco reference time: 3.53829 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 2708, B2_dimension: 2708, vals: 5429 -C1_dimension: 2708, C2_dimension: 128, vals: 346624 -D1_dimension: 2708, D2_dimension: 128, vals: 346624 -E1_dimension: 2708, E2_dimension: 128, vals: 346624 - - -kernel execution time: 0.760541 ms -fused time: 1.49073 - -separate execution - -kernel execution time: 0.639829 ms -sddmm time: 1.21327 - -kernel execution time: 0.576218 ms -sddmm time: 1.14083 - -kernel execution time: 0.829512 ms -spmm time: 1.33624 - -reference execution - -kernel execution time: 4.14591 ms -taco reference time: 4.82508 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 2708, B2_dimension: 2708, vals: 5429 -C1_dimension: 2708, C2_dimension: 128, vals: 346624 -D1_dimension: 2708, D2_dimension: 128, vals: 346624 -E1_dimension: 2708, E2_dimension: 128, vals: 346624 - - -kernel execution time: 0.638949 ms -fused time: 1.02277 - -separate execution - -kernel execution time: 0.945034 ms 
-sddmm time: 1.20456 - -kernel execution time: 0.6772 ms -sddmm time: 0.943263 - -kernel execution time: 0.888033 ms -spmm time: 1.133 - -reference execution - -kernel execution time: 3.82989 ms -taco reference time: 4.18452 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 2708, B2_dimension: 2708, vals: 5429 -C1_dimension: 2708, C2_dimension: 128, vals: 346624 -D1_dimension: 2708, D2_dimension: 128, vals: 346624 -E1_dimension: 2708, E2_dimension: 128, vals: 346624 - - -kernel execution time: 0.7361 ms -fused time: 1.45315 - -separate execution - -kernel execution time: 0.7335 ms -sddmm time: 1.25184 - -kernel execution time: 0.642509 ms -sddmm time: 1.16064 - -kernel execution time: 1.02361 ms -spmm time: 1.48614 - -reference execution - -kernel execution time: 4.12035 ms -taco reference time: 4.75857 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/amazon.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 334863, B2_dimension: 334863, vals: 777323 -C1_dimension: 334863, C2_dimension: 128, vals: 42862464 -D1_dimension: 334863, D2_dimension: 128, vals: 42862464 -E1_dimension: 334863, E2_dimension: 128, vals: 42862464 - - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/amazon.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 548551, B2_dimension: 548551, vals: 925872 -C1_dimension: 548551, C2_dimension: 128, vals: 70214528 -D1_dimension: 548551, D2_dimension: 128, vals: 70214528 -E1_dimension: 548551, E2_dimension: 128, vals: 70214528 - - -kernel execution time: 66.4595 ms -fused time: 66.9196 - -separate execution - -kernel execution time: 22.9317 ms -sddmm time: 23.4738 - -kernel execution time: 22.4453 ms -sddmm time: 23.0045 - -kernel execution time: 
44.2796 ms -spmm time: 44.8052 - -reference execution - -kernel execution time: 187.6 ms -taco reference time: 188.247 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/amazon.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 548551, B2_dimension: 548551, vals: 1851744 -C1_dimension: 548551, C2_dimension: 128, vals: 70214528 -D1_dimension: 548551, D2_dimension: 128, vals: 70214528 -E1_dimension: 548551, E2_dimension: 128, vals: 70214528 - - -kernel execution time: 103.551 ms -fused time: 104.018 - -separate execution - -kernel execution time: 39.9535 ms -sddmm time: 40.5639 - -kernel execution time: 39.2683 ms -sddmm time: 39.8581 - -kernel execution time: 65.8336 ms -spmm time: 66.417 - -reference execution - -kernel execution time: 306.901 ms -taco reference time: 307.61 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/amazon.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 548551, B2_dimension: 548551, vals: 1851744 -C1_dimension: 548551, C2_dimension: 128, vals: 70214528 -D1_dimension: 548551, D2_dimension: 128, vals: 70214528 -E1_dimension: 548551, E2_dimension: 128, vals: 70214528 - - -kernel execution time: 106.782 ms -fused time: 107.261 - -separate execution - -kernel execution time: 40.7961 ms -sddmm time: 41.3604 - -kernel execution time: 39.8676 ms -sddmm time: 40.4959 - -kernel execution time: 66.2656 ms -spmm time: 66.8105 - -reference execution - -kernel execution time: 367.416 ms -taco reference time: 368.086 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/amazon.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 548551, B2_dimension: 548551, vals: 1851744 -C1_dimension: 548551, C2_dimension: 128, vals: 70214528 -D1_dimension: 548551, D2_dimension: 128, vals: 70214528 -E1_dimension: 548551, 
E2_dimension: 128, vals: 70214528 - - -kernel execution time: 108.809 ms -fused time: 109.274 - -separate execution - -kernel execution time: 42.2311 ms -sddmm time: 42.826 - -kernel execution time: 41.711 ms -sddmm time: 42.3721 - -kernel execution time: 65.9512 ms -spmm time: 66.5647 - -reference execution - -kernel execution time: 360.581 ms -taco reference time: 361.225 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/amazon.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 548551, B2_dimension: 548551, vals: 1851744 -C1_dimension: 548551, C2_dimension: 128, vals: 70214528 -D1_dimension: 548551, D2_dimension: 128, vals: 70214528 -E1_dimension: 548551, E2_dimension: 128, vals: 70214528 - - -kernel execution time: 922.149 ms -fused time: 922.605 - -separate execution - -kernel execution time: 392.18 ms -sddmm time: 392.716 - -kernel execution time: 393.251 ms -sddmm time: 393.777 - -kernel execution time: 520.496 ms -spmm time: 521.007 - -reference execution - -kernel execution time: 9912.29 ms -taco reference time: 9913.37 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 2708, B2_dimension: 2708, vals: 5429 -C1_dimension: 2708, C2_dimension: 128, vals: 346624 -D1_dimension: 2708, D2_dimension: 128, vals: 346624 -E1_dimension: 2708, E2_dimension: 128, vals: 346624 - - -kernel execution time: 2.15935 ms -fused time: 2.88765 - -separate execution - -kernel execution time: 1.09729 ms -sddmm time: 1.64867 - -kernel execution time: 0.987463 ms -sddmm time: 1.50853 - -kernel execution time: 2.22996 ms -spmm time: 2.71273 - -reference execution - -kernel execution time: 29.4617 ms -taco reference time: 29.8511 - -sddmm-spmm execution - ------------------------------------------ 
-/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 2708, B2_dimension: 2708, vals: 5429 -C1_dimension: 2708, C2_dimension: 128, vals: 346624 -D1_dimension: 2708, D2_dimension: 128, vals: 346624 -E1_dimension: 2708, E2_dimension: 128, vals: 346624 - - -kernel execution time: 0.667108 ms -fused time: 1.05163 - -separate execution - -kernel execution time: 0.680159 ms -sddmm time: 0.994963 - -kernel execution time: 0.611478 ms -sddmm time: 1.1057 - -kernel execution time: 0.988313 ms -spmm time: 1.4939 - -reference execution - -kernel execution time: 3.64386 ms -taco reference time: 4.33446 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 2708, B2_dimension: 2708, vals: 5429 -C1_dimension: 2708, C2_dimension: 128, vals: 346624 -D1_dimension: 2708, D2_dimension: 128, vals: 346624 -E1_dimension: 2708, E2_dimension: 128, vals: 346624 - - -kernel execution time: 0.691709 ms -fused time: 1.07767 - -separate execution - -kernel execution time: 0.516997 ms -sddmm time: 0.77957 - -kernel execution time: 0.458366 ms -sddmm time: 0.73026 - -kernel execution time: 0.777811 ms -spmm time: 1.01678 - -reference execution - -kernel execution time: 3.47463 ms -taco reference time: 3.82426 -/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/amazon.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 548551, B2_dimension: 548551, vals: 1851744 -C1_dimension: 548551, C2_dimension: 128, vals: 70214528 -D1_dimension: 548551, D2_dimension: 128, vals: 70214528 -E1_dimension: 548551, E2_dimension: 128, vals: 70214528 - - -kernel execution time: 104.681 ms -fused time: 105.128 - -separate execution - -kernel execution time: 39.5478 ms -sddmm time: 40.1164 - -kernel execution time: 40.2068 ms -sddmm time: 40.7802 - -kernel execution time: 67.2769 ms -spmm time: 67.8666 
- -reference execution - -kernel execution time: 378.806 ms -taco reference time: 379.526 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 2708, B2_dimension: 2708, vals: 5429 -C1_dimension: 2708, C2_dimension: 128, vals: 346624 -D1_dimension: 2708, D2_dimension: 128, vals: 346624 -E1_dimension: 2708, E2_dimension: 128, vals: 346624 - - -kernel execution time: 2.0421 ms -fused time: 2.77318 - -separate execution - -kernel execution time: 0.890922 ms -sddmm time: 1.4406 - -kernel execution time: 0.673509 ms -sddmm time: 0.955103 - -kernel execution time: 1.93153 ms -spmm time: 2.18341 - -reference execution - -kernel execution time: 33.2851 ms -taco reference time: 33.6343 -/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/amazon.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 548551, B2_dimension: 548551, vals: 1851744 -C1_dimension: 548551, C2_dimension: 128, vals: 70214528 -D1_dimension: 548551, D2_dimension: 128, vals: 70214528 -E1_dimension: 548551, E2_dimension: 128, vals: 70214528 - - -kernel execution time: 913.728 ms -fused time: 914.178 - -separate execution - -kernel execution time: 389.744 ms -sddmm time: 390.317 - -kernel execution time: 389.105 ms -sddmm time: 389.68 - -kernel execution time: 520.43 ms -spmm time: 520.979 - -reference execution - -kernel execution time: 9970.19 ms -taco reference time: 9971.18 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 2708, B2_dimension: 2708, vals: 5429 -C1_dimension: 2708, C2_dimension: 128, vals: 346624 -D1_dimension: 2708, D2_dimension: 128, vals: 346624 -E1_dimension: 2708, E2_dimension: 128, vals: 346624 - - -kernel execution time: 1.81249 ms -fused time: 2.53831 - -separate execution - -kernel execution 
time: 1.41327 ms -sddmm time: 1.9866 - -kernel execution time: 0.687839 ms -sddmm time: 0.957583 - -kernel execution time: 1.99132 ms -spmm time: 2.2301 - -reference execution - -kernel execution time: 33.8389 ms -taco reference time: 34.1855 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 2708, B2_dimension: 2708, vals: 5429 -C1_dimension: 2708, C2_dimension: 128, vals: 346624 -D1_dimension: 2708, D2_dimension: 128, vals: 346624 -E1_dimension: 2708, E2_dimension: 128, vals: 346624 - - -kernel execution time: 2.08639 ms -fused time: 2.81403 - -separate execution - -kernel execution time: 0.75901 ms -sddmm time: 1.27309 - -kernel execution time: 0.72208 ms -sddmm time: 1.00494 - -kernel execution time: 1.95748 ms -spmm time: 2.20503 - -reference execution - -kernel execution time: 33.4827 ms -taco reference time: 33.8347 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 2708, B2_dimension: 2708, vals: 5429 -C1_dimension: 2708, C2_dimension: 128, vals: 346624 -D1_dimension: 2708, D2_dimension: 128, vals: 346624 -E1_dimension: 2708, E2_dimension: 128, vals: 346624 - - -kernel execution time: 2.09414 ms -fused time: 2.82691 - -separate execution - -kernel execution time: 1.03623 ms -sddmm time: 1.58316 - -kernel execution time: 0.653819 ms -sddmm time: 0.926463 - -kernel execution time: 1.88145 ms -spmm time: 2.12517 - -reference execution - -kernel execution time: 33.3395 ms -taco reference time: 33.6915 - -sddmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k)*F(j,l); -B1_dimension: 2708, B2_dimension: 2708, vals: 5429 -C1_dimension: 2708, C2_dimension: 128, vals: 346624 
-D1_dimension: 2708, D2_dimension: 128, vals: 346624 -E1_dimension: 2708, E2_dimension: 128, vals: 346624 - - -kernel execution time: 1.70968 ms -fused time: 2.43176 - -separate execution - -kernel execution time: 0.76455 ms -sddmm time: 1.31209 - -kernel execution time: 0.664099 ms -sddmm time: 0.932353 - -kernel execution time: 1.92536 ms -spmm time: 2.17072 - -reference execution - -kernel execution time: 32.5601 ms -taco reference time: 32.9017 diff --git a/test/stats/spmm-spmm.txt b/test/stats/spmm-spmm.txt deleted file mode 100644 index 329aacd65..000000000 --- a/test/stats/spmm-spmm.txt +++ /dev/null @@ -1,3604 +0,0 @@ - -spmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 121192, B2_dimension: 121192, vals: 2624331 -C1_dimension: 121192, C2_dimension: 121192, vals: 2624331 -D1_dimension: 121192, D2_dimension: 64, vals: 7756288 - - -spmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 121192, B2_dimension: 121192, vals: 2624331 -C1_dimension: 121192, C2_dimension: 121192, vals: 2624331 -D1_dimension: 121192, D2_dimension: 64, vals: 7756288 - - -spmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 121192, B2_dimension: 121192, vals: 2624331 -C1_dimension: 121192, C2_dimension: 121192, vals: 2624331 -D1_dimension: 121192, D2_dimension: 64, vals: 7756288 - - -spmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 121192, B2_dimension: 121192, vals: 2624331 -C1_dimension: 121192, C2_dimension: 64, vals: 7756288 -D1_dimension: 64, 
D2_dimension: 64, vals: 4096 - - -kernel execution time: 303.084 ms -fused time: 303.842 - -kernel execution time: 8140.55 ms -taco reference time: 8141.59 - -spmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 121192, B2_dimension: 121192, vals: 2624331 -C1_dimension: 121192, C2_dimension: 64, vals: 7756288 -D1_dimension: 64, D2_dimension: 64, vals: 4096 - - -kernel execution time: 269.44 ms -fused time: 270.181 - -kernel execution time: 1612.62 ms -taco reference time: 1613.21 - -spmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 121192, B2_dimension: 121192, vals: 2624331 -C1_dimension: 121192, C2_dimension: 121192, vals: 2624331 -D1_dimension: 121192, D2_dimension: 64, vals: 7756288 - - -spmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 121192, B2_dimension: 121192, vals: 2624331 -C1_dimension: 121192, C2_dimension: 121192, vals: 2624331 -D1_dimension: 121192, D2_dimension: 64, vals: 7756288 - - -spmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 121192, B2_dimension: 121192, vals: 2624331 -C1_dimension: 121192, C2_dimension: 121192, vals: 2624331 -D1_dimension: 121192, D2_dimension: 64, vals: 7756288 - - -spmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 121192, B2_dimension: 121192, vals: 2624331 -C1_dimension: 121192, C2_dimension: 121192, vals: 2624331 -D1_dimension: 121192, D2_dimension: 
64, vals: 7756288 - - -spmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 5, B2_dimension: 5, vals: 19 -C1_dimension: 5, C2_dimension: 5, vals: 19 -D1_dimension: 5, D2_dimension: 64, vals: 320 - - -kernel execution time: 0.125431 ms -fused time: 0.815671 - -kernel execution time: 0.03254 ms -taco reference time: 0.828291 - -spmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 10974, B2_dimension: 10974, vals: 428650 -C1_dimension: 10974, C2_dimension: 10974, vals: 428650 -D1_dimension: 10974, D2_dimension: 64, vals: 702336 - - -spmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 10974, B2_dimension: 10974, vals: 428650 -C1_dimension: 10974, C2_dimension: 10974, vals: 428650 -D1_dimension: 10974, D2_dimension: 8, vals: 87792 - - -kernel execution time: 783.639 ms -fused time: 784.413 - -kernel execution time: 25.6025 ms -taco reference time: 25.9422 - -spmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 10974, B2_dimension: 10974, vals: 428650 -C1_dimension: 10974, C2_dimension: 10974, vals: 428650 -D1_dimension: 10974, D2_dimension: 8, vals: 87792 - - -kernel execution time: 3538.49 ms -fused time: 3539.6 - -kernel execution time: 544.057 ms -taco reference time: 544.496 - -spmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 10974, B2_dimension: 10974, vals: 428650 -C1_dimension: 10974, C2_dimension: 10974, 
vals: 428650 -D1_dimension: 10974, D2_dimension: 8, vals: 87792 - - -kernel execution time: 3451.46 ms -fused time: 3452.59 - -kernel execution time: 540.889 ms -taco reference time: 541.34 - -spmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 10974, B2_dimension: 10974, vals: 428650 -C1_dimension: 10974, C2_dimension: 8, vals: 87792 -D1_dimension: 8, D2_dimension: 8, vals: 64 - - -kernel execution time: 23.9997 ms -fused time: 24.715 - -kernel execution time: 116.717 ms -taco reference time: 117.038 - -spmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 10974, B2_dimension: 10974, vals: 428650 -C1_dimension: 10974, C2_dimension: 8, vals: 87792 -D1_dimension: 8, D2_dimension: 8, vals: 64 - - -kernel execution time: 2.19466 ms -fused time: 2.91615 - -kernel execution time: 9.4728 ms -taco reference time: 10.0292 - -spmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 121192, B2_dimension: 121192, vals: 2624331 -C1_dimension: 121192, C2_dimension: 8, vals: 969536 -D1_dimension: 8, D2_dimension: 8, vals: 64 - - -kernel execution time: 30.5327 ms -fused time: 31.2749 - -kernel execution time: 35.9838 ms -taco reference time: 36.52 - -spmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 1000005, B2_dimension: 1000005, vals: 3105536 -C1_dimension: 1000005, C2_dimension: 64, vals: 64000320 -D1_dimension: 64, D2_dimension: 64, vals: 4096 - - -kernel execution time: 1803.51 ms -fused time: 1804.27 - -kernel execution time: 1976.12 ms -taco reference 
time: 1976.69 - -spmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 206500, B2_dimension: 206500, vals: 1273389 -C1_dimension: 206500, C2_dimension: 64, vals: 13216000 -D1_dimension: 64, D2_dimension: 128, vals: 8192 - - -kernel execution time: 484.907 ms -fused time: 485.835 - -kernel execution time: 1567.31 ms -taco reference time: 1567.89 - -spmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 1000005, B2_dimension: 1000005, vals: 3105536 -C1_dimension: 1000005, C2_dimension: 64, vals: 64000320 -D1_dimension: 64, D2_dimension: 128, vals: 8192 - - -kernel execution time: 2301.83 ms -fused time: 2302.58 - -kernel execution time: 3904.01 ms -taco reference time: 3905 - -spmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 10974, B2_dimension: 10974, vals: 428650 -C1_dimension: 10974, C2_dimension: 64, vals: 702336 -D1_dimension: 64, D2_dimension: 128, vals: 8192 - - -spmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 10974, B2_dimension: 10974, vals: 428650 -C1_dimension: 10974, C2_dimension: 64, vals: 702336 -D1_dimension: 64, D2_dimension: 128, vals: 8192 - - -spmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 10974, B2_dimension: 10974, vals: 428650 -C1_dimension: 10974, C2_dimension: 64, vals: 702336 -D1_dimension: 64, D2_dimension: 128, vals: 8192 - - -spmm-spmm execution - 
------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 10974, B2_dimension: 10974, vals: 428650 -C1_dimension: 10974, C2_dimension: 64, vals: 702336 -D1_dimension: 64, D2_dimension: 128, vals: 8192 - - -kernel execution time: 11.7415 ms -fused time: 12.4648 - -kernel execution time: 155.192 ms -taco reference time: 155.893 - -spmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 10974, B2_dimension: 10974, vals: 428650 -C1_dimension: 10974, C2_dimension: 64, vals: 702336 -D1_dimension: 64, D2_dimension: 128, vals: 8192 - - -kernel execution time: 6.56465 ms -fused time: 7.31046 - -kernel execution time: 1.17042 ms -sddmm time: 1.68226 - -kernel execution time: 5.08948 ms -spmm time: 5.36855 - -kernel execution time: 124.176 ms -taco reference time: 124.551 - -spmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 -C1_dimension: 36417, C2_dimension: 64, vals: 2330688 -D1_dimension: 64, D2_dimension: 128, vals: 8192 - - -kernel execution time: 25.3076 ms -fused time: 25.7407 - -kernel execution time: 14.1922 ms -sddmm time: 14.7097 - -kernel execution time: 16.8223 ms -spmm time: 17.3081 - -kernel execution time: 1299.07 ms -taco reference time: 1299.47 - -spmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/rma10/rma10.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 46835, B2_dimension: 46835, vals: 2374001 -C1_dimension: 46835, C2_dimension: 64, vals: 2997440 -D1_dimension: 64, D2_dimension: 128, vals: 8192 - - -kernel execution time: 27.1044 ms -fused time: 27.5788 - -kernel execution time: 
9.05436 ms -sddmm time: 9.61561 - -kernel execution time: 21.401 ms -spmm time: 21.9403 - -kernel execution time: 695.617 ms -taco reference time: 696.166 - -spmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/cant/cant.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 62451, B2_dimension: 62451, vals: 4007383 -C1_dimension: 62451, C2_dimension: 64, vals: 3996864 -D1_dimension: 64, D2_dimension: 128, vals: 8192 - - -kernel execution time: 33.1726 ms -fused time: 33.5921 - -kernel execution time: 14.8585 ms -sddmm time: 15.3574 - -kernel execution time: 28.8622 ms -spmm time: 29.3477 - -kernel execution time: 1179.24 ms -taco reference time: 1179.66 - -spmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/consph/consph.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 83334, B2_dimension: 83334, vals: 6010480 -C1_dimension: 83334, C2_dimension: 64, vals: 5333376 -D1_dimension: 64, D2_dimension: 128, vals: 8192 - - -kernel execution time: 50.933 ms -fused time: 51.3664 - -kernel execution time: 22.1051 ms -sddmm time: 22.6231 - -kernel execution time: 37.9487 ms -spmm time: 38.4594 - -kernel execution time: 1793.69 ms -taco reference time: 1794.18 - -spmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 121192, B2_dimension: 121192, vals: 2624331 -C1_dimension: 121192, C2_dimension: 64, vals: 7756288 -D1_dimension: 64, D2_dimension: 128, vals: 8192 - - -kernel execution time: 77.6403 ms -fused time: 78.0713 - -kernel execution time: 19.9996 ms -sddmm time: 20.5235 - -kernel execution time: 55.1072 ms -spmm time: 55.6382 - -kernel execution time: 757.71 ms -taco reference time: 758.251 - -spmm-spmm execution - ------------------------------------------ 
-/home/min/a/kadhitha/ispc-examples/data/suitesparse/shipsec1/shipsec1.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 140874, B2_dimension: 140874, vals: 7813404 -C1_dimension: 140874, C2_dimension: 64, vals: 9015936 -D1_dimension: 64, D2_dimension: 128, vals: 8192 - - -kernel execution time: 74.448 ms -fused time: 74.8977 - -kernel execution time: 28.5447 ms -sddmm time: 29.0628 - -kernel execution time: 64.5939 ms -spmm time: 65.3752 - -kernel execution time: 2277.84 ms -taco reference time: 2278.26 - -spmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/scircuit/scircuit.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 170998, B2_dimension: 170998, vals: 958936 -C1_dimension: 170998, C2_dimension: 64, vals: 10943872 -D1_dimension: 64, D2_dimension: 128, vals: 8192 - - -kernel execution time: 103.993 ms -fused time: 104.417 - -kernel execution time: 13.9953 ms -sddmm time: 14.4722 - -kernel execution time: 77.1505 ms -spmm time: 77.6507 - -kernel execution time: 277.888 ms -taco reference time: 278.424 - -spmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 206500, B2_dimension: 206500, vals: 1273389 -C1_dimension: 206500, C2_dimension: 64, vals: 13216000 -D1_dimension: 64, D2_dimension: 128, vals: 8192 - - -kernel execution time: 122.094 ms -fused time: 122.526 - -kernel execution time: 16.3934 ms -sddmm time: 16.9174 - -kernel execution time: 93.4293 ms -spmm time: 93.9709 - -kernel execution time: 368.185 ms -taco reference time: 368.744 - -spmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 1000005, B2_dimension: 1000005, vals: 3105536 -C1_dimension: 1000005, C2_dimension: 64, vals: 64000320 -D1_dimension: 64, 
D2_dimension: 128, vals: 8192 - - -kernel execution time: 594.481 ms -fused time: 594.903 - -kernel execution time: 68.7062 ms -sddmm time: 69.19 - -kernel execution time: 456.966 ms -spmm time: 457.476 - -kernel execution time: 939.672 ms -taco reference time: 940.234 - -spmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 5558326, B2_dimension: 5558326, vals: 59524291 -C1_dimension: 5558326, C2_dimension: 64, vals: 355732864 -D1_dimension: 64, D2_dimension: 128, vals: 8192 - - -kernel execution time: 3572.47 ms -fused time: 3573.32 - -kernel execution time: 1088.24 ms -sddmm time: 1088.74 - -kernel execution time: 2533.08 ms -spmm time: 2533.64 - -kernel execution time: 19935.1 ms -taco reference time: 19936.1 - -spmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx - -spmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 10974, B2_dimension: 10974, vals: 428650 -C1_dimension: 10974, C2_dimension: 128, vals: 1404672 -D1_dimension: 128, D2_dimension: 64, vals: 8192 - - -kernel execution time: 28.4416 ms -fused time: 28.8482 - -kernel execution time: 58.9151 ms -sddmm time: 59.3822 - -kernel execution time: 85.1524 ms -spmm time: 85.6136 - -kernel execution time: 3443.24 ms -taco reference time: 3444.27 - -spmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 10974, B2_dimension: 10974, vals: 428650 -C1_dimension: 10974, C2_dimension: 128, vals: 1404672 -D1_dimension: 128, D2_dimension: 64, vals: 8192 - - -kernel execution time: 28.4398 ms -fused time: 28.9133 - -kernel execution 
time: 59.5781 ms -SpMM time: 60.0552 - -kernel execution time: 85.038 ms -GeMM time: 85.49 - -kernel execution time: 83.589 ms -Optimized GeMM time: 83.939 - -kernel execution time: 3425.66 ms -taco reference time: 3426.56 - -spmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 10974, B2_dimension: 10974, vals: 428650 -C1_dimension: 10974, C2_dimension: 128, vals: 1404672 -D1_dimension: 128, D2_dimension: 64, vals: 8192 - - -kernel execution time: 28.1949 ms -fused time: 28.6047 - -kernel execution time: 58.8056 ms -SpMM time: 59.2739 - -kernel execution time: 85.098 ms -GeMM time: 85.5677 - -spmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 10974, B2_dimension: 10974, vals: 428650 -C1_dimension: 10974, C2_dimension: 128, vals: 1404672 -D1_dimension: 128, D2_dimension: 64, vals: 8192 - - -spmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 10974, B2_dimension: 10974, vals: 428650 -C1_dimension: 10974, C2_dimension: 128, vals: 1404672 -D1_dimension: 128, D2_dimension: 64, vals: 8192 - - -kernel execution time: 34.4562 ms -fused time: 35.1247 - -kernel execution time: 57.8421 ms -SpMM time: 58.3206 - -kernel execution time: 84.8243 ms -GeMM time: 85.2948 - -kernel execution time: 84.2094 ms -Optimized GeMM template time: 84.5715 - -kernel execution time: 3423.26 ms -taco reference time: 3424.18 - -spmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 10974, B2_dimension: 10974, vals: 428650 -C1_dimension: 10974, C2_dimension: 128, vals: 
1404672 -D1_dimension: 128, D2_dimension: 64, vals: 8192 - - -kernel execution time: 34.1982 ms -fused time: 34.9007 - -kernel execution time: 58.2208 ms -SpMM time: 58.708 - -kernel execution time: 85.2639 ms -GeMM time: 85.7329 - -kernel execution time: 84.6708 ms -Optimized GeMM template time: 85.0447 - -kernel execution time: 3448.38 ms -taco reference time: 3449.25 - -spmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 10974, B2_dimension: 10974, vals: 428650 -C1_dimension: 10974, C2_dimension: 128, vals: 1404672 -D1_dimension: 128, D2_dimension: 64, vals: 8192 - - -kernel execution time: 3.98391 ms -fused time: 4.78728 - -kernel execution time: 3.85974 ms -SpMM time: 4.41484 - -kernel execution time: 5.20996 ms -GeMM time: 5.78292 - -kernel execution time: 85.5005 ms -Optimized GeMM template time: 85.8224 - -kernel execution time: 68.5977 ms -taco reference time: 69.0953 - -spmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 10974, B2_dimension: 10974, vals: 428650 -C1_dimension: 10974, C2_dimension: 128, vals: 1404672 -D1_dimension: 128, D2_dimension: 64, vals: 8192 - - -kernel execution time: 35.477 ms -fused time: 36.1715 - -kernel execution time: 57.2092 ms -SpMM time: 57.6862 - -kernel execution time: 84.9251 ms -GeMM time: 85.3862 - -kernel execution time: 84.8529 ms -Optimized GeMM template time: 85.2333 - -kernel execution time: 3425.71 ms -taco reference time: 3426.59 - -spmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 10974, B2_dimension: 10974, vals: 428650 -C1_dimension: 10974, C2_dimension: 128, vals: 1404672 -D1_dimension: 128, D2_dimension: 64, 
vals: 8192 - - -kernel execution time: 35.2755 ms -fused time: 35.9965 - -kernel execution time: 57.3952 ms -SpMM time: 57.8851 - -kernel execution time: 85.2686 ms -GeMM time: 85.7356 - -kernel execution time: 84.5744 ms -Optimized GeMM template time: 84.9512 - -kernel execution time: 3429.7 ms -taco reference time: 3430.52 - -spmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 10974, B2_dimension: 10974, vals: 428650 -C1_dimension: 10974, C2_dimension: 128, vals: 1404672 -D1_dimension: 128, D2_dimension: 64, vals: 8192 - - -kernel execution time: 3.98364 ms -fused time: 4.61817 - -kernel execution time: 3.85737 ms -SpMM time: 4.28322 - -kernel execution time: 5.15902 ms -GeMM time: 5.6055 - -kernel execution time: 87.1601 ms -Optimized GeMM template time: 87.4622 - -kernel execution time: 69.0316 ms -taco reference time: 69.4576 - -spmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 10974, B2_dimension: 10974, vals: 428650 -C1_dimension: 10974, C2_dimension: 128, vals: 1404672 -D1_dimension: 128, D2_dimension: 64, vals: 8192 - - -spmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 10974, B2_dimension: 10974, vals: 428650 -C1_dimension: 10974, C2_dimension: 128, vals: 1404672 -D1_dimension: 128, D2_dimension: 64, vals: 8192 - - -kernel execution time: 4.62195 ms -fused time: 5.02884 - -kernel execution time: 4.03094 ms -SpMM time: 4.41592 - -kernel execution time: 5.10184 ms -GeMM time: 5.44766 - -kernel execution time: 83.6233 ms -Optimized GeMM template time: 83.895 - -kernel execution time: 5.3188 ms -Optimized GeMM template time: 5.65673 - -kernel execution time: 
69.2656 ms -taco reference time: 69.6404 - -spmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 10974, B2_dimension: 10974, vals: 428650 -C1_dimension: 10974, C2_dimension: 128, vals: 1404672 -D1_dimension: 128, D2_dimension: 64, vals: 8192 - - -kernel execution time: 4.03732 ms -fused time: 4.69314 - -kernel execution time: 3.72378 ms -SpMM time: 4.02627 - -kernel execution time: 2.04995 ms -GeMM time: 2.33804 - -kernel execution time: 2.25997 ms -Optimized GeMM template time: 2.50901 - -kernel execution time: 5.18509 ms -Optimized GeMM template time: 5.46269 - -kernel execution time: 68.4415 ms -taco reference time: 68.78 - -spmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 10974, B2_dimension: 10974, vals: 428650 -C1_dimension: 10974, C2_dimension: 128, vals: 1404672 -D1_dimension: 128, D2_dimension: 64, vals: 8192 - - -kernel execution time: 3.95981 ms -fused time: 4.3754 - -kernel execution time: 3.78475 ms -SpMM time: 4.19686 - -kernel execution time: 2.00709 ms -GeMM time: 2.38028 - -spmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 10974, B2_dimension: 10974, vals: 428650 -C1_dimension: 10974, C2_dimension: 128, vals: 1404672 -D1_dimension: 128, D2_dimension: 64, vals: 8192 - - -kernel execution time: 4.05057 ms -fused time: 4.40773 - -kernel execution time: 3.75306 ms -SpMM time: 4.08598 - -kernel execution time: 2.05899 ms -GeMM time: 2.36596 - -kernel execution time: 2.12928 ms -Optimized GeMM template time: 2.36493 - -kernel execution time: 5.14712 ms -Optimized GeMM template time: 5.41248 - -kernel execution time: 68.075 ms -taco reference time: 
68.3835 - -spmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 10974, B2_dimension: 10974, vals: 428650 -C1_dimension: 10974, C2_dimension: 128, vals: 1404672 -D1_dimension: 128, D2_dimension: 64, vals: 8192 - - -kernel execution time: 3.88934 ms -fused time: 4.25328 - -kernel execution time: 3.82407 ms -SpMM time: 4.19446 - -spmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 10974, B2_dimension: 10974, vals: 428650 -C1_dimension: 10974, C2_dimension: 128, vals: 1404672 -D1_dimension: 128, D2_dimension: 64, vals: 8192 - - -kernel execution time: 4.28741 ms -fused time: 4.98944 - -kernel execution time: 3.79765 ms -SpMM time: 4.16417 - -kernel execution time: 1.4265 ms -SpMM template time: 1.74127 - -kernel execution time: 2.10898 ms -GeMM time: 2.39285 - -kernel execution time: 2.34628 ms -Optimized GeMM template time: 2.61728 - -kernel execution time: 5.31869 ms -Optimized GeMM template time: 5.60267 - -kernel execution time: 69.5098 ms -taco reference time: 69.8708 - -spmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 10974, B2_dimension: 10974, vals: 428650 -C1_dimension: 10974, C2_dimension: 128, vals: 1404672 -D1_dimension: 128, D2_dimension: 64, vals: 8192 - - -kernel execution time: 4.01588 ms -fused time: 4.65051 - -kernel execution time: 3.86258 ms -SpMM time: 4.2125 - -kernel execution time: 1.43425 ms -SpMM template time: 1.72825 - -kernel execution time: 2.09177 ms -GeMM time: 2.35741 - -kernel execution time: 2.03779 ms -GeMM time: 2.26668 - -kernel execution time: 2.18152 ms -Optimized GeMM template time: 2.45788 - -kernel execution time: 0.974804 ms 
-Optimized GeMM template time: 1.25462 - -kernel execution time: 67.9024 ms -taco reference time: 68.2452 - -spmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 10974, B2_dimension: 10974, vals: 428650 -C1_dimension: 10974, C2_dimension: 128, vals: 1404672 -D1_dimension: 128, D2_dimension: 64, vals: 8192 - - -kernel execution time: 4.0224 ms -fused time: 4.44033 - -kernel execution time: 3.84077 ms -SpMM time: 4.2196 - -kernel execution time: 1.57684 ms -SpMM template time: 1.93604 - -kernel execution time: 2.00289 ms -GeMM time: 2.38135 - -kernel execution time: 1.93219 ms -ref 2 GeMM time: 2.16952 - -kernel execution time: 1.9562 ms -ref3 GeMM template time: 2.22014 - -kernel execution time: 1.02843 ms -SpMM template time: 1.3134 - -kernel execution time: 68.6937 ms -taco reference time: 69.0531 - -spmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 10974, B2_dimension: 10974, vals: 428650 -C1_dimension: 10974, C2_dimension: 128, vals: 1404672 -D1_dimension: 128, D2_dimension: 64, vals: 8192 - - -spmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 10974, B2_dimension: 10974, vals: 428650 -C1_dimension: 10974, C2_dimension: 128, vals: 1404672 -D1_dimension: 128, D2_dimension: 64, vals: 8192 - - -kernel execution time: 4.70723 ms -fused time: 5.10663 - -kernel execution time: 3.86475 ms -SpMM time: 4.22896 - -kernel execution time: 1.5696 ms -SpMM template time: 1.91027 - -kernel execution time: 2.06463 ms -GeMM time: 2.35063 - -kernel execution time: 1.93837 ms -ref 2 GeMM time: 2.18475 - -kernel execution time: 1.93808 ms -ref3 GeMM template time: 2.21134 - -kernel 
execution time: 1.00393 ms -SpMM template time: 1.28759 - -kernel execution time: 65.6539 ms -taco reference time: 66.0123 - -spmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 10974, B2_dimension: 10974, vals: 428650 -C1_dimension: 10974, C2_dimension: 128, vals: 1404672 -D1_dimension: 128, D2_dimension: 64, vals: 8192 - - -kernel execution time: 4.41073 ms -fused time: 4.81175 - -kernel execution time: 3.96438 ms -SpMM time: 4.33792 - -kernel execution time: 1.48077 ms -SpMM template time: 1.84634 - -kernel execution time: 2.06276 ms -GeMM time: 2.52122 - -kernel execution time: 2.4643 ms -ref 2 GeMM template time: 3.77443 - -kernel execution time: 2.21292 ms -ref3 GeMM template time: 2.48374 - -kernel execution time: 1.02386 ms -SpMM template time ref4: 5.63941 - -kernel execution time: 73.0137 ms -taco reference time: 73.4188 - -spmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 10974, B2_dimension: 10974, vals: 428650 -C1_dimension: 10974, C2_dimension: 128, vals: 1404672 -D1_dimension: 128, D2_dimension: 64, vals: 8192 - - -kernel execution time: 2.81946 ms -fused time: 3.44515 - -kernel execution time: 3.93379 ms -SpMM time: 4.19505 - -kernel execution time: 1.46537 ms -SpMM template time: 1.77106 - -kernel execution time: 2.48839 ms -GeMM time: 2.75159 - -kernel execution time: 2.57119 ms -ref 2 GeMM template time: 2.83288 - -kernel execution time: 2.19579 ms -ref3 GeMM template time: 2.44668 - -kernel execution time: 1.08977 ms -SpMM template time ref4: 1.3527 - -kernel execution time: 72.5212 ms -taco reference time: 72.8405 - -spmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx 
-ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 10974, B2_dimension: 10974, vals: 428650 -C1_dimension: 10974, C2_dimension: 128, vals: 1404672 -D1_dimension: 128, D2_dimension: 64, vals: 8192 - - -kernel execution time: 2.34088 ms -fused time: 2.99398 - -kernel execution time: 3.80606 ms -SpMM time: 4.36154 - -kernel execution time: 1.58906 ms -SpMM template time: 1.95568 - -kernel execution time: 2.25455 ms -GeMM time: 2.5356 - -kernel execution time: 2.3975 ms -ref 2 GeMM template time: 2.66963 - -kernel execution time: 2.10202 ms -ref3 GeMM template time: 2.40392 - -kernel execution time: 1.02333 ms -SpMM template time ref4: 1.30975 - -kernel execution time: 72.6994 ms -taco reference time: 73.0145 - - - - - - ---------------------------------------------------------------------------------------------------------------- ---------------------------------------------------------------------------------------------------------------- ---------------------------------------------------------------------------------------------------------------- - - -with 64 threads - - - - - -spmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 10974, B2_dimension: 10974, vals: 428650 -C1_dimension: 10974, C2_dimension: 128, vals: 1404672 -D1_dimension: 128, D2_dimension: 64, vals: 8192 - - -kernel execution time: 2.36795 ms -fused time: 2.78304 - -kernel execution time: 3.8721 ms -SpMM time: 4.20057 - -kernel execution time: 1.52637 ms -SpMM template time: 1.85784 - -kernel execution time: 2.03318 ms -GeMM time: 2.31935 - -kernel execution time: 2.39998 ms -ref 2 GeMM template time: 2.68836 - -kernel execution time: 1.94819 ms -ref3 GeMM template time: 2.2353 - -kernel execution time: 1.06049 ms -SpMM template time ref4: 1.35755 - -kernel execution time: 68.6851 ms -taco reference time: 69.0188 - -spmm-spmm execution - 
------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 -C1_dimension: 36417, C2_dimension: 128, vals: 4661376 -D1_dimension: 128, D2_dimension: 64, vals: 8192 - - -kernel execution time: 8.41302 ms -fused time: 8.85733 - -kernel execution time: 17.639 ms -SpMM time: 18.2378 - -kernel execution time: 7.98654 ms -SpMM template time: 8.57087 - -kernel execution time: 6.34574 ms -GeMM time: 6.8938 - -kernel execution time: 6.10335 ms -ref 2 GeMM template time: 6.39173 - -kernel execution time: 5.82956 ms -ref3 GeMM template time: 6.11877 - -kernel execution time: 4.70653 ms -SpMM template time ref4: 5.04278 - -kernel execution time: 671.833 ms -taco reference time: 672.353 - -spmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/rma10/rma10.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 46835, B2_dimension: 46835, vals: 2374001 -C1_dimension: 46835, C2_dimension: 128, vals: 5994880 -D1_dimension: 128, D2_dimension: 64, vals: 8192 - - -kernel execution time: 7.27388 ms -fused time: 7.73945 - -kernel execution time: 17.7256 ms -SpMM time: 18.3199 - -kernel execution time: 7.35832 ms -SpMM template time: 7.9109 - -kernel execution time: 8.33036 ms -GeMM time: 8.86966 - -kernel execution time: 7.86963 ms -ref 2 GeMM template time: 8.15124 - -kernel execution time: 7.7866 ms -ref3 GeMM template time: 8.07407 - -kernel execution time: 4.49305 ms -SpMM template time ref4: 4.80781 - -kernel execution time: 398.926 ms -taco reference time: 399.478 - -spmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/cant/cant.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 62451, B2_dimension: 62451, vals: 4007383 -C1_dimension: 62451, C2_dimension: 128, vals: 7993728 -D1_dimension: 128, D2_dimension: 64, vals: 
8192 - - -kernel execution time: 11.3443 ms -fused time: 11.8147 - -kernel execution time: 22.2928 ms -SpMM time: 22.924 - -kernel execution time: 12.4461 ms -SpMM template time: 13.0043 - -kernel execution time: 10.9317 ms -GeMM time: 11.5006 - -kernel execution time: 10.7585 ms -ref 2 GeMM template time: 11.0658 - -kernel execution time: 11.0196 ms -ref3 GeMM template time: 11.3149 - -kernel execution time: 6.90358 ms -SpMM template time ref4: 7.24984 - -kernel execution time: 657.038 ms -taco reference time: 657.641 - -spmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/consph/consph.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 83334, B2_dimension: 83334, vals: 6010480 -C1_dimension: 83334, C2_dimension: 128, vals: 10666752 -D1_dimension: 128, D2_dimension: 64, vals: 8192 - - -kernel execution time: 15.2657 ms -fused time: 15.7013 - -kernel execution time: 31.6235 ms -SpMM time: 32.1905 - -kernel execution time: 16.8006 ms -SpMM template time: 17.332 - -kernel execution time: 14.3795 ms -GeMM time: 14.9199 - -kernel execution time: 14.4997 ms -ref 2 GeMM template time: 14.8349 - -kernel execution time: 14.0983 ms -ref3 GeMM template time: 14.393 - -kernel execution time: 9.33791 ms -SpMM template time ref4: 9.73698 - -kernel execution time: 903.295 ms -taco reference time: 903.924 - -spmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 121192, B2_dimension: 121192, vals: 2624331 -C1_dimension: 121192, C2_dimension: 128, vals: 15512576 -D1_dimension: 128, D2_dimension: 64, vals: 8192 - - -kernel execution time: 27.1267 ms -fused time: 27.6407 - -kernel execution time: 52.874 ms -SpMM time: 53.49 - -kernel execution time: 25.9708 ms -SpMM template time: 26.5475 - -kernel execution time: 20.1295 ms -GeMM time: 20.7267 - -kernel execution time: 21.2549 ms -ref 2 
GeMM template time: 21.7256 - -kernel execution time: 20.7262 ms -ref3 GeMM template time: 21.1848 - -kernel execution time: 12.5379 ms -SpMM template time ref4: 13.0829 - -kernel execution time: 405.376 ms -taco reference time: 406.043 - -spmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/shipsec1/shipsec1.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 140874, B2_dimension: 140874, vals: 7813404 -C1_dimension: 140874, C2_dimension: 128, vals: 18031872 -D1_dimension: 128, D2_dimension: 64, vals: 8192 - - -kernel execution time: 22.7136 ms -fused time: 23.1625 - -kernel execution time: 49.1418 ms -SpMM time: 49.7343 - -kernel execution time: 25.0936 ms -SpMM template time: 25.604 - -kernel execution time: 23.6444 ms -GeMM time: 24.1812 - -kernel execution time: 24.348 ms -ref 2 GeMM template time: 24.6837 - -kernel execution time: 23.9836 ms -ref3 GeMM template time: 24.2972 - -kernel execution time: 14.4884 ms -SpMM template time ref4: 14.8698 - -kernel execution time: 1154.44 ms -taco reference time: 1155.04 - -spmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/scircuit/scircuit.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 170998, B2_dimension: 170998, vals: 958936 -C1_dimension: 170998, C2_dimension: 128, vals: 21887744 -D1_dimension: 128, D2_dimension: 64, vals: 8192 - - -kernel execution time: 18.015 ms -fused time: 18.4775 - -kernel execution time: 56.1907 ms -SpMM time: 56.8126 - -kernel execution time: 20.0375 ms -SpMM template time: 20.5913 - -kernel execution time: 28.1716 ms -GeMM time: 28.7647 - -kernel execution time: 30.484 ms -ref 2 GeMM template time: 30.9681 - -kernel execution time: 30.0422 ms -ref3 GeMM template time: 30.5496 - -kernel execution time: 10.8925 ms -SpMM template time ref4: 11.4401 - -kernel execution time: 162.277 ms -taco reference time: 162.908 - -spmm-spmm execution - 
------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 206500, B2_dimension: 206500, vals: 1273389 -C1_dimension: 206500, C2_dimension: 128, vals: 26432000 -D1_dimension: 128, D2_dimension: 64, vals: 8192 - - -kernel execution time: 23.8637 ms -fused time: 24.4029 - -kernel execution time: 69.8832 ms -SpMM time: 70.504 - -kernel execution time: 26.8086 ms -SpMM template time: 27.6336 - -kernel execution time: 34.2049 ms -GeMM time: 34.8056 - -kernel execution time: 34.6783 ms -ref 2 GeMM template time: 35.183 - -kernel execution time: 33.8854 ms -ref3 GeMM template time: 34.3954 - -kernel execution time: 13.9069 ms -SpMM template time ref4: 14.4251 - -kernel execution time: 189.271 ms -taco reference time: 189.95 - -spmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 1000005, B2_dimension: 1000005, vals: 3105536 -C1_dimension: 1000005, C2_dimension: 128, vals: 128000640 -D1_dimension: 128, D2_dimension: 64, vals: 8192 - - -kernel execution time: 66.2912 ms -fused time: 66.8207 - -kernel execution time: 335.04 ms -SpMM time: 335.699 - -kernel execution time: 83.9137 ms -SpMM template time: 84.5618 - -kernel execution time: 157.411 ms -GeMM time: 158.061 - -kernel execution time: 169.35 ms -ref 2 GeMM template time: 169.938 - -kernel execution time: 168.201 ms -ref3 GeMM template time: 168.762 - -kernel execution time: 44.531 ms -SpMM template time ref4: 45.176 - -kernel execution time: 458.322 ms -taco reference time: 458.992 - -spmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 5558326, B2_dimension: 5558326, vals: 59524291 -C1_dimension: 5558326, C2_dimension: 128, vals: 
711465728 -D1_dimension: 128, D2_dimension: 64, vals: 8192 - - -kernel execution time: 629.911 ms -fused time: 630.89 - -kernel execution time: 2385.92 ms -SpMM time: 2386.45 - -kernel execution time: 904.117 ms -SpMM template time: 904.66 - -kernel execution time: 867.356 ms -GeMM time: 867.943 - -kernel execution time: 946.344 ms -ref 2 GeMM template time: 946.912 - -kernel execution time: 951.944 ms -ref3 GeMM template time: 952.496 - -kernel execution time: 464.289 ms -SpMM template time ref4: 464.847 - -kernel execution time: 19646 ms -taco reference time: 19647.2 - -spmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 1000005, B2_dimension: 1000005, vals: 3105536 -C1_dimension: 1000005, C2_dimension: 128, vals: 128000640 -D1_dimension: 128, D2_dimension: 64, vals: 8192 - - -kernel execution time: 65.749 ms -fused time: 66.2393 - -kernel execution time: 334.436 ms -SpMM time: 335.114 - -kernel execution time: 85.6378 ms -SpMM template time: 86.2216 - -kernel execution time: 156.716 ms -GeMM time: 157.281 - -kernel execution time: 169.383 ms -ref 2 GeMM template time: 169.948 - -kernel execution time: 168.128 ms -ref3 GeMM template time: 168.722 - -kernel execution time: 44.3902 ms -SpMM template time ref4: 44.9859 - -kernel execution time: 462.089 ms -taco reference time: 462.747 - -kernel execution time: 472.176 ms -taco reference new time: 472.868 - -spmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 206500, B2_dimension: 206500, vals: 1273389 -C1_dimension: 206500, C2_dimension: 128, vals: 26432000 -D1_dimension: 128, D2_dimension: 64, vals: 8192 - - -kernel execution time: 22.9203 ms -fused time: 23.382 - -kernel execution time: 69.0678 ms -SpMM time: 69.6771 - -kernel execution 
time: 25.7576 ms -SpMM template time: 26.2883 - -kernel execution time: 33.838 ms -GeMM time: 34.3893 - -kernel execution time: 36.2223 ms -ref 2 GeMM template time: 36.7099 - -kernel execution time: 35.9919 ms -ref3 GeMM template time: 36.5181 - -kernel execution time: 13.5094 ms -SpMM template time ref4: 14.0411 - -kernel execution time: 209.225 ms -taco reference time: 209.806 - -kernel execution time: 195.258 ms -taco reference new time: 195.862 - -spmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 206500, B2_dimension: 206500, vals: 1273389 -C1_dimension: 206500, C2_dimension: 128, vals: 26432000 -D1_dimension: 128, D2_dimension: 64, vals: 8192 - - -kernel execution time: 23.9941 ms -fused time: 24.5306 - -kernel execution time: 70.3118 ms -SpMM time: 70.9711 - -kernel execution time: 26.7754 ms -SpMM template time: 27.3965 - -kernel execution time: 34.3488 ms -GeMM time: 34.9449 - -kernel execution time: 34.9754 ms -ref 2 GeMM template time: 35.5492 - -kernel execution time: 34.4524 ms -ref3 GeMM template time: 35.0358 - -kernel execution time: 13.8295 ms -SpMM template time ref4: 14.4023 - -kernel execution time: 195.316 ms -taco reference time: 195.985 - -kernel execution time: 194.321 ms -taco reference new time: 194.959 - -spmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/shipsec1/shipsec1.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 140874, B2_dimension: 140874, vals: 7813404 -C1_dimension: 140874, C2_dimension: 128, vals: 18031872 -D1_dimension: 128, D2_dimension: 64, vals: 8192 - - -kernel execution time: 499.31 ms -fused time: 500.253 - -kernel execution time: 1127.92 ms -SpMM time: 1128.46 - -kernel execution time: 314.563 ms -SpMM template time: 315.094 - -kernel execution time: 1071.42 ms -GeMM time: 1071.96 - -kernel execution 
time: 772.255 ms -ref 2 GeMM template time: 772.765 - -kernel execution time: 768.478 ms -ref3 GeMM template time: 768.998 - -kernel execution time: 162.934 ms -SpMM template time ref4: 163.456 - -kernel execution time: 51182.8 ms -taco reference time: 51183.7 - -kernel execution time: 62360.6 ms -taco reference new time: 62361.5 - -spmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/scircuit/scircuit.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 170998, B2_dimension: 170998, vals: 958936 -C1_dimension: 170998, C2_dimension: 128, vals: 21887744 -D1_dimension: 128, D2_dimension: 64, vals: 8192 - - -kernel execution time: 343.987 ms -fused time: 344.403 - -kernel execution time: 127.278 ms -SpMM time: 127.803 - -kernel execution time: 139.755 ms -SpMM template time: 140.297 - -kernel execution time: 1308.19 ms -GeMM time: 1308.77 - -kernel execution time: 930.985 ms -ref 2 GeMM template time: 931.498 - -kernel execution time: 924.636 ms -ref3 GeMM template time: 925.164 - -kernel execution time: 83.9238 ms -SpMM template time ref4: 84.4508 - -kernel execution time: 6298.13 ms -taco reference time: 6299.21 - -kernel execution time: 7357.04 ms -taco reference new time: 7358.09 - -spmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 206500, B2_dimension: 206500, vals: 1273389 -C1_dimension: 206500, C2_dimension: 128, vals: 26432000 -D1_dimension: 128, D2_dimension: 64, vals: 8192 - - -kernel execution time: 404.825 ms -fused time: 405.271 - -kernel execution time: 142.933 ms -SpMM time: 143.48 - -kernel execution time: 155.193 ms -SpMM template time: 155.761 - -kernel execution time: 1572.88 ms -GeMM time: 1573.41 - -kernel execution time: 1132.63 ms -ref 2 GeMM template time: 1133.13 - -kernel execution time: 1126.54 ms -ref3 GeMM template time: 1127.06 
- -kernel execution time: 96.7404 ms -SpMM template time ref4: 97.2437 - -kernel execution time: 8321.2 ms -taco reference time: 8322.27 - -kernel execution time: 9774.76 ms -taco reference new time: 9775.82 - -spmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 1000005, B2_dimension: 1000005, vals: 3105536 -C1_dimension: 1000005, C2_dimension: 128, vals: 128000640 -D1_dimension: 128, D2_dimension: 64, vals: 8192 - - -kernel execution time: 1796.56 ms -fused time: 1797.34 - -kernel execution time: 429.623 ms -SpMM time: 430.127 - -kernel execution time: 406.352 ms -SpMM template time: 406.855 - -kernel execution time: 7603.48 ms -GeMM time: 7604.4 - -kernel execution time: 5458.44 ms -ref 2 GeMM template time: 5459.36 - -kernel execution time: 5413.18 ms -ref3 GeMM template time: 5414.05 - -kernel execution time: 266.783 ms -SpMM template time ref4: 267.276 - -kernel execution time: 20481.5 ms -taco reference time: 20482.6 - -kernel execution time: 23942.3 ms -taco reference new time: 23943.8 - -spmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 5558326, B2_dimension: 5558326, vals: 59524291 -C1_dimension: 5558326, C2_dimension: 128, vals: 711465728 -D1_dimension: 128, D2_dimension: 64, vals: 8192 - - -kernel execution time: 11983.1 ms -fused time: 11984.1 - -kernel execution time: 14647.3 ms -SpMM time: 14648.4 - -kernel execution time: 5779.35 ms -SpMM template time: 5780.3 - -kernel execution time: 42156 ms -GeMM time: 42156.9 - -kernel execution time: 30315.6 ms -ref 2 GeMM template time: 30316.6 - -kernel execution time: 30070.9 ms -ref3 GeMM template time: 30071.9 - -kernel execution time: 3196.34 ms -SpMM template time ref4: 3197.36 - -kernel execution time: 387963 ms -taco reference time: 
387964 - -kernel execution time: 481094 ms -taco reference new time: 481095 - -spmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 10974, B2_dimension: 10974, vals: 428650 -C1_dimension: 10974, C2_dimension: 128, vals: 1404672 -D1_dimension: 128, D2_dimension: 64, vals: 8192 - - -kernel execution time: 32.8596 ms -fused time: 33.2745 - -kernel execution time: 57.4073 ms -SpMM time: 57.9242 - -kernel execution time: 18.9092 ms -SpMM template time: 19.4238 - -kernel execution time: 84.8547 ms -GeMM time: 85.3549 - -kernel execution time: 60.5468 ms -ref 2 GeMM template time: 60.9429 - -kernel execution time: 60.3303 ms -ref3 GeMM template time: 60.7269 - -kernel execution time: 9.95693 ms -SpMM template time ref4: 10.3864 - -kernel execution time: 2808.32 ms -taco reference time: 2808.79 - -kernel execution time: 3456.32 ms -taco reference new time: 3457.29 - -spmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 -C1_dimension: 36417, C2_dimension: 128, vals: 4661376 -D1_dimension: 128, D2_dimension: 64, vals: 8192 - - -kernel execution time: 203.078 ms -fused time: 203.513 - -kernel execution time: 594.431 ms -SpMM time: 594.968 - -kernel execution time: 135.247 ms -SpMM template time: 135.774 - -kernel execution time: 277.557 ms -GeMM time: 278.077 - -kernel execution time: 201.246 ms -ref 2 GeMM template time: 201.741 - -kernel execution time: 200.173 ms -ref3 GeMM template time: 200.697 - -kernel execution time: 67.3815 ms -SpMM template time ref4: 67.9079 - -kernel execution time: 28413.2 ms -taco reference time: 28414.2 - -kernel execution time: 34685.2 ms -taco reference new time: 34687 - -spmm-spmm execution - ------------------------------------------ 
-/home/min/a/kadhitha/ispc-examples/data/suitesparse/rma10/rma10.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 46835, B2_dimension: 46835, vals: 2374001 -C1_dimension: 46835, C2_dimension: 128, vals: 5994880 -D1_dimension: 128, D2_dimension: 64, vals: 8192 - - -kernel execution time: 156.103 ms -fused time: 156.534 - -kernel execution time: 313.946 ms -SpMM time: 314.545 - -kernel execution time: 95.9908 ms -SpMM template time: 96.5235 - -kernel execution time: 355.516 ms -GeMM time: 356.043 - -kernel execution time: 257.486 ms -ref 2 GeMM template time: 258 - -kernel execution time: 255.966 ms -ref3 GeMM template time: 256.498 - -kernel execution time: 50.7943 ms -SpMM template time ref4: 51.3121 - -kernel execution time: 15474.9 ms -taco reference time: 15476 - -kernel execution time: 19054.1 ms -taco reference new time: 19055.3 - -spmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/cant/cant.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 62451, B2_dimension: 62451, vals: 4007383 -C1_dimension: 62451, C2_dimension: 128, vals: 7993728 -D1_dimension: 128, D2_dimension: 64, vals: 8192 - - -kernel execution time: 233.01 ms -fused time: 233.435 - -kernel execution time: 583.856 ms -SpMM time: 584.39 - -kernel execution time: 148.111 ms -SpMM template time: 148.649 - -kernel execution time: 474.209 ms -GeMM time: 474.735 - -kernel execution time: 343.934 ms -ref 2 GeMM template time: 344.44 - -kernel execution time: 342.778 ms -ref3 GeMM template time: 343.3 - -kernel execution time: 74.5241 ms -SpMM template time ref4: 75.0386 - -kernel execution time: 26129.8 ms -taco reference time: 26130.9 - -kernel execution time: 32058.9 ms -taco reference new time: 32059.8 - -spmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/consph/consph.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 83334, B2_dimension: 83334, vals: 6010480 
-C1_dimension: 83334, C2_dimension: 128, vals: 10666752 -D1_dimension: 128, D2_dimension: 64, vals: 8192 - - -kernel execution time: 332.296 ms -fused time: 332.73 - -kernel execution time: 871.053 ms -SpMM time: 871.586 - -kernel execution time: 217.386 ms -SpMM template time: 217.911 - -kernel execution time: 636.82 ms -GeMM time: 637.357 - -kernel execution time: 461.8 ms -ref 2 GeMM template time: 462.325 - -kernel execution time: 458.184 ms -ref3 GeMM template time: 458.738 - -kernel execution time: 114.816 ms -SpMM template time ref4: 115.341 - -kernel execution time: 39240.9 ms -taco reference time: 39242 - -kernel execution time: 48108.4 ms -taco reference new time: 48109.4 - -spmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 121192, B2_dimension: 121192, vals: 2624331 -C1_dimension: 121192, C2_dimension: 128, vals: 15512576 -D1_dimension: 128, D2_dimension: 64, vals: 8192 - - -kernel execution time: 351.775 ms -fused time: 352.201 - -kernel execution time: 317.447 ms -SpMM time: 317.983 - -kernel execution time: 217.205 ms -SpMM template time: 217.733 - -kernel execution time: 921.754 ms -GeMM time: 922.288 - -kernel execution time: 667.69 ms -ref 2 GeMM template time: 668.21 - -kernel execution time: 655.357 ms -ref3 GeMM template time: 655.888 - -kernel execution time: 118.018 ms -SpMM template time ref4: 118.546 - -kernel execution time: 17243.9 ms -taco reference time: 17245 - -kernel execution time: 21353.4 ms -taco reference new time: 21354.7 - -spmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 121192, B2_dimension: 121192, vals: 2624331 -C1_dimension: 121192, C2_dimension: 128, vals: 15512576 -D1_dimension: 128, D2_dimension: 64, vals: 8192 - - -kernel execution time: 27502 ms 
-fused time: 27581.4 - -kernel execution time: 19193.1 ms -SpMM time: 19304.1 - -kernel execution time: 8528.83 ms -SpMM template time: 8571.46 - -kernel execution time: 33685.2 ms -GeMM time: 33768.7 - -kernel execution time: 32503 ms -ref 2 GeMM template time: 32589.2 - -kernel execution time: 32859.6 ms -ref3 GeMM template time: 32952.9 - -kernel execution time: 4862.19 ms -SpMM template time ref4: 4917.41 - -kernel execution time: 891084 ms -taco reference time: 891170 - - - - - - ----------------------------------------------------------------------------------------------------- ----------------------------------------------------------------------------------------------------- ----------------------------------------------------------------------------------------------------- - - - - -spmm-spmm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 10974, B2_dimension: 10974, vals: 428650 -C1_dimension: 10974, C2_dimension: 128, vals: 1404672 -D1_dimension: 128, D2_dimension: 64, vals: 8192 - - -kernel execution time: 2.69255 ms -fused time: 5.71229 - -kernel execution time: 3.93158 ms -SpMM time: 4.42244 - -1st pattern computation - -kernel execution time: 1.69479 ms -SpMM template time: 2.18137 - -kernel execution time: 2.53215 ms -GeMM time: 2.92698 - -kernel execution time: 82.7455 ms -ref 2 GeMM template time: 83.6829 - -2nd pattern computation - -kernel execution time: 2.52512 ms -ref3 GeMM template time: 2.90403 - -kernel execution time: 1.07835 ms -SpMM template time ref4: 1.34312 - -reference pattern computation - -kernel execution time: 66.8405 ms -taco reference time: 67.1485 - -kernel execution time: 71.5847 ms -taco reference new time: 71.9261 - -spmm-spmm execution - ------------------------------------------ -filenum: 2 ---------------------------------- 
-/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 10974, B2_dimension: 10974, vals: 428650 -C1_dimension: 10974, C2_dimension: 128, vals: 1404672 -D1_dimension: 128, D2_dimension: 64, vals: 8192 - - -kernel execution time: 2.77205 ms -fused time: 6.22498 - -kernel execution time: 3.70735 ms -SpMM time: 4.15143 - -1st pattern computation - -kernel execution time: 1.68777 ms -SpMM template time: 2.37238 - -kernel execution time: 2.64104 ms -GeMM time: 5.76589 - -kernel execution time: 81.9899 ms -ref 2 GeMM template time: 82.2704 - -2nd pattern computation - -kernel execution time: 2.45488 ms -ref3 GeMM template time: 2.8586 - -kernel execution time: 1.12289 ms -SpMM template time ref4: 1.39155 - -reference pattern computation - -kernel execution time: 76.3877 ms -taco reference time: 78.7939 - -kernel execution time: 72.755 ms -taco reference new time: 73.1269 -filenum: 3 ---------------------------------- -/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 -C1_dimension: 36417, C2_dimension: 128, vals: 4661376 -D1_dimension: 128, D2_dimension: 64, vals: 8192 - - -kernel execution time: 7.80932 ms -fused time: 11.2518 - -kernel execution time: 16.5944 ms -SpMM time: 17.886 - -1st pattern computation - -kernel execution time: 7.11089 ms -SpMM template time: 7.68253 - -kernel execution time: 6.4731 ms -GeMM time: 9.33681 - -kernel execution time: 275.759 ms -ref 2 GeMM template time: 276.631 - -2nd pattern computation - -kernel execution time: 6.3356 ms -ref3 GeMM template time: 6.81471 - -kernel execution time: 4.47152 ms -SpMM template time ref4: 4.76175 - -reference pattern computation - -kernel execution time: 658.29 ms -taco reference time: 658.76 - -kernel execution time: 687.782 ms -taco reference new time: 688.49 -filenum: 4 ---------------------------------- 
-/home/min/a/kadhitha/ispc-examples/data/suitesparse/rma10/rma10.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 46835, B2_dimension: 46835, vals: 2374001 -C1_dimension: 46835, C2_dimension: 128, vals: 5994880 -D1_dimension: 128, D2_dimension: 64, vals: 8192 - - -kernel execution time: 6.78576 ms -fused time: 8.17823 - -kernel execution time: 18.7121 ms -SpMM time: 20.1397 - -1st pattern computation - -kernel execution time: 6.53343 ms -SpMM template time: 7.11366 - -kernel execution time: 8.13131 ms -GeMM time: 10.4823 - -kernel execution time: 341.676 ms -ref 2 GeMM template time: 341.986 - -2nd pattern computation - -kernel execution time: 7.69804 ms -ref3 GeMM template time: 8.15483 - -kernel execution time: 4.61245 ms -SpMM template time ref4: 4.90988 - -reference pattern computation - -kernel execution time: 343.367 ms -taco reference time: 343.755 - -kernel execution time: 374.197 ms -taco reference new time: 374.704 -filenum: 5 ---------------------------------- -/home/min/a/kadhitha/ispc-examples/data/suitesparse/cant/cant.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 62451, B2_dimension: 62451, vals: 4007383 -C1_dimension: 62451, C2_dimension: 128, vals: 7993728 -D1_dimension: 128, D2_dimension: 64, vals: 8192 - - -kernel execution time: 11.6176 ms -fused time: 15.1115 - -kernel execution time: 22.6994 ms -SpMM time: 23.3508 - -1st pattern computation - -kernel execution time: 11.9033 ms -SpMM template time: 12.4284 - -kernel execution time: 10.4635 ms -GeMM time: 10.9336 - -kernel execution time: 452.62 ms -ref 2 GeMM template time: 452.931 - -2nd pattern computation - -kernel execution time: 9.29193 ms -ref3 GeMM template time: 9.74228 - -kernel execution time: 7.21434 ms -SpMM template time ref4: 7.5664 - -reference pattern computation - -kernel execution time: 570.857 ms -taco reference time: 571.396 - -kernel execution time: 623.78 ms -taco reference new time: 624.325 -filenum: 6 ---------------------------------- 
-/home/min/a/kadhitha/ispc-examples/data/suitesparse/consph/consph.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 83334, B2_dimension: 83334, vals: 6010480 -C1_dimension: 83334, C2_dimension: 128, vals: 10666752 -D1_dimension: 128, D2_dimension: 64, vals: 8192 - - -kernel execution time: 15.2241 ms -fused time: 17.4586 - -kernel execution time: 31.7064 ms -SpMM time: 32.3582 - -1st pattern computation - -kernel execution time: 16.5454 ms -SpMM template time: 17.0802 - -kernel execution time: 13.8741 ms -GeMM time: 14.3707 - -kernel execution time: 604.662 ms -ref 2 GeMM template time: 605.002 - -2nd pattern computation - -kernel execution time: 11.9433 ms -ref3 GeMM template time: 12.403 - -kernel execution time: 9.77169 ms -SpMM template time ref4: 10.1324 - -reference pattern computation - -kernel execution time: 841.646 ms -taco reference time: 842.221 - -kernel execution time: 932.828 ms -taco reference new time: 933.378 -filenum: 7 ---------------------------------- -/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 121192, B2_dimension: 121192, vals: 2624331 -C1_dimension: 121192, C2_dimension: 128, vals: 15512576 -D1_dimension: 128, D2_dimension: 64, vals: 8192 - - -kernel execution time: 25.1981 ms -fused time: 28.3453 - -kernel execution time: 51.7019 ms -SpMM time: 52.3269 - -1st pattern computation - -kernel execution time: 24.2567 ms -SpMM template time: 24.8204 - -kernel execution time: 19.9687 ms -GeMM time: 20.5536 - -kernel execution time: 874.389 ms -ref 2 GeMM template time: 874.8 - -2nd pattern computation - -kernel execution time: 17.1428 ms -ref3 GeMM template time: 17.605 - -kernel execution time: 12.4989 ms -SpMM template time ref4: 12.9327 - -reference pattern computation - -kernel execution time: 374.424 ms -taco reference time: 375.053 - -kernel execution time: 412.224 ms -taco reference new time: 412.828 -filenum: 8 ---------------------------------- 
-/home/min/a/kadhitha/ispc-examples/data/suitesparse/shipsec1/shipsec1.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 140874, B2_dimension: 140874, vals: 7813404 -C1_dimension: 140874, C2_dimension: 128, vals: 18031872 -D1_dimension: 128, D2_dimension: 64, vals: 8192 - - -kernel execution time: 22.3642 ms -fused time: 22.9541 - -kernel execution time: 48.8361 ms -SpMM time: 49.478 - -1st pattern computation - -kernel execution time: 24.4919 ms -SpMM template time: 25.0744 - -kernel execution time: 23.1278 ms -GeMM time: 23.714 - -kernel execution time: 1021.89 ms -ref 2 GeMM template time: 1022.32 - -2nd pattern computation - -kernel execution time: 19.872 ms -ref3 GeMM template time: 20.3315 - -kernel execution time: 14.608 ms -SpMM template time ref4: 15.077 - -reference pattern computation - -kernel execution time: 1080.68 ms -taco reference time: 1081.32 - -kernel execution time: 1211.77 ms -taco reference new time: 1212.36 -filenum: 9 ---------------------------------- -/home/min/a/kadhitha/ispc-examples/data/suitesparse/scircuit/scircuit.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 170998, B2_dimension: 170998, vals: 958936 -C1_dimension: 170998, C2_dimension: 128, vals: 21887744 -D1_dimension: 128, D2_dimension: 64, vals: 8192 - - -kernel execution time: 16.318 ms -fused time: 18.887 - -kernel execution time: 56.5258 ms -SpMM time: 57.1171 - -1st pattern computation - -kernel execution time: 18.2007 ms -SpMM template time: 18.7215 - -kernel execution time: 28.1041 ms -GeMM time: 28.6173 - -kernel execution time: 1232.84 ms -ref 2 GeMM template time: 1233.26 - -2nd pattern computation - -kernel execution time: 23.6402 ms -ref3 GeMM template time: 24.1216 - -kernel execution time: 10.6221 ms -SpMM template time ref4: 11.1278 - -reference pattern computation - -kernel execution time: 136.61 ms -taco reference time: 137.191 - -kernel execution time: 143.222 ms -taco reference new time: 143.823 -filenum: 10 ---------------------------------- 
-/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 206500, B2_dimension: 206500, vals: 1273389 -C1_dimension: 206500, C2_dimension: 128, vals: 26432000 -D1_dimension: 128, D2_dimension: 64, vals: 8192 - - -kernel execution time: 22.1951 ms -fused time: 25.4707 - -kernel execution time: 69.5817 ms -SpMM time: 70.2133 - -1st pattern computation - -kernel execution time: 25.2229 ms -SpMM template time: 25.818 - -kernel execution time: 34.0166 ms -GeMM time: 34.5719 - -kernel execution time: 1506.8 ms -ref 2 GeMM template time: 1507.32 - -2nd pattern computation - -kernel execution time: 27.9513 ms -ref3 GeMM template time: 28.4381 - -kernel execution time: 13.4585 ms -SpMM template time ref4: 14.0168 - -reference pattern computation - -kernel execution time: 182.244 ms -taco reference time: 182.878 - -kernel execution time: 191.621 ms -taco reference new time: 192.28 -filenum: 12 ---------------------------------- -/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 1000005, B2_dimension: 1000005, vals: 3105536 -C1_dimension: 1000005, C2_dimension: 128, vals: 128000640 -D1_dimension: 128, D2_dimension: 64, vals: 8192 - - -kernel execution time: 62.6358 ms -fused time: 66.0562 - -kernel execution time: 331.995 ms -SpMM time: 332.669 - -1st pattern computation - -kernel execution time: 81.0262 ms -SpMM template time: 81.6316 - -kernel execution time: 155.308 ms -GeMM time: 155.913 - -kernel execution time: 7174.32 ms -ref 2 GeMM template time: 7175.38 - -2nd pattern computation - -kernel execution time: 131.848 ms -ref3 GeMM template time: 132.36 - -kernel execution time: 43.681 ms -SpMM template time ref4: 44.293 - -reference pattern computation - -kernel execution time: 444.857 ms -taco reference time: 445.492 - -kernel execution time: 467.509 ms -taco reference new time: 468.15 -filenum: 15 
---------------------------------- -/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 5558326, B2_dimension: 5558326, vals: 59524291 -C1_dimension: 5558326, C2_dimension: 128, vals: 711465728 -D1_dimension: 128, D2_dimension: 64, vals: 8192 - - -kernel execution time: 621.338 ms -fused time: 625.05 - -kernel execution time: 2276.7 ms -SpMM time: 2277.28 - -1st pattern computation - -kernel execution time: 881.7 ms -SpMM template time: 882.296 - -kernel execution time: 859.785 ms -GeMM time: 860.272 - -kernel execution time: 39771.6 ms -ref 2 GeMM template time: 39772.6 - -2nd pattern computation - -kernel execution time: 748.251 ms -ref3 GeMM template time: 748.758 - -kernel execution time: 452.61 ms -SpMM template time ref4: 453.163 - -reference pattern computation - -kernel execution time: 19528.6 ms -taco reference time: 19529.7 - -kernel execution time: 26715.2 ms -taco reference new time: 26716.6 - -spmm-spmm execution - ------------------------------------------ -filenum: 2 ---------------------------------- -/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 10974, B2_dimension: 10974, vals: 428650 -C1_dimension: 10974, C2_dimension: 128, vals: 1404672 -D1_dimension: 128, D2_dimension: 64, vals: 8192 - - -kernel execution time: 2.64213 ms -fused time: 6.13507 - ---------- 1st pattern computation TTM, GEMM - -kernel execution time: 3.84056 ms -SpMM time: 4.24008 - -kernel execution time: 1.61274 ms -SpMM template time: 2.04575 - -kernel execution time: 2.33971 ms -GeMM time: 2.69705 - -kernel execution time: 85.2544 ms -ref 2 GeMM template time: 86.1514 - ---------- 2nd pattern computation GEMM, SpMM - -kernel execution time: 2.2757 ms -ref3 GeMM template time: 2.64863 - -kernel execution time: 1.04819 ms -SpMM template time ref4: 1.27491 - --------- reference pattern computation - -kernel execution time: 69.4126 ms 
-taco reference time: 71.9418 - -kernel execution time: 71.8522 ms -taco reference new time: 72.137 -filenum: 3 ---------------------------------- -/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 -C1_dimension: 36417, C2_dimension: 128, vals: 4661376 -D1_dimension: 128, D2_dimension: 64, vals: 8192 - - -kernel execution time: 7.47716 ms -fused time: 11.1061 - ---------- 1st pattern computation TTM, GEMM - -kernel execution time: 16.7215 ms -SpMM time: 17.3352 - -kernel execution time: 7.10234 ms -SpMM template time: 7.68864 - -kernel execution time: 6.44691 ms -GeMM time: 9.89357 - -kernel execution time: 275.868 ms -ref 2 GeMM template time: 276.795 - ---------- 2nd pattern computation GEMM, SpMM - -kernel execution time: 6.21948 ms -ref3 GeMM template time: 6.86379 - -kernel execution time: 4.55999 ms -SpMM template time ref4: 4.85255 - --------- reference pattern computation - -kernel execution time: 643.662 ms -taco reference time: 644.221 - -kernel execution time: 682.88 ms -taco reference new time: 683.468 -filenum: 4 ---------------------------------- -/home/min/a/kadhitha/ispc-examples/data/suitesparse/rma10/rma10.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 46835, B2_dimension: 46835, vals: 2374001 -C1_dimension: 46835, C2_dimension: 128, vals: 5994880 -D1_dimension: 128, D2_dimension: 64, vals: 8192 - - -kernel execution time: 7.25024 ms -fused time: 11.0411 - ---------- 1st pattern computation TTM, GEMM - -kernel execution time: 18.4386 ms -SpMM time: 18.956 - -kernel execution time: 6.48062 ms -SpMM template time: 7.03658 - -kernel execution time: 7.9428 ms -GeMM time: 9.42206 - -kernel execution time: 343.414 ms -ref 2 GeMM template time: 343.746 - ---------- 2nd pattern computation GEMM, SpMM - -kernel execution time: 6.9495 ms -ref3 GeMM template time: 7.40299 - -kernel execution time: 4.95305 ms -SpMM template time ref4: 5.26981 - 
--------- reference pattern computation - -kernel execution time: 338.889 ms -taco reference time: 339.74 - -kernel execution time: 373.621 ms -taco reference new time: 374.075 -filenum: 5 ---------------------------------- -/home/min/a/kadhitha/ispc-examples/data/suitesparse/cant/cant.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 62451, B2_dimension: 62451, vals: 4007383 -C1_dimension: 62451, C2_dimension: 128, vals: 7993728 -D1_dimension: 128, D2_dimension: 64, vals: 8192 - - -kernel execution time: 11.3714 ms -fused time: 15.0722 - ---------- 1st pattern computation TTM, GEMM - -kernel execution time: 22.4213 ms -SpMM time: 22.9773 - -kernel execution time: 11.8747 ms -SpMM template time: 12.4314 - -kernel execution time: 10.2572 ms -GeMM time: 12.818 - -kernel execution time: 451.818 ms -ref 2 GeMM template time: 452.131 - ---------- 2nd pattern computation GEMM, SpMM - -kernel execution time: 9.4658 ms -ref3 GeMM template time: 9.90856 - -kernel execution time: 6.97316 ms -SpMM template time ref4: 7.30846 - --------- reference pattern computation - -kernel execution time: 543.932 ms -taco reference time: 544.422 - -kernel execution time: 623.419 ms -taco reference new time: 623.935 -filenum: 6 ---------------------------------- -/home/min/a/kadhitha/ispc-examples/data/suitesparse/consph/consph.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 83334, B2_dimension: 83334, vals: 6010480 -C1_dimension: 83334, C2_dimension: 128, vals: 10666752 -D1_dimension: 128, D2_dimension: 64, vals: 8192 - - -kernel execution time: 15.18 ms -fused time: 18.5471 - ---------- 1st pattern computation TTM, GEMM - -kernel execution time: 31.3038 ms -SpMM time: 31.9251 - -kernel execution time: 16.4816 ms -SpMM template time: 17.0655 - -kernel execution time: 13.7454 ms -GeMM time: 14.2668 - -kernel execution time: 601.657 ms -ref 2 GeMM template time: 602.024 - ---------- 2nd pattern computation GEMM, SpMM - -kernel execution time: 14.354 ms -ref3 GeMM template time: 14.8072 - 
-kernel execution time: 9.41569 ms -SpMM template time ref4: 9.77992 - --------- reference pattern computation - -kernel execution time: 805.535 ms -taco reference time: 806.106 - -kernel execution time: 928.447 ms -taco reference new time: 928.999 -filenum: 7 ---------------------------------- -/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 121192, B2_dimension: 121192, vals: 2624331 -C1_dimension: 121192, C2_dimension: 128, vals: 15512576 -D1_dimension: 128, D2_dimension: 64, vals: 8192 - - -kernel execution time: 25.2666 ms -fused time: 27.8771 - ---------- 1st pattern computation TTM, GEMM - -kernel execution time: 51.9104 ms -SpMM time: 52.5127 - -kernel execution time: 23.9709 ms -SpMM template time: 24.5371 - -kernel execution time: 19.8979 ms -GeMM time: 20.5052 - -kernel execution time: 878.762 ms -ref 2 GeMM template time: 879.166 - ---------- 2nd pattern computation GEMM, SpMM - -kernel execution time: 16.9454 ms -ref3 GeMM template time: 17.4072 - -kernel execution time: 12.6943 ms -SpMM template time ref4: 13.1204 - --------- reference pattern computation - -kernel execution time: 356.591 ms -taco reference time: 357.146 - -kernel execution time: 408.529 ms -taco reference new time: 409.172 -filenum: 8 ---------------------------------- -/home/min/a/kadhitha/ispc-examples/data/suitesparse/shipsec1/shipsec1.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 140874, B2_dimension: 140874, vals: 7813404 -C1_dimension: 140874, C2_dimension: 128, vals: 18031872 -D1_dimension: 128, D2_dimension: 64, vals: 8192 - - -kernel execution time: 22.2469 ms -fused time: 22.8567 - ---------- 1st pattern computation TTM, GEMM - -kernel execution time: 49.6959 ms -SpMM time: 50.3273 - -kernel execution time: 24.2333 ms -SpMM template time: 24.8116 - -kernel execution time: 23.0719 ms -GeMM time: 23.6169 - -kernel execution time: 1017.55 ms -ref 2 GeMM template time: 1018 - ---------- 2nd pattern 
computation GEMM, SpMM - -kernel execution time: 19.3601 ms -ref3 GeMM template time: 19.8249 - -kernel execution time: 14.2804 ms -SpMM template time ref4: 14.7665 - --------- reference pattern computation - -kernel execution time: 1048.84 ms -taco reference time: 1049.44 - -kernel execution time: 1209.88 ms -taco reference new time: 1210.47 -filenum: 9 ---------------------------------- -/home/min/a/kadhitha/ispc-examples/data/suitesparse/scircuit/scircuit.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 170998, B2_dimension: 170998, vals: 958936 -C1_dimension: 170998, C2_dimension: 128, vals: 21887744 -D1_dimension: 128, D2_dimension: 64, vals: 8192 - - -kernel execution time: 15.8746 ms -fused time: 19.813 - ---------- 1st pattern computation TTM, GEMM - -kernel execution time: 55.9723 ms -SpMM time: 56.6152 - -kernel execution time: 17.9806 ms -SpMM template time: 18.623 - -kernel execution time: 27.7406 ms -GeMM time: 28.4557 - -kernel execution time: 1236.24 ms -ref 2 GeMM template time: 1236.69 - ---------- 2nd pattern computation GEMM, SpMM - -kernel execution time: 23.8143 ms -ref3 GeMM template time: 24.2887 - -kernel execution time: 10.5388 ms -SpMM template time ref4: 11.0342 - --------- reference pattern computation - -kernel execution time: 131.162 ms -taco reference time: 131.729 - -kernel execution time: 142.639 ms -taco reference new time: 143.262 -filenum: 10 ---------------------------------- -/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 206500, B2_dimension: 206500, vals: 1273389 -C1_dimension: 206500, C2_dimension: 128, vals: 26432000 -D1_dimension: 128, D2_dimension: 64, vals: 8192 - - -kernel execution time: 22.0414 ms -fused time: 24.5186 - ---------- 1st pattern computation TTM, GEMM - -kernel execution time: 69.6038 ms -SpMM time: 70.136 - -kernel execution time: 24.6489 ms -SpMM template time: 25.1488 - -kernel execution time: 33.413 ms -GeMM 
time: 33.9108 - -kernel execution time: 1497.05 ms -ref 2 GeMM template time: 1497.51 - ---------- 2nd pattern computation GEMM, SpMM - -kernel execution time: 29.3442 ms -ref3 GeMM template time: 29.8157 - -kernel execution time: 12.9244 ms -SpMM template time ref4: 13.3503 - --------- reference pattern computation - -kernel execution time: 174.347 ms -taco reference time: 174.811 - -kernel execution time: 190.408 ms -taco reference new time: 190.973 -filenum: 12 ---------------------------------- -/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 1000005, B2_dimension: 1000005, vals: 3105536 -C1_dimension: 1000005, C2_dimension: 128, vals: 128000640 -D1_dimension: 128, D2_dimension: 64, vals: 8192 - - -kernel execution time: 61.219 ms -fused time: 65.9604 - ---------- 1st pattern computation TTM, GEMM - -kernel execution time: 329.098 ms -SpMM time: 329.782 - -kernel execution time: 80.1902 ms -SpMM template time: 80.758 - -kernel execution time: 154.474 ms -GeMM time: 155.08 - -kernel execution time: 7192.75 ms -ref 2 GeMM template time: 7193.76 - ---------- 2nd pattern computation GEMM, SpMM - -kernel execution time: 132.057 ms -ref3 GeMM template time: 132.561 - -kernel execution time: 43.0394 ms -SpMM template time ref4: 43.6558 - --------- reference pattern computation - -kernel execution time: 430.157 ms -taco reference time: 430.825 - -kernel execution time: 463.848 ms -taco reference new time: 464.498 -filenum: 15 ---------------------------------- -/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 5558326, B2_dimension: 5558326, vals: 59524291 -C1_dimension: 5558326, C2_dimension: 128, vals: 711465728 -D1_dimension: 128, D2_dimension: 64, vals: 8192 - - -kernel execution time: 602.9 ms -fused time: 606.764 - ---------- 1st pattern computation TTM, GEMM - -kernel execution time: 2126.86 ms -SpMM time: 2127.49 - 
-kernel execution time: 871.892 ms -SpMM template time: 872.491 - -kernel execution time: 845.837 ms -GeMM time: 846.363 - -kernel execution time: 39844.5 ms -ref 2 GeMM template time: 39845.6 - ---------- 2nd pattern computation GEMM, SpMM - -kernel execution time: 740.208 ms -ref3 GeMM template time: 740.701 - -kernel execution time: 447.66 ms -SpMM template time ref4: 448.268 - --------- reference pattern computation - -kernel execution time: 18669.7 ms -taco reference time: 18671 - -kernel execution time: 26729.8 ms -taco reference new time: 26731.1 - -spmm-spmm execution - ------------------------------------------ -filenum: 1 ---------------------------------- -/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 5, B2_dimension: 5, vals: 19 -C1_dimension: 5, C2_dimension: 128, vals: 640 -D1_dimension: 128, D2_dimension: 64, vals: 8192 - - -kernel execution time: 0.044111 ms -fused time: 0.69912 - ---------- 1st pattern computation TTM, GEMM - -kernel execution time: 0.019191 ms -SpMM time: 1.30214 - -kernel execution time: 0.499717 ms -SpMM template time: 1.01315 - -kernel execution time: 0.096371 ms -GeMM time: 0.631739 - -kernel execution time: 0.070191 ms -ref 2 GeMM template time: 0.560537 - ---------- 2nd pattern computation GEMM, SpMM - -kernel execution time: 0.070901 ms -ref3 GeMM template time: 0.579358 - -kernel execution time: 0.02984 ms -SpMM template time ref4: 0.851161 - --------- reference pattern computation - -kernel execution time: 0.194393 ms -taco reference time: 0.628889 - -kernel execution time: 0.242974 ms -taco reference new time: 0.667439 - -spmm-spmm execution - ------------------------------------------ -filenum: 1 ---------------------------------- -/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 5, B2_dimension: 5, vals: 19 -C1_dimension: 5, C2_dimension: 128, vals: 640 -D1_dimension: 128, D2_dimension: 
64, vals: 8192 - - -kernel execution time: 0.043801 ms -fused time: 0.685989 - ---------- 1st pattern computation TTM, GEMM - -kernel execution time: 0.01878 ms -SpMM time: 0.861191 - -kernel execution time: 0.503617 ms -SpMM template time: 1.00581 - -kernel execution time: 0.095292 ms -GeMM time: 0.583898 - -kernel execution time: 0.070121 ms -ref 2 GeMM template time: 0.520137 - ---------- 2nd pattern computation GEMM, SpMM - -kernel execution time: 0.070641 ms -ref3 GeMM template time: 0.537688 - -kernel execution time: 0.035491 ms -SpMM template time ref4: 0.514717 - --------- reference pattern computation - -kernel execution time: 0.194192 ms -taco reference time: 0.618658 - -kernel execution time: 0.239543 ms -taco reference new time: 0.655149 - -spmm-spmm execution - ------------------------------------------ -filenum: 1 ---------------------------------- -/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 5, B2_dimension: 5, vals: 19 -C1_dimension: 5, C2_dimension: 128, vals: 640 -D1_dimension: 128, D2_dimension: 64, vals: 8192 - - -kernel execution time: 0.04383 ms -fused time: 0.680319 - ---------- 1st pattern computation TTM, GEMM - -kernel execution time: 0.019891 ms -SpMM time: 0.72453 - -kernel execution time: 0.515237 ms -SpMM template time: 0.995294 - -kernel execution time: 0.095731 ms -GeMM time: 0.628018 - -kernel execution time: 0.071101 ms -ref 2 GeMM template time: 0.539967 - ---------- 2nd pattern computation GEMM, SpMM - -kernel execution time: 0.071171 ms -ref3 GeMM template time: 0.592848 - -kernel execution time: 0.029131 ms -SpMM template time ref4: 0.582288 - --------- reference pattern computation - -kernel execution time: 0.254484 ms -taco reference time: 0.768111 - -kernel execution time: 0.273853 ms -taco reference new time: 0.781751 - -spmm-spmm execution - ------------------------------------------ -filenum: 1 ---------------------------------- 
-/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 5, B2_dimension: 5, vals: 19 -C1_dimension: 5, C2_dimension: 128, vals: 640 -D1_dimension: 128, D2_dimension: 64, vals: 8192 - - -kernel execution time: 0.043111 ms -fused time: 0.676409 - ---------- 1st pattern computation TTM, GEMM - -kernel execution time: 0.01898 ms -SpMM time: 0.836491 - -kernel execution time: 0.489586 ms -SpMM template time: 0.969303 - -kernel execution time: 0.094641 ms -GeMM time: 0.561697 - -kernel execution time: 0.070251 ms -ref 2 GeMM template time: 0.545778 - ---------- 2nd pattern computation GEMM, SpMM - -kernel execution time: 0.07045 ms -ref3 GeMM template time: 0.550897 - -kernel execution time: 0.0282 ms -SpMM template time ref4: 0.463227 - --------- reference pattern computation - -kernel execution time: 0.245783 ms -taco reference time: 0.761711 - -kernel execution time: 0.304314 ms -taco reference new time: 0.834081 - -spmm-spmm execution - ------------------------------------------ -filenum: 1 ---------------------------------- -/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 5, B2_dimension: 5, vals: 19 -C1_dimension: 5, C2_dimension: 128, vals: 640 -D1_dimension: 128, D2_dimension: 64, vals: 8192 - - -kernel execution time: 0.03874 ms -fused time: 0.669969 - ---------- 1st pattern computation TTM, GEMM - -kernel execution time: 0.019931 ms -SpMM time: 0.857531 - -kernel execution time: 0.507936 ms -SpMM template time: 1.00321 - -kernel execution time: 0.093961 ms -GeMM time: 0.727229 - -kernel execution time: 0.070371 ms -ref 2 GeMM template time: 0.867451 - ---------- 2nd pattern computation GEMM, SpMM - -kernel execution time: 0.069541 ms -ref3 GeMM template time: 0.546687 - -kernel execution time: 0.02565 ms -SpMM template time ref4: 0.541707 - --------- reference pattern computation - -kernel execution time: 0.195092 ms -taco reference 
time: 0.615338 - -kernel execution time: 0.239653 ms -taco reference new time: 0.657449 - -spmm-spmm execution - ------------------------------------------ -filenum: 3 ---------------------------------- -/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 -C1_dimension: 36417, C2_dimension: 128, vals: 4661376 -D1_dimension: 128, D2_dimension: 64, vals: 8192 - - -kernel execution time: 202.946 ms -fused time: 203.369 - ---------- 1st pattern computation TTM, GEMM - -kernel execution time: 604.532 ms -SpMM time: 605.081 - -kernel execution time: 137.88 ms -SpMM template time: 138.397 - -kernel execution time: 281.01 ms -GeMM time: 281.522 - -kernel execution time: 267.152 ms -ref 2 GeMM template time: 267.64 - ---------- 2nd pattern computation GEMM, SpMM - -kernel execution time: 202.612 ms -ref3 GeMM template time: 203.13 - -kernel execution time: 72.1263 ms -SpMM template time ref4: 72.634 - --------- reference pattern computation - -kernel execution time: 26464.3 ms -taco reference time: 26465.4 - -kernel execution time: 34639.1 ms -taco reference new time: 34640.2 - -spmm-spmm execution - ------------------------------------------ -filenum: 3 ---------------------------------- -/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 -C1_dimension: 36417, C2_dimension: 128, vals: 4661376 -D1_dimension: 128, D2_dimension: 64, vals: 8192 - - ---------- 1st pattern computation TTM, GEMM - -kernel execution time: 601.783 ms -SpMM time: 602.149 - -kernel execution time: 135.443 ms -SpMM template time: 135.968 - -kernel execution time: 277.027 ms -GeMM time: 277.575 - -kernel execution time: 262.418 ms -ref 2 GeMM template time: 262.884 - ---------- 2nd pattern computation GEMM, SpMM - -kernel execution time: 200.17 ms -ref3 GeMM template time: 200.726 - 
-kernel execution time: 71.523 ms -SpMM template time ref4: 72.0077 - --------- reference pattern computation - -kernel execution time: 26468.2 ms -taco reference time: 26469.2 - -spmm-spmm execution - ------------------------------------------ -filenum: 3 ---------------------------------- -/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 36417, B2_dimension: 36417, vals: 4344765 -C1_dimension: 36417, C2_dimension: 128, vals: 4661376 -D1_dimension: 128, D2_dimension: 64, vals: 8192 - - ---------- 1st pattern computation TTM, GEMM - -kernel execution time: 600.837 ms -SpMM time: 601.215 - -kernel execution time: 137.481 ms -SpMM template time: 138.009 - -kernel execution time: 280.631 ms -GeMM time: 281.208 - -kernel execution time: 266.073 ms -ref 2 GeMM template time: 266.549 - ---------- 2nd pattern computation GEMM, SpMM - -kernel execution time: 200.674 ms -ref3 GeMM template time: 201.238 - -kernel execution time: 72.8548 ms -SpMM template time ref4: 73.3562 - --------- reference pattern computation - -kernel execution time: 26717.7 ms -taco reference time: 26718.7 - -kernel execution time: 34613.6 ms -taco reference new time: 34614.6 - -kernel execution time: 202.425 ms -fused time: 203.027 - -spmm-spmm execution - ------------------------------------------ -filenum: 3 ---------------------------------- - -spmm-spmm execution - ------------------------------------------ -filenum: 0 ---------------------------------- -/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 2708, B2_dimension: 2708, vals: 5429 -C1_dimension: 2708, C2_dimension: 128, vals: 346624 -D1_dimension: 128, D2_dimension: 64, vals: 8192 - - ---------- 1st pattern computation TTM, GEMM - -kernel execution time: 0.924512 ms -SpMM time: 1.22967 - -kernel execution time: 1.23287 ms -SpMM template time: 1.51353 - -kernel execution time: 20.7805 ms -GeMM time: 
21.0769 - -kernel execution time: 19.6116 ms -ref 2 GeMM template time: 19.8379 - ---------- 2nd pattern computation GEMM, SpMM - -kernel execution time: 14.7563 ms -ref3 GeMM template time: 15.0245 - -kernel execution time: 0.823641 ms -SpMM template time ref4: 1.05233 - --------- reference pattern computation - -kernel execution time: 34.1041 ms -taco reference time: 34.4607 - -kernel execution time: 41.9195 ms -taco reference new time: 42.2061 - -kernel execution time: 4.76242 ms -fused time: 5.04101 -filenum: 1 ---------------------------------- -/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/amazon.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 548551, B2_dimension: 548551, vals: 1851744 -C1_dimension: 548551, C2_dimension: 128, vals: 70214528 -D1_dimension: 128, D2_dimension: 64, vals: 8192 - - ---------- 1st pattern computation TTM, GEMM - -kernel execution time: 394.8 ms -SpMM time: 395.503 - -kernel execution time: 473.148 ms -SpMM template time: 473.684 - -kernel execution time: 4117.68 ms -GeMM time: 4118.6 - -kernel execution time: 3957.31 ms -ref 2 GeMM template time: 3958.16 - ---------- 2nd pattern computation GEMM, SpMM - -kernel execution time: 3017.13 ms -ref3 GeMM template time: 3017.67 - -kernel execution time: 314.652 ms -SpMM template time ref4: 315.164 - --------- reference pattern computation - -kernel execution time: 11644.6 ms -taco reference time: 11645.6 - -kernel execution time: 14402.6 ms -taco reference new time: 14403.6 - -kernel execution time: 1261.33 ms -fused time: 1261.88 - -spmm-spmm execution - ------------------------------------------ -filenum: 0 ---------------------------------- -/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/cora.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 2708, B2_dimension: 2708, vals: 5429 -C1_dimension: 2708, C2_dimension: 128, vals: 346624 -D1_dimension: 128, D2_dimension: 64, vals: 8192 - - ---------- 1st pattern computation TTM, GEMM - -kernel execution time: 
0.209133 ms -SpMM time: 0.517016 - -kernel execution time: 0.579748 ms -SpMM template time: 0.864251 - -kernel execution time: 1.0574 ms -GeMM time: 1.37727 - -kernel execution time: 19.621 ms -ref 2 GeMM template time: 19.8504 - ---------- 2nd pattern computation GEMM, SpMM - -kernel execution time: 1.44618 ms -ref3 GeMM template time: 1.72243 - -kernel execution time: 0.384425 ms -SpMM template time ref4: 0.610708 - --------- reference pattern computation - -kernel execution time: 3.59893 ms -taco reference time: 3.95508 - -kernel execution time: 4.81855 ms -taco reference new time: 5.10349 - -kernel execution time: 1.47107 ms -fused time: 1.90463 -filenum: 1 ---------------------------------- -/home/min/a/kadhitha/workspace/my_taco/taco/net-repo-graph/amazon.mtx -ref(i,l)=B(i,j)*C(i,k)*D(j,k); -B1_dimension: 548551, B2_dimension: 548551, vals: 1851744 -C1_dimension: 548551, C2_dimension: 128, vals: 70214528 -D1_dimension: 128, D2_dimension: 64, vals: 8192 - - ---------- 1st pattern computation TTM, GEMM - -kernel execution time: 50.1795 ms -SpMM time: 50.5567 - -kernel execution time: 64.2504 ms -SpMM template time: 64.8179 - -kernel execution time: 96.8464 ms -GeMM time: 97.4123 - -kernel execution time: 3949.87 ms -ref 2 GeMM template time: 3950.93 - ---------- 2nd pattern computation GEMM, SpMM - -kernel execution time: 123.802 ms -ref3 GeMM template time: 124.342 - -kernel execution time: 39.2723 ms -SpMM template time ref4: 39.8322 - --------- reference pattern computation - -kernel execution time: 457.271 ms -taco reference time: 457.979 - -kernel execution time: 427.194 ms -taco reference new time: 427.789 - -kernel execution time: 93.1417 ms -fused time: 93.7188 diff --git a/test/stats/spmv-spmv.txt b/test/stats/spmv-spmv.txt deleted file mode 100644 index 90b7482e7..000000000 --- a/test/stats/spmv-spmv.txt +++ /dev/null @@ -1,81 +0,0 @@ - -spmv-spmv execution - ------------------------------------------ -A(i) = B(i,j) * C(j,k) * v(k); -B1_dimension: 5, 
B2_dimension: 5, vals: 19 -C1_dimension: 5, C2_dimension: 5, vals: 19 -D1_dimension: 5, vals: 5 - - -spmv-spmv execution - ------------------------------------------ -A(i) = B(i,j) * C(j,k) * v(k); -B1_dimension: 5, B2_dimension: 5, vals: 19 -C1_dimension: 5, C2_dimension: 5, vals: 19 -D1_dimension: 5, vals: 5 - - -spmv-spmv execution - ------------------------------------------ -A(i) = B(i,j) * C(j,k) * v(k); -B1_dimension: 5, B2_dimension: 5, vals: 19 -C1_dimension: 5, C2_dimension: 5, vals: 19 -D1_dimension: 5, vals: 5 - - -spmv-spmv execution - ------------------------------------------ -A(i) = B(i,j) * C(j,k) * v(k); -B1_dimension: 5, B2_dimension: 5, vals: 19 -C1_dimension: 5, C2_dimension: 5, vals: 19 -D1_dimension: 5, vals: 5 - - -spmv-spmv execution - ------------------------------------------ -A(i) = B(i,j) * C(j,k) * v(k); -B1_dimension: 5, B2_dimension: 5, vals: 19 -C1_dimension: 5, C2_dimension: 5, vals: 19 -D1_dimension: 5, vals: 5 - - -spmv-spmv execution - ------------------------------------------ -A(i) = B(i,j) * C(j,k) * v(k); -B1_dimension: 5, B2_dimension: 5, vals: 19 -C1_dimension: 5, C2_dimension: 5, vals: 19 -D1_dimension: 5, vals: 5 - - -spmv-spmv execution - ------------------------------------------ -A(i) = B(i,j) * C(j,k) * v(k); -B1_dimension: 5, B2_dimension: 5, vals: 19 -C1_dimension: 5, C2_dimension: 5, vals: 19 -D1_dimension: 5, vals: 5 - - -spmv-spmv execution - ------------------------------------------ -A(i) = B(i,j) * C(j,k) * v(k); -B1_dimension: 5, B2_dimension: 5, vals: 19 -C1_dimension: 5, C2_dimension: 5, vals: 19 -D1_dimension: 5, vals: 5 - - -spmv-spmv execution - ------------------------------------------ -A(i) = B(i,j) * C(j,k) * v(k); -B1_dimension: 5, B2_dimension: 5, vals: 19 -C1_dimension: 5, C2_dimension: 5, vals: 19 -D1_dimension: 5, vals: 5 - diff --git a/test/stats/ttm-ttm.txt b/test/stats/ttm-ttm.txt deleted file mode 100644 index 7080af67b..000000000 --- a/test/stats/ttm-ttm.txt +++ /dev/null @@ -1,2924 +0,0 
@@ -ttm-ttm execution - ------------------------------------------ -A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) -/home/min/a/kadhitha/ispc-examples/data/tns/delicious-3d.tns -B1_dimension: 532924, B2_dimension: 17262471, B3_dimension: 532924, vals: 140126181 -C1_dimension: 2480308, C2_dimension: 16, vals: 39684928 -D1_dimension: 16, D2_dimension: 16, vals: 256 - - -kernel execution time: 6299.03 ms -fused time: 6300.12 - -kernel execution time: 21080.2 ms -reference time: 21081.3 - -kernel execution time: 2757.48 ms -reference time: 2757.94 - -kernel execution time: 5064.84 ms -reference time: 5065.87 - -ttm-ttm execution - ------------------------------------------ -A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) -/home/min/a/kadhitha/ispc-examples/data/tns/flickr-3d.tns -B1_dimension: 319686, B2_dimension: 28153045, B3_dimension: 319686, vals: 112890310 -C1_dimension: 1607191, C2_dimension: 16, vals: 25715056 -D1_dimension: 16, D2_dimension: 16, vals: 256 - - -kernel execution time: 3709.97 ms -fused time: 3711.05 - -kernel execution time: 16159.4 ms -reference time: 16160.5 - -kernel execution time: 1773.12 ms -reference time: 1773.58 - -kernel execution time: 3030.89 ms -reference time: 3031.42 - -ttm-ttm execution - ------------------------------------------- -A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) -/home/min/a/kadhitha/ispc-examples/data/tns/nell-2.tns -B1_dimension: 12092, B2_dimension: 9184, B3_dimension: 12092, vals: 76879419 -C1_dimension: 28818, C2_dimension: 16, vals: 461088 -D1_dimension: 16, D2_dimension: 16, vals: 256 - - -kernel execution time: 487.016 ms -fused time: 487.513 - -kernel execution time: 11041.9 ms -reference time: 11043 - -kernel execution time: 1009.63 ms -reference time: 1010.12 - -kernel execution time: 37.1546 ms -reference time: 37.757 - - -ttm-ttm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/tns/nell-1.tns -A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) -B1_dimension: 2902330, B2_dimension: 2143368, 
B3_dimension: 2902330, vals: 143599552 -C1_dimension: 25495389, C2_dimension: 16, vals: 407926224 -D1_dimension: 16, D2_dimension: 16, vals: 256 - - -kernel execution time: 11984.9 ms -fused time: 11985.9 - -kernel execution time: 34959 ms -reference time: 34960.1 - -kernel execution time: 8476.95 ms -reference time: 8477.9 - -kernel execution time: 1869.85 ms -reference time: 1870.39 - -ttm-ttm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/tns/vast-2015-mc1-3d.tns -A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) -B1_dimension: 165427, B2_dimension: 11374, B3_dimension: 165427, vals: 26021854 -C1_dimension: 2, C2_dimension: 16, vals: 32 -D1_dimension: 16, D2_dimension: 16, vals: 256 - - -kernel execution time: 2730.05 ms -fused time: 2731.15 - -kernel execution time: 4167.74 ms -reference time: 4168.86 - -kernel execution time: 550.937 ms -reference time: 551.395 - -kernel execution time: 2788.55 ms -reference time: 2789.07 - -ttm-ttm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/tns/1998DARPA.tns -A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) -B1_dimension: 22476, B2_dimension: 22476, B3_dimension: 22476, vals: 28421307 -C1_dimension: 23776223, C2_dimension: 16, vals: 380419568 -D1_dimension: 16, D2_dimension: 16, vals: 256 - - -ttm-ttm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/tns/freebase_music.tns -A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) -B1_dimension: 23343790, B2_dimension: 23344784, B3_dimension: 23343790, vals: 99546550 -C1_dimension: 166, C2_dimension: 16, vals: 2656 -D1_dimension: 16, D2_dimension: 16, vals: 256 - - -kernel execution time: 10491.6 ms -fused time: 10492.7 - -kernel execution time: 15968 ms -reference time: 15969.1 - -kernel execution time: 1886.09 ms -reference time: 1886.55 - -kernel execution time: 10763.7 ms -reference time: 10765 - -ttm-ttm execution - ------------------------------------------ 
-/home/min/a/kadhitha/ispc-examples/data/tns/darpa1998.tns -A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) -B1_dimension: 22476, B2_dimension: 22476, B3_dimension: 22476, vals: 28421307 -C1_dimension: 23776223, C2_dimension: 16, vals: 380419568 -D1_dimension: 16, D2_dimension: 16, vals: 256 - - -kernel execution time: 847.087 ms -fused time: 847.588 - -kernel execution time: 7136.54 ms -reference time: 7137.57 - -kernel execution time: 1340.45 ms -reference time: 1340.91 - -kernel execution time: 8.28247 ms -reference time: 8.80899 - -ttm-ttm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/tns/freebase_sampled.tns -A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) -B1_dimension: 38954435, B2_dimension: 38955429, B3_dimension: 38954435, vals: 139920770 -C1_dimension: 532, C2_dimension: 16, vals: 8512 -D1_dimension: 16, D2_dimension: 16, vals: 256 - - -ttm-ttm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/tns/freebase_music.tns -A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) -B1_dimension: 23343790, B2_dimension: 23344784, B3_dimension: 23343790, vals: 99546550 -C1_dimension: 166, C2_dimension: 16, vals: 2656 -D1_dimension: 16, D2_dimension: 16, vals: 256 - - -kernel execution time: 10540.6 ms -fused time: 10541.6 - -kernel execution time: 16072 ms -reference time: 16073 - -kernel execution time: 1900.39 ms -reference time: 1900.89 - -kernel execution time: 10819.5 ms -reference time: 10820.5 - -ttm-ttm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/tns/freebase_sampled.tns -A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) -B1_dimension: 38954435, B2_dimension: 38955429, B3_dimension: 38954435, vals: 139920770 -C1_dimension: 532, C2_dimension: 16, vals: 8512 -D1_dimension: 16, D2_dimension: 16, vals: 256 - - -ttm-ttm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/tns/flickr-3d.tns -A(i,j,m) = B(i,j,k) * C(k,l) * 
D(l,m) -B1_dimension: 319686, B2_dimension: 28153045, B3_dimension: 319686, vals: 112890310 -C1_dimension: 1607191, C2_dimension: 16, vals: 25715056 -D1_dimension: 16, D2_dimension: 16, vals: 256 - - -kernel execution time: 3689.85 ms -fused time: 3690.99 - -kernel execution time: 16162.6 ms -reference time: 16163.7 - -kernel execution time: 2035.42 ms -TTM1: 2035.96 - -kernel execution time: 3004.2 ms -TTM2: 3004.74 - -kernel execution time: 147.233 ms -dense: 147.648 - -kernel execution time: 2240.45 ms -TTM after dense: 2240.96 - -ttm-ttm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/tns/flickr-3d.tns -A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) -B1_dimension: 319686, B2_dimension: 28153045, B3_dimension: 319686, vals: 112890310 -C1_dimension: 1607191, C2_dimension: 16, vals: 25715056 -D1_dimension: 16, D2_dimension: 1024, vals: 16384 - - -ttm-ttm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/tns/nell-2.tns -A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) -B1_dimension: 12092, B2_dimension: 9184, B3_dimension: 12092, vals: 76879419 -C1_dimension: 28818, C2_dimension: 16, vals: 461088 -D1_dimension: 16, D2_dimension: 32, vals: 512 - - -kernel execution time: 542.361 ms -fused time: 542.813 - -kernel execution time: 22547.6 ms -reference time: 22548.6 - -kernel execution time: 1008.25 ms -TTM1: 1008.82 - -kernel execution time: 70.7434 ms -TTM2: 71.2926 - -kernel execution time: 5.2174 ms -dense: 5.58699 - -kernel execution time: 2086.85 ms -TTM after dense: 2087.25 - -ttm-ttm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/tns/nell-2.tns -A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) -B1_dimension: 12092, B2_dimension: 9184, B3_dimension: 12092, vals: 76879419 -C1_dimension: 28818, C2_dimension: 16, vals: 461088 -D1_dimension: 16, D2_dimension: 16, vals: 256 - - -kernel execution time: 531.924 ms -fused time: 532.696 - -kernel execution 
time: 11314 ms -reference time: 11315.1 - -kernel execution time: 1009.54 ms -TTM1: 1010.08 - -kernel execution time: 37.5466 ms -TTM2: 38.0867 - -kernel execution time: 2.77519 ms -dense: 3.13589 - -kernel execution time: 1014.37 ms -TTM after dense: 1014.74 - -ttm-ttm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/tns/nell-2.tns -A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) -B1_dimension: 12092, B2_dimension: 9184, B3_dimension: 12092, vals: 76879419 -C1_dimension: 28818, C2_dimension: 16, vals: 461088 -D1_dimension: 16, D2_dimension: 64, vals: 1024 - - -kernel execution time: 604.787 ms -fused time: 605.25 - -kernel execution time: 45011.1 ms -reference time: 45012.2 - -kernel execution time: 1008.41 ms -TTM1: 1008.97 - -kernel execution time: 137.791 ms -TTM2: 138.316 - -kernel execution time: 10.0591 ms -dense: 10.4452 - -kernel execution time: 5120.5 ms -TTM after dense: 5121.57 - -ttm-ttm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/tns/matmul_5-5-5.tns -A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) -B1_dimension: 25, B2_dimension: 25, B3_dimension: 25, vals: 125 -C1_dimension: 25, C2_dimension: 32, vals: 800 -D1_dimension: 32, D2_dimension: 64, vals: 2048 - - -kernel execution time: 0.129572 ms -fused time: 0.560598 - -kernel execution time: 0.151942 ms -reference time: 0.999013 - -kernel execution time: 0.01803 ms -TTM1: 0.310364 - -kernel execution time: 0.119052 ms -TTM2: 0.897713 - -kernel execution time: 0.093421 ms -dense: 0.284444 - -kernel execution time: 0.032111 ms -TTM after dense: 0.662509 - -ttm-ttm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/tns/matmul_5-5-5.tns -A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) -B1_dimension: 25, B2_dimension: 25, B3_dimension: 25, vals: 125 -C1_dimension: 25, C2_dimension: 32, vals: 800 -D1_dimension: 32, D2_dimension: 64, vals: 2048 - - -kernel execution time: 0.136562 ms -fused time: 
0.555088 - -kernel execution time: 0.155282 ms -reference time: 1.02811 - -kernel execution time: 0.01913 ms -TTM1: 0.293014 - -kernel execution time: 0.148032 ms -TTM2: 1.08159 - -kernel execution time: 0.093351 ms -dense: 0.282434 - -kernel execution time: 0.03336 ms -TTM after dense: 0.309775 - -ttm-ttm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/tns/matmul_5-5-5.tns -A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) -B1_dimension: 25, B2_dimension: 25, B3_dimension: 25, vals: 125 -C1_dimension: 25, C2_dimension: 32, vals: 800 -D1_dimension: 32, D2_dimension: 64, vals: 2048 - - -kernel execution time: 0.133302 ms -fused time: 0.590248 - -kernel execution time: 0.154633 ms -reference time: 0.976683 - -kernel execution time: 0.032061 ms -TTM1: 0.554668 - -kernel execution time: 0.231943 ms -TTM2: 0.790901 - -kernel execution time: 0.093152 ms -dense: 0.456727 - -kernel execution time: 0.168413 ms -TTM after dense: 0.866702 - -ttm-ttm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/tns/matmul_5-5-5.tns -A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) -B1_dimension: 25, B2_dimension: 25, B3_dimension: 25, vals: 125 -C1_dimension: 25, C2_dimension: 32, vals: 800 -D1_dimension: 32, D2_dimension: 64, vals: 2048 - - -kernel execution time: 0.211383 ms -fused time: 0.979204 - -kernel execution time: 0.300854 ms -reference time: 0.976764 - -kernel execution time: 0.03182 ms -TTM1: 0.986423 - -kernel execution time: 0.223513 ms -TTM2: 1.25582 - -kernel execution time: 0.140142 ms -dense: 0.491247 - -kernel execution time: 0.057651 ms -TTM after dense: 0.632639 - -ttm-ttm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/tns/matmul_5-5-5.tns -A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) -B1_dimension: 25, B2_dimension: 25, B3_dimension: 25, vals: 125 -C1_dimension: 25, C2_dimension: 32, vals: 800 -D1_dimension: 32, D2_dimension: 64, vals: 2048 - - -kernel 
execution time: 0.226813 ms -fused time: 0.981434 - -kernel execution time: 0.299435 ms -reference time: 0.980784 - -kernel execution time: 0.03171 ms -TTM1: 1.17345 - -kernel execution time: 0.236723 ms -TTM2: 1.08452 - -kernel execution time: 0.099581 ms -dense: 0.448246 - -kernel execution time: 0.055691 ms -TTM after dense: 0.595948 - -ttm-ttm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/tns/matmul_5-5-5.tns -A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) -B1_dimension: 25, B2_dimension: 25, B3_dimension: 25, vals: 125 -C1_dimension: 25, C2_dimension: 32, vals: 800 -D1_dimension: 32, D2_dimension: 64, vals: 2048 - - -kernel execution time: 0.183452 ms -fused time: 0.934223 - -kernel execution time: 0.258304 ms -reference time: 1.14423 - -kernel execution time: 0.028031 ms -TTM1: 0.530247 - -kernel execution time: 0.192393 ms -TTM2: 0.865752 - -kernel execution time: 0.104401 ms -dense: 0.458676 - -kernel execution time: 0.058181 ms -TTM after dense: 0.641949 - -ttm-ttm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/tns/matmul_5-5-5.tns -A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) -B1_dimension: 25, B2_dimension: 25, B3_dimension: 25, vals: 125 -C1_dimension: 25, C2_dimension: 32, vals: 800 -D1_dimension: 32, D2_dimension: 64, vals: 2048 - - -kernel execution time: 0.212263 ms -fused time: 1.00447 - -kernel execution time: 0.293174 ms -reference time: 1.00466 - -kernel execution time: 0.03429 ms -TTM1: 1.06194 - -kernel execution time: 0.227643 ms -TTM2: 0.77555 - -kernel execution time: 0.093021 ms -dense: 0.615169 - -kernel execution time: 0.111302 ms -TTM after dense: 1.19147 - -ttm-ttm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/tns/matmul_5-5-5.tns -A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) -B1_dimension: 25, B2_dimension: 25, B3_dimension: 25, vals: 125 -C1_dimension: 25, C2_dimension: 32, vals: 800 -D1_dimension: 32, 
D2_dimension: 64, vals: 2048 - - -kernel execution time: 0.126042 ms -fused time: 0.542138 - -kernel execution time: 0.170263 ms -reference time: 0.974603 - -kernel execution time: 0.01972 ms -TTM1: 0.286434 - -kernel execution time: 0.125282 ms -TTM2: 0.402736 - -kernel execution time: 0.103582 ms -dense: 0.7661 - -kernel execution time: 0.04149 ms -TTM after dense: 0.320775 - -ttm-ttm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/tns/matmul_5-5-5.tns -A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) -B1_dimension: 25, B2_dimension: 25, B3_dimension: 25, vals: 125 -C1_dimension: 25, C2_dimension: 32, vals: 800 -D1_dimension: 32, D2_dimension: 64, vals: 2048 - - -kernel execution time: 0.193463 ms -fused time: 0.831391 - -kernel execution time: 0.347254 ms -reference time: 1.12168 - -kernel execution time: 0.03811 ms -TTM1: 1.19729 - -kernel execution time: 0.334915 ms -TTM2: 1.14708 - -kernel execution time: 0.109681 ms -dense: 0.526707 - -kernel execution time: 0.140412 ms -TTM after dense: 0.76001 - -ttm-ttm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/tns/matmul_5-5-5.tns -A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) -B1_dimension: 25, B2_dimension: 25, B3_dimension: 25, vals: 125 -C1_dimension: 25, C2_dimension: 32, vals: 800 -D1_dimension: 32, D2_dimension: 64, vals: 2048 - - -kernel execution time: 0.147722 ms -fused time: 0.7865 - -kernel execution time: 0.237434 ms -reference time: 1.01788 - -kernel execution time: 0.020341 ms -TTM1: 0.330005 - -kernel execution time: 0.201823 ms -TTM2: 1.01705 - -kernel execution time: 0.069931 ms -dense: 0.261943 - -kernel execution time: 0.032231 ms -TTM after dense: 0.314845 - -ttm-ttm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/tns/matmul_5-5-5.tns -A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) -B1_dimension: 25, B2_dimension: 25, B3_dimension: 25, vals: 125 -C1_dimension: 25, C2_dimension: 
32, vals: 800 -D1_dimension: 32, D2_dimension: 64, vals: 2048 - - -kernel execution time: 0.210293 ms -fused time: 0.999243 - -kernel execution time: 0.577188 ms -reference time: 1.23453 - -kernel execution time: 0.032071 ms -TTM1: 0.965223 - -kernel execution time: 0.227183 ms -TTM2: 1.25077 - -kernel execution time: 0.091622 ms -dense: 0.449416 - -kernel execution time: 0.04494 ms -TTM after dense: 0.73161 - -ttm-ttm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/tns/matmul_5-5-5.tns -A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) -B1_dimension: 25, B2_dimension: 25, B3_dimension: 25, vals: 125 -C1_dimension: 25, C2_dimension: 32, vals: 800 -D1_dimension: 32, D2_dimension: 64, vals: 2048 - - -kernel execution time: 0.109392 ms -fused time: 0.481746 - -kernel execution time: 0.242474 ms -reference time: 0.72963 - -kernel execution time: 0.01624 ms -TTM1: 0.257934 - -kernel execution time: 0.089982 ms -TTM2: 0.341365 - -kernel execution time: 0.106392 ms -dense: 0.74066 - -kernel execution time: 0.027241 ms -TTM after dense: 0.277864 - -ttm-ttm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/tns/matmul_5-5-5.tns -A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) -B1_dimension: 25, B2_dimension: 25, B3_dimension: 25, vals: 125 -C1_dimension: 25, C2_dimension: 32, vals: 800 -D1_dimension: 32, D2_dimension: 64, vals: 2048 - - -ttm-ttm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/tns/matmul_5-5-5.tns -A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) -B1_dimension: 25, B2_dimension: 25, B3_dimension: 25, vals: 125 -C1_dimension: 25, C2_dimension: 32, vals: 800 -D1_dimension: 32, D2_dimension: 64, vals: 2048 - - -kernel execution time: 0.938612 ms -fused time: 1.66032 - -kernel execution time: 0.598878 ms -reference time: 1.2444 - -kernel execution time: 0.027881 ms -TTM1: 0.664309 - -kernel execution time: 0.172162 ms -TTM2: 1.0861 - -kernel execution 
time: 0.087052 ms -dense: 0.420256 - -kernel execution time: 0.044921 ms -TTM after dense: 0.669959 - -ttm-ttm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/tns/matmul_5-5-5.tns -A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) -B1_dimension: 25, B2_dimension: 25, B3_dimension: 25, vals: 125 -C1_dimension: 25, C2_dimension: 32, vals: 800 -D1_dimension: 32, D2_dimension: 64, vals: 2048 - - -kernel execution time: 0.723749 ms -fused time: 1.52668 - -kernel execution time: 1.33287 ms -reference time: 2.02148 - -kernel execution time: 0.03285 ms -TTM1: 1.06994 - -kernel execution time: 0.227263 ms -TTM2: 1.00641 - -kernel execution time: 0.121451 ms -dense: 0.410656 - -kernel execution time: 0.046891 ms -TTM after dense: 0.612258 - -ttm-ttm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/tns/matmul_5-5-5.tns -A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) -B1_dimension: 25, B2_dimension: 25, B3_dimension: 25, vals: 125 -C1_dimension: 25, C2_dimension: 32, vals: 800 -D1_dimension: 32, D2_dimension: 64, vals: 2048 - - -kernel execution time: 0.654879 ms -fused time: 1.0716 - -kernel execution time: 1.24327 ms -reference time: 1.59976 - -kernel execution time: 0.691129 ms -TTM1: 1.0059 - -kernel execution time: 0.859771 ms -TTM2: 1.1516 - -kernel execution time: 0.136762 ms -dense: 0.334665 - -kernel execution time: 0.524517 ms -TTM after dense: 0.806231 - -ttm-ttm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/tns/flickr-3d.tns -A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) -B1_dimension: 319686, B2_dimension: 28153045, B3_dimension: 319686, vals: 112890310 -C1_dimension: 1607191, C2_dimension: 32, vals: 51430112 -D1_dimension: 32, D2_dimension: 64, vals: 2048 - - -kernel execution time: 891.501 ms -fused time: 892.508 - -kernel execution time: 6378.22 ms -reference time: 6379.42 - -kernel execution time: 265.033 ms -TTM1: 265.676 - -kernel execution 
time: 514.397 ms -TTM2: 515.1 - -kernel execution time: 70.5991 ms -dense: 71.0624 - -kernel execution time: 541.878 ms -TTM after dense: 542.548 - -ttm-ttm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/tns/vast-2015-mc1-3d.tns -A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) -B1_dimension: 165427, B2_dimension: 11374, B3_dimension: 165427, vals: 26021854 -C1_dimension: 2, C2_dimension: 32, vals: 64 -D1_dimension: 32, D2_dimension: 64, vals: 2048 - - -kernel execution time: 753.49 ms -fused time: 754.615 - -kernel execution time: 1394.55 ms -reference time: 1395.28 - -kernel execution time: 197.246 ms -TTM1: 197.894 - -kernel execution time: 503.301 ms -TTM2: 503.886 - -kernel execution time: 0.0622 ms -dense: 1.00584 - -kernel execution time: 380.931 ms -TTM after dense: 381.331 - -ttm-ttm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/tns/flickr-3d.tns -A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) -B1_dimension: 319686, B2_dimension: 28153045, B3_dimension: 319686, vals: 112890310 -C1_dimension: 1607191, C2_dimension: 32, vals: 51430112 -D1_dimension: 32, D2_dimension: 64, vals: 2048 - - -kernel execution time: 894.532 ms -fused time: 895.512 - -kernel execution time: 6345.62 ms -reference time: 6346.77 - -kernel execution time: 266.55 ms -TTM1: 267.22 - -kernel execution time: 515.257 ms -TTM2: 515.893 - -kernel execution time: 70.7658 ms -dense: 71.2374 - -kernel execution time: 542.175 ms -TTM after dense: 542.864 - -ttm-ttm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/tns/nell-2.tns -A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) -B1_dimension: 12092, B2_dimension: 9184, B3_dimension: 12092, vals: 76879419 -C1_dimension: 28818, C2_dimension: 32, vals: 922176 -D1_dimension: 32, D2_dimension: 64, vals: 2048 - - -kernel execution time: 49.8694 ms -fused time: 50.6512 - -ttm-ttm execution - ------------------------------------------ 
-/home/min/a/kadhitha/ispc-examples/data/tns/nell-1.tns -A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) -B1_dimension: 2902330, B2_dimension: 2143368, B3_dimension: 2902330, vals: 143599552 -C1_dimension: 25495389, C2_dimension: 32, vals: 815852448 -D1_dimension: 32, D2_dimension: 64, vals: 2048 - - -kernel execution time: 1309.77 ms -fused time: 1310.84 - -kernel execution time: 8179.4 ms -reference time: 8180.68 - -kernel execution time: 805.812 ms -TTM1: 806.562 - -kernel execution time: 314.204 ms -TTM2: 314.751 - -kernel execution time: 1134.47 ms -dense: 1134.93 - -kernel execution time: 1621.3 ms -TTM after dense: 1621.92 - -ttm-ttm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/tns/vast-2015-mc1-3d.tns -A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) -B1_dimension: 165427, B2_dimension: 11374, B3_dimension: 165427, vals: 26021854 -C1_dimension: 2, C2_dimension: 32, vals: 64 -D1_dimension: 32, D2_dimension: 64, vals: 2048 - - -kernel execution time: 749.757 ms -fused time: 750.843 - -kernel execution time: 1391.56 ms -reference time: 1392.35 - -kernel execution time: 196.711 ms -TTM1: 197.347 - -kernel execution time: 502.61 ms -TTM2: 503.193 - -kernel execution time: 0.063271 ms -dense: 0.948892 - -kernel execution time: 381.132 ms -TTM after dense: 381.508 - -ttm-ttm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/tns/darpa1998.tns -A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) -B1_dimension: 22476, B2_dimension: 22476, B3_dimension: 22476, vals: 28421307 -C1_dimension: 23776223, C2_dimension: 32, vals: 760839136 -D1_dimension: 32, D2_dimension: 64, vals: 2048 - - -kernel execution time: 230.973 ms -fused time: 231.921 - -ttm-ttm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/tns/matmul_5-5-5.tns -A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) -B1_dimension: 25, B2_dimension: 25, B3_dimension: 25, vals: 125 -C1_dimension: 25, C2_dimension: 32, 
vals: 800 -D1_dimension: 32, D2_dimension: 64, vals: 2048 - - -kernel execution time: 0.72187 ms -fused time: 1.46707 - -kernel execution time: 0.842291 ms -reference time: 1.52295 - -kernel execution time: 0.490417 ms -TTM1: 1.08223 - -kernel execution time: 0.653919 ms -TTM2: 1.17803 - -kernel execution time: 0.115332 ms -dense: 0.889372 - -kernel execution time: 0.446076 ms -TTM after dense: 1.05921 - -ttm-ttm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/tns/matmul_5-5-5.tns -A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) -B1_dimension: 25, B2_dimension: 25, B3_dimension: 25, vals: 125 -C1_dimension: 25, C2_dimension: 16, vals: 400 -D1_dimension: 16, D2_dimension: 32, vals: 512 - - -kernel execution time: 1.29819 ms -fused time: 2.11481 - -kernel execution time: 0.560877 ms -reference time: 1.26788 - -kernel execution time: 0.506967 ms -TTM1: 1.14189 - -kernel execution time: 0.547697 ms -TTM2: 1.24278 - -kernel execution time: 0.075421 ms -dense: 0.508546 - -kernel execution time: 0.464356 ms -TTM after dense: 1.09434 - -ttm-ttm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/tns/darpa1998.tns -A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) -B1_dimension: 22476, B2_dimension: 22476, B3_dimension: 22476, vals: 28421307 -C1_dimension: 23776223, C2_dimension: 16, vals: 380419568 -D1_dimension: 16, D2_dimension: 32, vals: 512 - - -kernel execution time: 126.199 ms -fused time: 126.724 - -ttm-ttm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/tns/darpa1998.tns -A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) -B1_dimension: 22476, B2_dimension: 22476, B3_dimension: 22476, vals: 28421307 -C1_dimension: 23776223, C2_dimension: 16, vals: 380419568 -D1_dimension: 16, D2_dimension: 32, vals: 512 - - -kernel execution time: 132.543 ms -fused time: 133.165 - -kernel execution time: 2405.44 ms -reference time: 2406.19 - -kernel execution time: 331.61 ms 
-TTM1: 332.199 - -kernel execution time: 2.26417 ms -TTM2: 3.02615 - -kernel execution time: 400.791 ms -dense: 401.064 - -kernel execution time: 620.74 ms -TTM after dense: 621.389 - -ttm-ttm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/tns/flickr-3d.tns -A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) -B1_dimension: 319686, B2_dimension: 28153045, B3_dimension: 319686, vals: 112890310 -C1_dimension: 1607191, C2_dimension: 16, vals: 25715056 -D1_dimension: 16, D2_dimension: 32, vals: 512 - - -kernel execution time: 455.645 ms -fused time: 456.696 - -kernel execution time: 718.699 ms -reference time: 719.384 - -kernel execution time: 142.557 ms -TTM1: 143.105 - -kernel execution time: 256.179 ms -TTM2: 256.785 - -kernel execution time: 29.5586 ms -dense: 30.0451 - -kernel execution time: 269.529 ms -TTM after dense: 270.186 - -ttm-ttm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/tns/flickr-3d.tns -A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) -B1_dimension: 319686, B2_dimension: 28153045, B3_dimension: 319686, vals: 112890310 -C1_dimension: 1607191, C2_dimension: 32, vals: 51430112 -D1_dimension: 32, D2_dimension: 64, vals: 2048 - - -kernel execution time: 890.318 ms -fused time: 891.345 - -kernel execution time: 2038.26 ms -reference time: 2038.96 - -kernel execution time: 265.076 ms -TTM1: 265.783 - -kernel execution time: 544.765 ms -TTM2: 545.423 - -kernel execution time: 70.9058 ms -dense: 71.4509 - -kernel execution time: 541.442 ms -TTM after dense: 542.115 - -ttm-ttm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/tns/vast-2015-mc1-3d.tns -A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) -B1_dimension: 165427, B2_dimension: 11374, B3_dimension: 165427, vals: 26021854 -C1_dimension: 2, C2_dimension: 64, vals: 128 -D1_dimension: 64, D2_dimension: 64, vals: 4096 - - -kernel execution time: 902.466 ms -fused time: 903.626 - -kernel execution 
time: 1051.52 ms -reference time: 1052.27 - -kernel execution time: 385.619 ms -TTM1: 386.243 - -kernel execution time: 937.648 ms -TTM2: 938.212 - -kernel execution time: 0.067901 ms -dense: 1.00372 - -kernel execution time: 380.193 ms -TTM after dense: 380.613 - -ttm-ttm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/tns/vast-2015-mc1-3d.tns -A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) -B1_dimension: 165427, B2_dimension: 11374, B3_dimension: 165427, vals: 26021854 -C1_dimension: 2, C2_dimension: 64, vals: 128 -D1_dimension: 64, D2_dimension: 128, vals: 8192 - - -ttm-ttm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/tns/vast-2015-mc1-3d.tns -A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) -B1_dimension: 165427, B2_dimension: 11374, B3_dimension: 165427, vals: 26021854 -C1_dimension: 2, C2_dimension: 64, vals: 128 -D1_dimension: 64, D2_dimension: 64, vals: 4096 - - -kernel execution time: 898.295 ms -fused time: 899.297 - -kernel execution time: 1037.66 ms -reference time: 1038.39 - -kernel execution time: 385.768 ms -TTM1: 386.452 - -kernel execution time: 939.137 ms -TTM2: 939.74 - -kernel execution time: 0.073171 ms -dense: 1.20129 - -kernel execution time: 383.479 ms -TTM after dense: 384.01 - -ttm-ttm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/tns/flickr-3d.tns -A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) -B1_dimension: 319686, B2_dimension: 28153045, B3_dimension: 319686, vals: 112890310 -C1_dimension: 1607191, C2_dimension: 64, vals: 102860224 -D1_dimension: 64, D2_dimension: 64, vals: 4096 - - -kernel execution time: 1034.06 ms -fused time: 1035.05 - -kernel execution time: 4275.39 ms -reference time: 4276.62 - -kernel execution time: 516.765 ms -TTM1: 517.518 - -kernel execution time: 1048.69 ms -TTM2: 1049.32 - -kernel execution time: 119.233 ms -dense: 119.711 - -kernel execution time: 546.744 ms -TTM after dense: 547.412 - 
-ttm-ttm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/tns/flickr-3d.tns -A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) -B1_dimension: 319686, B2_dimension: 28153045, B3_dimension: 319686, vals: 112890310 -C1_dimension: 1607191, C2_dimension: 32, vals: 51430112 -D1_dimension: 32, D2_dimension: 64, vals: 2048 - - -kernel execution time: 894.088 ms -fused time: 895.234 - -kernel execution time: 2025.29 ms -reference time: 2025.92 - -kernel execution time: 264.446 ms -TTM1: 265.069 - -kernel execution time: 541.153 ms -TTM2: 541.71 - -kernel execution time: 70.7936 ms -dense: 71.2153 - -kernel execution time: 542.474 ms -TTM after dense: 543.104 - -ttm-ttm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/tns/flickr-3d.tns -A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) -B1_dimension: 319686, B2_dimension: 28153045, B3_dimension: 319686, vals: 112890310 -C1_dimension: 1607191, C2_dimension: 16, vals: 25715056 -D1_dimension: 16, D2_dimension: 64, vals: 1024 - - -kernel execution time: 871.496 ms -fused time: 872.523 - -kernel execution time: 1340.14 ms -reference time: 1340.84 - -kernel execution time: 143.439 ms -TTM1: 143.995 - -kernel execution time: 459.09 ms -TTM2: 459.668 - -kernel execution time: 51.7433 ms -dense: 52.1957 - -kernel execution time: 545.092 ms -TTM after dense: 545.899 - -ttm-ttm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/tns/flickr-3d.tns -A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) -B1_dimension: 319686, B2_dimension: 28153045, B3_dimension: 319686, vals: 112890310 -C1_dimension: 1607191, C2_dimension: 32, vals: 51430112 -D1_dimension: 32, D2_dimension: 64, vals: 2048 - - -kernel execution time: 893.815 ms -fused time: 894.866 - -kernel execution time: 2016.15 ms -reference time: 2016.8 - -kernel execution time: 266.599 ms -TTM1: 267.18 - -kernel execution time: 544.015 ms -TTM2: 544.597 - -kernel execution time: 70.7604 
ms -dense: 71.1854 - -kernel execution time: 543.212 ms -TTM after dense: 543.879 - -ttm-ttm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/tns/nell-2.tns -A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) -B1_dimension: 12092, B2_dimension: 9184, B3_dimension: 12092, vals: 76879419 -C1_dimension: 28818, C2_dimension: 32, vals: 922176 -D1_dimension: 32, D2_dimension: 64, vals: 2048 - - -kernel execution time: 47.6087 ms -fused time: 48.0666 - -kernel execution time: 2381.79 ms -reference time: 2382.51 - -kernel execution time: 85.3431 ms -TTM1: 86.158 - -kernel execution time: 8.56212 ms -TTM2: 9.19594 - -kernel execution time: 1.27998 ms -dense: 1.66095 - -kernel execution time: 185.324 ms -TTM after dense: 185.729 - -ttm-ttm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/tns/nell-1.tns -A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) -B1_dimension: 2902330, B2_dimension: 2143368, B3_dimension: 2902330, vals: 143599552 -C1_dimension: 25495389, C2_dimension: 32, vals: 815852448 -D1_dimension: 32, D2_dimension: 64, vals: 2048 - - -kernel execution time: 1312.78 ms -fused time: 1313.78 - -kernel execution time: 3548.92 ms -reference time: 3550.02 - -kernel execution time: 794.193 ms -TTM1: 794.835 - -kernel execution time: 371.233 ms -TTM2: 371.853 - -kernel execution time: 1136.25 ms -dense: 1136.73 - -kernel execution time: 1608.81 ms -TTM after dense: 1609.49 - -ttm-ttm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/tns/vast-2015-mc1-3d.tns -A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) -B1_dimension: 165427, B2_dimension: 11374, B3_dimension: 165427, vals: 26021854 -C1_dimension: 2, C2_dimension: 32, vals: 64 -D1_dimension: 32, D2_dimension: 64, vals: 2048 - - -kernel execution time: 749.836 ms -fused time: 750.93 - -kernel execution time: 566.457 ms -reference time: 567.141 - -kernel execution time: 197.095 ms -TTM1: 197.696 - -kernel execution 
time: 503.839 ms -TTM2: 504.407 - -kernel execution time: 0.05955 ms -dense: 0.911152 - -kernel execution time: 382.185 ms -TTM after dense: 382.591 - -ttm-ttm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/tns/darpa1998.tns -A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) -B1_dimension: 22476, B2_dimension: 22476, B3_dimension: 22476, vals: 28421307 -C1_dimension: 23776223, C2_dimension: 32, vals: 760839136 -D1_dimension: 32, D2_dimension: 64, vals: 2048 - - -kernel execution time: 226.079 ms -fused time: 227.028 - -kernel execution time: 8763.95 ms -reference time: 8765.15 - -kernel execution time: 605.807 ms -TTM1: 606.7 - -kernel execution time: 5.27951 ms -TTM2: 5.94312 - -kernel execution time: 1075.36 ms -dense: 1075.63 - -kernel execution time: 1244.1 ms -TTM after dense: 1244.76 - -ttm-ttm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/tns/freebase_music.tns -A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) -B1_dimension: 23343790, B2_dimension: 23344784, B3_dimension: 23343790, vals: 99546550 -C1_dimension: 166, C2_dimension: 32, vals: 5312 -D1_dimension: 32, D2_dimension: 64, vals: 2048 - - -ttm-ttm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/tns/delicious-3d.tns -A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) -B1_dimension: 958893, B2_dimension: 55584242, B3_dimension: 958893, vals: 140126164 -C1_dimension: 2480308, C2_dimension: 32, vals: 79369856 -D1_dimension: 32, D2_dimension: 64, vals: 2048 - - -ttm-ttm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/tns/flickr-3d.tns -A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) -B1_dimension: 319686, B2_dimension: 28153045, B3_dimension: 319686, vals: 112890310 -C1_dimension: 1607191, C2_dimension: 32, vals: 51430112 -D1_dimension: 32, D2_dimension: 64, vals: 2048 - - -kernel execution time: 14896.3 ms -fused time: 14897.5 - -kernel execution time: 
94041.2 ms -reference time: 94042.2 - -kernel execution time: 3578.66 ms -TTM1: 3579.61 - -kernel execution time: 18883.5 ms -TTM2: 18884.5 - -kernel execution time: 2197.87 ms -dense: 2198.28 - -kernel execution time: 7686.45 ms -TTM after dense: 7687.46 - -ttm-ttm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/tns/nell-2.tns -A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) -B1_dimension: 12092, B2_dimension: 9184, B3_dimension: 12092, vals: 76879419 -C1_dimension: 28818, C2_dimension: 32, vals: 922176 -D1_dimension: 32, D2_dimension: 64, vals: 2048 - - -kernel execution time: 1072.87 ms -fused time: 1073.82 - -kernel execution time: 71021.8 ms -reference time: 71022.9 - -kernel execution time: 1996.05 ms -TTM1: 1996.58 - -kernel execution time: 231.665 ms -TTM2: 232.177 - -kernel execution time: 40.2369 ms -dense: 40.6304 - -kernel execution time: 4971.71 ms -TTM after dense: 4972.6 - -ttm-ttm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/tns/nell-1.tns -A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) -B1_dimension: 2902330, B2_dimension: 2143368, B3_dimension: 2902330, vals: 143599552 -C1_dimension: 25495389, C2_dimension: 32, vals: 815852448 -D1_dimension: 32, D2_dimension: 64, vals: 2048 - - -kernel execution time: 29074.9 ms -fused time: 29076 - -kernel execution time: 148072 ms -reference time: 148073 - -kernel execution time: 13571.2 ms -TTM1: 13572.2 - -kernel execution time: 11698.5 ms -TTM2: 11699.5 - -kernel execution time: 34736.9 ms -dense: 34737.7 - -kernel execution time: 22283.6 ms -TTM after dense: 22284.5 - -ttm-ttm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/tns/vast-2015-mc1-3d.tns -A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) -B1_dimension: 165427, B2_dimension: 11374, B3_dimension: 165427, vals: 26021854 -C1_dimension: 2, C2_dimension: 32, vals: 64 -D1_dimension: 32, D2_dimension: 64, vals: 2048 - - -kernel execution 
time: 12513.9 ms -fused time: 12515 - -kernel execution time: 23535.3 ms -reference time: 23536.3 - -kernel execution time: 1334.33 ms -TTM1: 1334.87 - -kernel execution time: 17560.3 ms -TTM2: 17561.3 - -kernel execution time: 0.019291 ms -dense: 0.885501 - -kernel execution time: 3394.59 ms -TTM after dense: 3395.34 - -ttm-ttm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/tns/darpa1998.tns -A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) -B1_dimension: 22476, B2_dimension: 22476, B3_dimension: 22476, vals: 28421307 -C1_dimension: 23776223, C2_dimension: 32, vals: 760839136 -D1_dimension: 32, D2_dimension: 64, vals: 2048 - - -kernel execution time: 1517.3 ms -fused time: 1518.25 - -kernel execution time: 45929.9 ms -reference time: 45930.9 - -kernel execution time: 2929.29 ms -TTM1: 2929.82 - -kernel execution time: 53.4282 ms -TTM2: 53.9625 - -kernel execution time: 32592.7 ms -dense: 32593.5 - -kernel execution time: 6277.64 ms -TTM after dense: 6278.68 - -ttm-ttm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/tns/matmul_5-5-5.tns -A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) -B1_dimension: 25, B2_dimension: 25, B3_dimension: 25, vals: 125 -C1_dimension: 25, C2_dimension: 32, vals: 800 -D1_dimension: 32, D2_dimension: 64, vals: 2048 - - -kernel execution time: 0.852321 ms -fused time: 1.60101 - -kernel execution time: 0.662379 ms -reference time: 1.32203 - -kernel execution time: 0.511427 ms -TTM1: 1.03372 - -kernel execution time: 0.667709 ms -TTM2: 1.20996 - -kernel execution time: 0.118331 ms -dense: 0.542977 - -kernel execution time: 0.483187 ms -TTM after dense: 0.900252 - -ttm-ttm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/tns/matmul_5-5-5.tns -A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) -B1_dimension: 25, B2_dimension: 25, B3_dimension: 25, vals: 125 -C1_dimension: 25, C2_dimension: 32, vals: 800 -D1_dimension: 32, D2_dimension: 
64, vals: 2048 - - -kernel execution time: 0.671739 ms -fused time: 4.90845 - -kernel execution time: 0.711039 ms -reference time: 5.04208 - -kernel execution time: 0.486907 ms -reference new time: 4.37081 - -kernel execution time: 0.482627 ms -TTM1: 3.67761 - -kernel execution time: 0.589078 ms -TTM2: 4.27397 - -kernel execution time: 0.095461 ms -dense: 0.492616 - -kernel execution time: 0.530937 ms -TTM after dense: 1.0284 - -ttm-ttm execution - ------------------------------------------ -/home/min/a/kadhitha/ispc-examples/data/tns/flickr-3d.tns -A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) -B1_dimension: 319686, B2_dimension: 28153045, B3_dimension: 319686, vals: 112890310 -C1_dimension: 1607191, C2_dimension: 32, vals: 51430112 -D1_dimension: 32, D2_dimension: 64, vals: 2048 - - -kernel execution time: 881.367 ms -fused time: 886.111 - -reference impl time - -kernel execution time: 2050.43 ms -reference time: 2051.08 - -kernel execution time: 2002.9 ms -reference new time: 2003.54 - -kernel execution time: 260.701 ms -TTM1: 261.277 - -kernel execution time: 539.892 ms -TTM2: 540.489 - -kernel execution time: 69.5675 ms -dense: 70.0315 - -kernel execution time: 531.744 ms -TTM after dense: 532.375 -/home/min/a/kadhitha/ispc-examples/data/tns/nell-2.tns -A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) -B1_dimension: 12092, B2_dimension: 9184, B3_dimension: 12092, vals: 76879419 -C1_dimension: 28818, C2_dimension: 32, vals: 922176 -D1_dimension: 32, D2_dimension: 64, vals: 2048 - - -kernel execution time: 46.1273 ms -fused time: 50.9231 - -reference impl time - -kernel execution time: 2363.18 ms -reference time: 2364.02 - -kernel execution time: 2340.56 ms -reference new time: 2341.2 - -kernel execution time: 82.5312 ms -TTM1: 83.1034 - -kernel execution time: 8.62143 ms -TTM2: 9.16734 - -kernel execution time: 1.20538 ms -dense: 1.48454 - -kernel execution time: 181.488 ms -TTM after dense: 181.827 - -ttm-ttm execution - ------------------------------------------ - -file: 
/home/min/a/kadhitha/ispc-examples/data/tns/flickr-3d.tns ----------------------------------------------------------------- -/home/min/a/kadhitha/ispc-examples/data/tns/flickr-3d.tns -A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) -B1_dimension: 319686, B2_dimension: 28153045, B3_dimension: 319686, vals: 112890310 -C1_dimension: 1607191, C2_dimension: 32, vals: 51430112 -D1_dimension: 32, D2_dimension: 64, vals: 2048 - - -kernel execution time: 874.724 ms -fused time: 878.246 - -reference impl time - -kernel execution time: 2042.51 ms -reference time: 2043.27 - -kernel execution time: 46819.7 ms -reference new time: 46820.8 - -schedule 1 - -kernel execution time: 260.841 ms -TTM1: 261.378 - -kernel execution time: 539.264 ms -TTM2: 539.834 - -schedule 2 - -kernel execution time: 69.2965 ms -dense: 69.7197 - -kernel execution time: 532.774 ms -TTM after dense: 535.64 - -file: /home/min/a/kadhitha/ispc-examples/data/tns/nell-2.tns ----------------------------------------------------------------- -/home/min/a/kadhitha/ispc-examples/data/tns/nell-2.tns -A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) -B1_dimension: 12092, B2_dimension: 9184, B3_dimension: 12092, vals: 76879419 -C1_dimension: 28818, C2_dimension: 32, vals: 922176 -D1_dimension: 32, D2_dimension: 64, vals: 2048 - - -kernel execution time: 51.3316 ms -fused time: 55.9685 - -reference impl time - -kernel execution time: 2363.6 ms -reference time: 2364.38 - -kernel execution time: 31523.9 ms -reference new time: 31525 - -schedule 1 - -kernel execution time: 84.4692 ms -TTM1: 84.9774 - -kernel execution time: 7.9451 ms -TTM2: 8.49167 - -schedule 2 - -kernel execution time: 1.17918 ms -dense: 1.49638 - -ttm-ttm execution - ------------------------------------------ - -file: /home/min/a/kadhitha/ispc-examples/data/tns/flickr-3d.tns ----------------------------------------------------------------- -/home/min/a/kadhitha/ispc-examples/data/tns/flickr-3d.tns -A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) -B1_dimension: 319686, B2_dimension: 
28153045, B3_dimension: 319686, vals: 112890310 -C1_dimension: 1607191, C2_dimension: 32, vals: 51430112 -D1_dimension: 32, D2_dimension: 64, vals: 2048 - - -kernel execution time: 877.727 ms -fused time: 881.892 - -reference impl time - -kernel execution time: 1998.47 ms -reference time: 1999.14 - -kernel execution time: 1818.14 ms -reference new time: 1818.77 - -schedule 1 - -kernel execution time: 261.202 ms -TTM1: 261.759 - -kernel execution time: 539.615 ms -TTM2: 540.183 - -schedule 2 - -kernel execution time: 69.7746 ms -dense: 70.1943 - -kernel execution time: 532.374 ms -TTM after dense: 533.008 - -file: /home/min/a/kadhitha/ispc-examples/data/tns/nell-2.tns ----------------------------------------------------------------- -/home/min/a/kadhitha/ispc-examples/data/tns/nell-2.tns -A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) -B1_dimension: 12092, B2_dimension: 9184, B3_dimension: 12092, vals: 76879419 -C1_dimension: 28818, C2_dimension: 32, vals: 922176 -D1_dimension: 32, D2_dimension: 64, vals: 2048 - - -kernel execution time: 42.811 ms -fused time: 47.6618 - -reference impl time - -kernel execution time: 2267.84 ms -reference time: 2268.63 - -kernel execution time: 1379.49 ms -reference new time: 1380.15 - -schedule 1 - -kernel execution time: 81.6849 ms -TTM1: 82.4365 - -kernel execution time: 9.74645 ms -TTM2: 10.2848 - -schedule 2 - -kernel execution time: 1.47367 ms -dense: 1.78443 - -kernel execution time: 208.263 ms -TTM after dense: 210.169 - -file: /home/min/a/kadhitha/ispc-examples/data/tns/nell-1.tns ----------------------------------------------------------------- -/home/min/a/kadhitha/ispc-examples/data/tns/nell-1.tns -A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) -B1_dimension: 2902330, B2_dimension: 2143368, B3_dimension: 2902330, vals: 143599552 -C1_dimension: 25495389, C2_dimension: 32, vals: 815852448 -D1_dimension: 32, D2_dimension: 64, vals: 2048 - - -kernel execution time: 1299.91 ms -fused time: 1303.65 - -reference impl time - -kernel execution 
time: 3494.78 ms -reference time: 3497.66 - -kernel execution time: 2383.79 ms -reference new time: 2384.52 - -schedule 1 - -kernel execution time: 774.869 ms -TTM1: 775.571 - -kernel execution time: 1488.64 ms -TTM2: 1489.78 - -schedule 2 - -kernel execution time: 1121.66 ms -dense: 1122.11 - -kernel execution time: 1581.94 ms -TTM after dense: 1582.61 - -file: /home/min/a/kadhitha/ispc-examples/data/tns/vast-2015-mc1-3d.tns ----------------------------------------------------------------- -/home/min/a/kadhitha/ispc-examples/data/tns/vast-2015-mc1-3d.tns -A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) -B1_dimension: 165427, B2_dimension: 11374, B3_dimension: 165427, vals: 26021854 -C1_dimension: 2, C2_dimension: 32, vals: 64 -D1_dimension: 32, D2_dimension: 64, vals: 2048 - - -ttm-ttm execution - ------------------------------------------ - -file: /home/min/a/kadhitha/ispc-examples/data/tns/vast-2015-mc1-3d.tns ----------------------------------------------------------------- -/home/min/a/kadhitha/ispc-examples/data/tns/vast-2015-mc1-3d.tns -A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) -B1_dimension: 165427, B2_dimension: 11374, B3_dimension: 165427, vals: 26021854 -C1_dimension: 2, C2_dimension: 32, vals: 64 -D1_dimension: 32, D2_dimension: 64, vals: 2048 - - -kernel execution time: 746.344 ms -fused time: 749.212 - -reference impl time - -kernel execution time: 548.763 ms -reference time: 549.493 - -kernel execution time: 737.768 ms -reference new time: 738.436 - -schedule 1 - -kernel execution time: 195.639 ms -TTM1: 196.286 - -kernel execution time: 493.569 ms -TTM2: 494.15 - -schedule 2 - -kernel execution time: 0.052551 ms -dense: 0.648739 - -kernel execution time: 374.407 ms -TTM after dense: 376.248 - -file: /home/min/a/kadhitha/ispc-examples/data/tns/darpa1998.tns ----------------------------------------------------------------- -/home/min/a/kadhitha/ispc-examples/data/tns/darpa1998.tns -A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) -B1_dimension: 22476, B2_dimension: 22476, 
B3_dimension: 22476, vals: 28421307 -C1_dimension: 23776223, C2_dimension: 32, vals: 760839136 -D1_dimension: 32, D2_dimension: 64, vals: 2048 - - -kernel execution time: 221.905 ms -fused time: 222.964 - -reference impl time - -kernel execution time: 8826.57 ms -reference time: 8827.82 - -kernel execution time: 1435.28 ms -reference new time: 1437.65 - -schedule 1 - -kernel execution time: 574.934 ms -TTM1: 576.159 - -kernel execution time: 4.42254 ms -TTM2: 5.12181 - -schedule 2 - -kernel execution time: 1041.05 ms -dense: 1041.36 - -kernel execution time: 1247.06 ms -TTM after dense: 1247.76 - -ttm-ttm execution - ------------------------------------------ - -file: /home/min/a/kadhitha/ispc-examples/data/tns/nell-1.tns ----------------------------------------------------------------- -/home/min/a/kadhitha/ispc-examples/data/tns/nell-1.tns -A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) -B1_dimension: 2902330, B2_dimension: 2143368, B3_dimension: 2902330, vals: 143599552 -C1_dimension: 25495389, C2_dimension: 32, vals: 815852448 -D1_dimension: 32, D2_dimension: 64, vals: 2048 - - -kernel execution time: 1312.2 ms -fused time: 1315.79 - -reference impl time - -kernel execution time: 3512.84 ms -reference time: 3514.54 - -kernel execution time: 2381.97 ms -reference new time: 2382.6 - -schedule 1 - -kernel execution time: 779.205 ms -TTM1: 779.794 - -kernel execution time: 366.382 ms -TTM2: 367.081 - -schedule 2 - -kernel execution time: 1127.72 ms -dense: 1128.25 - -kernel execution time: 1579.85 ms -TTM after dense: 1580.5 - -ttm-ttm execution - ------------------------------------------ - -file: /home/min/a/kadhitha/ispc-examples/data/tns/nell-1.tns ----------------------------------------------------------------- -/home/min/a/kadhitha/ispc-examples/data/tns/nell-1.tns -A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) -B1_dimension: 2902330, B2_dimension: 2143368, B3_dimension: 2902330, vals: 143599552 -C1_dimension: 25495389, C2_dimension: 32, vals: 815852448 -D1_dimension: 32, 
D2_dimension: 64, vals: 2048 - - -kernel execution time: 1326.91 ms -fused time: 1331.56 - -reference impl time - -kernel execution time: 3535.03 ms -reference time: 3536.38 - -kernel execution time: 2387.24 ms -reference new time: 2387.99 - -schedule 1 - -kernel execution time: 780.495 ms -TTM1: 781.09 - -kernel execution time: 369.704 ms -TTM2: 370.292 - -schedule 2 - -kernel execution time: 1119.23 ms -dense: 1119.7 - -kernel execution time: 1579.78 ms -TTM after dense: 1580.54 - -ttm-ttm execution - ------------------------------------------ - -file: /home/min/a/kadhitha/ispc-examples/data/tns/vast-2015-mc1-3d.tns ----------------------------------------------------------------- -/home/min/a/kadhitha/ispc-examples/data/tns/vast-2015-mc1-3d.tns -A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) -B1_dimension: 165427, B2_dimension: 11374, B3_dimension: 165427, vals: 26021854 -C1_dimension: 2, C2_dimension: 64, vals: 128 -D1_dimension: 64, D2_dimension: 128, vals: 8192 - - -ttm-ttm execution - ------------------------------------------ - -file: /home/min/a/kadhitha/ispc-examples/data/tns/vast-2015-mc1-3d.tns ----------------------------------------------------------------- -/home/min/a/kadhitha/ispc-examples/data/tns/vast-2015-mc1-3d.tns -A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) -B1_dimension: 165427, B2_dimension: 11374, B3_dimension: 165427, vals: 26021854 -C1_dimension: 2, C2_dimension: 32, vals: 64 -D1_dimension: 32, D2_dimension: 64, vals: 2048 - - -kernel execution time: 746.399 ms -fused time: 747.454 - -reference impl time - -kernel execution time: 549.908 ms -reference time: 550.683 - -kernel execution time: 731.657 ms -reference new time: 732.322 - -schedule 1 - -kernel execution time: 194.605 ms -TTM1: 195.252 - -kernel execution time: 491.591 ms -TTM2: 492.148 - -schedule 2 - -kernel execution time: 0.049841 ms -dense: 0.820181 - -kernel execution time: 372.064 ms -TTM after dense: 372.449 - -ttm-ttm execution - ------------------------------------------ - -file: 
/home/min/a/kadhitha/ispc-examples/data/tns/vast-2015-mc1-3d.tns ----------------------------------------------------------------- -/home/min/a/kadhitha/ispc-examples/data/tns/vast-2015-mc1-3d.tns -A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) -B1_dimension: 165427, B2_dimension: 11374, B3_dimension: 165427, vals: 26021854 -C1_dimension: 2, C2_dimension: 32, vals: 64 -D1_dimension: 32, D2_dimension: 64, vals: 2048 - - -kernel execution time: 746.043 ms -fused time: 747.23 - -reference impl time - -kernel execution time: 561.015 ms -reference time: 561.669 - -kernel execution time: 737.535 ms -reference new time: 738.158 - -schedule 1 - -kernel execution time: 194.638 ms -TTM1: 195.169 - -kernel execution time: 495.355 ms -TTM2: 495.903 - -schedule 2 - -kernel execution time: 0.148292 ms -dense: 0.534998 - -kernel execution time: 374.231 ms -TTM after dense: 374.667 - -ttm-ttm execution - ------------------------------------------ - -file: /home/min/a/kadhitha/ispc-examples/data/tns/vast-2015-mc1-3d.tns ----------------------------------------------------------------- -/home/min/a/kadhitha/ispc-examples/data/tns/vast-2015-mc1-3d.tns -A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) -B1_dimension: 165427, B2_dimension: 11374, B3_dimension: 165427, vals: 26021854 -C1_dimension: 2, C2_dimension: 32, vals: 64 -D1_dimension: 32, D2_dimension: 64, vals: 2048 - - -kernel execution time: 745.881 ms -fused time: 746.992 - -reference impl time - -kernel execution time: 551.705 ms -reference time: 552.359 - -kernel execution time: 736.019 ms -reference new time: 736.611 - -schedule 1 - -kernel execution time: 194.777 ms -TTM1: 195.33 - -kernel execution time: 491.151 ms -TTM2: 491.732 - -schedule 2 - -kernel execution time: 0.144522 ms -dense: 0.528597 - -kernel execution time: 374.363 ms -TTM after dense: 374.752 - -ttm-ttm execution - ------------------------------------------ - -file: /home/min/a/kadhitha/ispc-examples/data/tns/vast-2015-mc1-3d.tns 
----------------------------------------------------------------- - -ttm-ttm execution - ------------------------------------------ Europa - -file: /home/min/a/kadhitha/workspace/my_taco/tns/darpa1998.tns ----------------------------------------------------------------- -/home/min/a/kadhitha/workspace/my_taco/tns/darpa1998.tns -A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) -B1_dimension: 22476, B2_dimension: 22476, B3_dimension: 22476, vals: 28421307 -C1_dimension: 23776223, C2_dimension: 32, vals: 760839136 -D1_dimension: 32, D2_dimension: 64, vals: 2048 - - -kernel execution time: 2299.49 ms -fused time: 2301.59 - -reference impl time - -kernel execution time: 78844.2 ms -reference time: 78846.6 - -kernel execution time: 34427 ms -reference new time: 34429.3 - -schedule 1 - -kernel execution time: 6968.36 ms -TTM1: 6970.4 - -kernel execution time: 121.497 ms -TTM2: 123.127 - -schedule 2 - -kernel execution time: 64026.1 ms -dense: 64028 - -kernel execution time: 15531.3 ms -TTM after dense: 15533.4 - -ttm-ttm execution - ------------------------------------------ - -file: /home/min/a/kadhitha/ispc-examples/data/tns/vast-2015-mc1-3d.tns ----------------------------------------------------------------- - -ttm-ttm execution - ------------------------------------------ - -file: /home/min/a/kadhitha/workspace/my_taco/tns/vast-2015-mc1-3d.tns ----------------------------------------------------------------- -/home/min/a/kadhitha/workspace/my_taco/tns/vast-2015-mc1-3d.tns -A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) -B1_dimension: 165427, B2_dimension: 11374, B3_dimension: 165427, vals: 26021854 -C1_dimension: 2, C2_dimension: 32, vals: 64 -D1_dimension: 32, D2_dimension: 64, vals: 2048 - - -kernel execution time: 40017.6 ms -fused time: 40019.4 - -reference impl time - -kernel execution time: 50710.4 ms -reference time: 50712.8 - -kernel execution time: 37978.8 ms -reference new time: 37980.6 - -schedule 1 - -kernel execution time: 3848.85 ms -TTM1: 3850.48 - -ttm-ttm execution - 
------------------------------------------ - -file: /home/min/a/kadhitha/workspace/my_taco/tns/vast-2015-mc1-3d.tns ----------------------------------------------------------------- -/home/min/a/kadhitha/workspace/my_taco/tns/vast-2015-mc1-3d.tns -A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) -B1_dimension: 165427, B2_dimension: 11374, B3_dimension: 165427, vals: 26021854 -C1_dimension: 2, C2_dimension: 32, vals: 64 -D1_dimension: 32, D2_dimension: 64, vals: 2048 - - -kernel execution time: 40277.5 ms -fused time: 40279.9 - -reference impl time - -kernel execution time: 50449.4 ms -reference time: 50452 - -kernel execution time: 37881.2 ms -reference new time: 37883.4 - -schedule 1 - -kernel execution time: 3987.96 ms -TTM1: 3990.09 - -kernel execution time: 40935.3 ms -TTM2: 40937.4 - -schedule 2 - -kernel execution time: 0.098195 ms -dense: 1.2874 - -kernel execution time: 12037.9 ms -TTM after dense: 12039.5 - -ttm-ttm execution - ------------------------------------------ - -file: /home/min/a/kadhitha/workspace/my_taco/tns/vast-2015-mc1-3d.tns ----------------------------------------------------------------- -/home/min/a/kadhitha/workspace/my_taco/tns/vast-2015-mc1-3d.tns -A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) -B1_dimension: 165427, B2_dimension: 11374, B3_dimension: 2, vals: 26021854 -C1_dimension: 2, C2_dimension: 32, vals: 64 -D1_dimension: 32, D2_dimension: 64, vals: 2048 - - -kernel execution time: 36918.5 ms -fused time: 36920.9 - -reference impl time - -kernel execution time: 47892.3 ms -reference time: 47894.8 - -kernel execution time: 37901.4 ms -reference new time: 37903.5 - -schedule 1 - -kernel execution time: 3801.16 ms -TTM1: 3803.21 - -kernel execution time: 43488.6 ms -TTM2: 43490.6 - -schedule 2 - -kernel execution time: 0.060642 ms -dense: 1.08588 - -kernel execution time: 15190.9 ms -TTM after dense: 15192.3 - -ttm-ttm execution - ------------------------------------------ - -file: /home/min/a/kadhitha/workspace/my_taco/tns/vast-2015-mc1-3d.tns - 
-ttm-ttm execution - ------------------------------------------ - -file: /home/min/a/kadhitha/workspace/my_taco/tns/vast-2015-mc1-3d.tns ----------------------------------------------------------------- -/home/min/a/kadhitha/workspace/my_taco/tns/vast-2015-mc1-3d.tns -A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) -B1_dimension: 165427, B2_dimension: 11374, B3_dimension: 2, vals: 26021854 -C1_dimension: 2, C2_dimension: 32, vals: 64 -D1_dimension: 32, D2_dimension: 64, vals: 2048 - - -kernel execution time: 35130 ms -fused time: 35133.9 - -reference impl time - -kernel execution time: 47634.1 ms -reference time: 47636.7 - -kernel execution time: 37616.7 ms -reference new time: 37618.9 - -schedule 1 - -kernel execution time: 2930.06 ms -TTM1: 2931.74 - -kernel execution time: 40710.7 ms -TTM2: 40713 - -schedule 2 - -kernel execution time: 0.07506 ms -dense: 1.28501 - -kernel execution time: 12393.3 ms -TTM after dense: 12394.9 - -ttm-ttm execution - ------------------------------------------ - -file: /home/min/a/kadhitha/workspace/my_taco/tns/vast-2015-mc1-3d.tns ----------------------------------------------------------------- -/home/min/a/kadhitha/workspace/my_taco/tns/vast-2015-mc1-3d.tns -A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) -B1_dimension: 165427, B2_dimension: 11374, B3_dimension: 2, vals: 26021854 -C1_dimension: 2, C2_dimension: 32, vals: 64 -D1_dimension: 32, D2_dimension: 64, vals: 2048 - - -kernel execution time: 12528.5 ms -fused time: 12529.7 - -reference impl time - -kernel execution time: 23576.9 ms -reference time: 23578.1 - -kernel execution time: 16282.8 ms -reference new time: 16283.8 - -schedule 1 - -kernel execution time: 1332.64 ms -TTM1: 1333.18 - -kernel execution time: 17503.1 ms -TTM2: 17504.2 - -schedule 2 - -kernel execution time: 0.025131 ms -dense: 0.438566 - -kernel execution time: 3369.58 ms -TTM after dense: 3370.48 - -ttm-ttm execution - ------------------------------------------ - -file: 
/home/min/a/kadhitha/workspace/my_taco/tns/vast-2015-mc1-3d.tns ----------------------------------------------------------------- -/home/min/a/kadhitha/workspace/my_taco/tns/vast-2015-mc1-3d.tns -A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) -B1_dimension: 165427, B2_dimension: 11374, B3_dimension: 2, vals: 26021854 -C1_dimension: 2, C2_dimension: 32, vals: 64 -D1_dimension: 32, D2_dimension: 64, vals: 2048 - - -kernel execution time: 12698.5 ms -fused time: 12699.7 - -reference impl time - -kernel execution time: 23669.6 ms -reference time: 23670.8 - -kernel execution time: 16390.1 ms -reference new time: 16391.1 - -schedule 1 - -kernel execution time: 1343.9 ms -TTM1: 1344.42 - -kernel execution time: 17641.6 ms -TTM2: 17642.6 - -schedule 2 - -kernel execution time: 0.02212 ms -dense: 0.397656 - -kernel execution time: 3411.14 ms -TTM after dense: 3412.04 - -ttm-ttm execution - ------------------------------------------ - -file: /home/min/a/kadhitha/ispc-examples/data/tns/delicious-3d.tns - -ttm-ttm execution - ------------------------------------------ - -file: /home/min/a/kadhitha/workspace/my_taco/tns/vast-2015-mc1-3d.tns - -ttm-ttm execution - ------------------------------------------ - -file: /home/min/a/kadhitha/workspace/my_taco/tns/vast-2015-mc1-3d.tns ----------------------------------------------------------------- -/home/min/a/kadhitha/workspace/my_taco/tns/vast-2015-mc1-3d.tns -A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) -B1_dimension: 165427, B2_dimension: 11374, B3_dimension: 2, vals: 26021854 -C1_dimension: 2, C2_dimension: 32, vals: 64 -D1_dimension: 32, D2_dimension: 64, vals: 2048 - - -kernel execution time: 844.466 ms -fused time: 845.618 - -reference impl time - -kernel execution time: 814.964 ms -reference time: 815.676 - -kernel execution time: 918.472 ms -reference new time: 919.142 - -schedule 1 - -kernel execution time: 200.521 ms -TTM1: 201.112 - -kernel execution time: 678.038 ms -TTM2: 678.647 - -schedule 2 - -kernel execution time: 0.07066 ms 
-dense: 0.524547 - -kernel execution time: 394.81 ms -TTM after dense: 395.266 - -ttm-ttm execution - ------------------------------------------ - -file: /home/min/a/kadhitha/workspace/my_taco/tns/vast-2015-mc1-3d.tns ----------------------------------------------------------------- -/home/min/a/kadhitha/workspace/my_taco/tns/vast-2015-mc1-3d.tns -A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) -B1_dimension: 165427, B2_dimension: 11374, B3_dimension: 2, vals: 26021854 -C1_dimension: 2, C2_dimension: 32, vals: 64 -D1_dimension: 32, D2_dimension: 64, vals: 2048 - - -kernel execution time: 2900.7 ms -fused time: 2903.25 - -reference impl time - -kernel execution time: 2746.32 ms -reference time: 2748.86 - -kernel execution time: 2812.87 ms -reference new time: 2815.19 - -schedule 1 - -kernel execution time: 2429.09 ms -TTM1: 2431.17 - -kernel execution time: 2451.88 ms -TTM2: 2454.06 - -schedule 2 - -kernel execution time: 1.43373 ms -dense: 2.85191 - -kernel execution time: 1651.7 ms -TTM after dense: 1652.91 - -ttm-ttm execution - ------------------------------------------ - -file: /home/min/a/kadhitha/workspace/my_taco/tns/vast-2015-mc1-3d.tns ----------------------------------------------------------------- -/home/min/a/kadhitha/workspace/my_taco/tns/vast-2015-mc1-3d.tns -A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) -B1_dimension: 165427, B2_dimension: 11374, B3_dimension: 2, vals: 26021854 -C1_dimension: 2, C2_dimension: 32, vals: 64 -D1_dimension: 32, D2_dimension: 64, vals: 2048 - - -kernel execution time: 3539.09 ms -fused time: 3541.54 - -reference impl time - -kernel execution time: 2968.95 ms -reference time: 2972.61 - -kernel execution time: 3354.98 ms -reference new time: 3357.43 - -schedule 1 - -kernel execution time: 2697.68 ms -TTM1: 2699.71 - -kernel execution time: 2804.11 ms -TTM2: 2806.99 - -schedule 2 - -kernel execution time: 6.38211 ms -dense: 8.06652 - -kernel execution time: 1822.02 ms -TTM after dense: 1823.06 - -ttm-ttm execution - 
------------------------------------------ - -file: /home/min/a/kadhitha/workspace/my_taco/tns/vast-2015-mc1-3d.tns - -ttm-ttm execution - ------------------------------------------ - -file: /home/min/a/kadhitha/workspace/my_taco/tns/vast-2015-mc1-3d.tns ----------------------------------------------------------------- -/home/min/a/kadhitha/workspace/my_taco/tns/vast-2015-mc1-3d.tns -A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) -B1_dimension: 165427, B2_dimension: 11374, B3_dimension: 2, vals: 26021854 -C1_dimension: 2, C2_dimension: 32, vals: 64 -D1_dimension: 32, D2_dimension: 64, vals: 2048 - - -kernel execution time: 3608.92 ms -fused time: 3611.17 - -reference impl time - -kernel execution time: 3026.81 ms -reference time: 3029.09 - -kernel execution time: 3189.34 ms -reference new time: 3192.69 - -schedule 1 - -kernel execution time: 2659.86 ms -TTM1: 2661.48 - -kernel execution time: 2749.47 ms -TTM2: 2750.96 - -schedule 2 - -kernel execution time: 5.54375 ms -dense: 6.71077 - -kernel execution time: 1799.52 ms -TTM after dense: 1800.4 - -ttm-ttm execution - ------------------------------------------ - -file: /home/min/a/kadhitha/workspace/my_taco/tns/vast-2015-mc1-3d.tns ----------------------------------------------------------------- -/home/min/a/kadhitha/workspace/my_taco/tns/vast-2015-mc1-3d.tns -A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) -B1_dimension: 165427, B2_dimension: 11374, B3_dimension: 2, vals: 26021854 -C1_dimension: 2, C2_dimension: 32, vals: 64 -D1_dimension: 32, D2_dimension: 64, vals: 2048 - - -kernel execution time: 3553.08 ms -fused time: 3555.93 - -reference impl time - -kernel execution time: 2962.14 ms -reference time: 2964.25 - -kernel execution time: 3306.95 ms -reference new time: 3309.38 - -schedule 1 - -kernel execution time: 2723.22 ms -TTM1: 2724.83 - -kernel execution time: 2581.33 ms -TTM2: 2583.4 - -schedule 2 - -kernel execution time: 0.772961 ms -dense: 2.02166 - -kernel execution time: 1731.42 ms -TTM after dense: 1732.48 - -ttm-ttm 
execution - ------------------------------------------ - -file: /home/min/a/kadhitha/workspace/my_taco/tns/vast-2015-mc1-3d.tns ----------------------------------------------------------------- -/home/min/a/kadhitha/workspace/my_taco/tns/vast-2015-mc1-3d.tns -A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) -B1_dimension: 165427, B2_dimension: 11374, B3_dimension: 2, vals: 26021854 -C1_dimension: 2, C2_dimension: 32, vals: 64 -D1_dimension: 32, D2_dimension: 64, vals: 2048 - - -kernel execution time: 3577.13 ms -fused time: 3580.97 - -reference impl time - -kernel execution time: 3010.77 ms -reference time: 3013.04 - -kernel execution time: 3364.45 ms -reference new time: 3366.58 - -schedule 1 - -kernel execution time: 2740.85 ms -TTM1: 2742.84 - -kernel execution time: 2788.11 ms -TTM2: 2790.79 - -schedule 2 - -kernel execution time: 2.57712 ms -dense: 4.23057 - -kernel execution time: 1934.52 ms -TTM after dense: 1935.9 - -ttm-ttm execution - ------------------------------------------ - -file: /home/min/a/kadhitha/workspace/my_taco/tns/vast-2015-mc1-3d.tns ----------------------------------------------------------------- -/home/min/a/kadhitha/workspace/my_taco/tns/vast-2015-mc1-3d.tns -A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) -B1_dimension: 165427, B2_dimension: 11374, B3_dimension: 2, vals: 26021854 -C1_dimension: 2, C2_dimension: 32, vals: 64 -D1_dimension: 32, D2_dimension: 64, vals: 2048 - - -kernel execution time: 3424.23 ms -fused time: 3426.81 - -reference impl time - -kernel execution time: 3023.35 ms -reference time: 3025.97 - -kernel execution time: 3086.35 ms -reference new time: 3089.41 - -schedule 1 - -kernel execution time: 2913.43 ms -TTM1: 2915.13 - -kernel execution time: 2623.7 ms -TTM2: 2625.65 - -schedule 2 - -kernel execution time: 5.28416 ms -dense: 6.61329 - -kernel execution time: 1971.48 ms -TTM after dense: 1972.7 - -ttm-ttm execution - ------------------------------------------ - -file: /home/min/a/kadhitha/workspace/my_taco/tns/vast-2015-mc1-3d.tns 
----------------------------------------------------------------- -/home/min/a/kadhitha/workspace/my_taco/tns/vast-2015-mc1-3d.tns -A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) -B1_dimension: 165427, B2_dimension: 11374, B3_dimension: 2, vals: 26021854 -C1_dimension: 2, C2_dimension: 32, vals: 64 -D1_dimension: 32, D2_dimension: 64, vals: 2048 - - -kernel execution time: 3693.12 ms -fused time: 3695.79 - -reference impl time - -kernel execution time: 2900.73 ms -reference time: 2902.96 - -kernel execution time: 3138.83 ms -reference new time: 3141.16 - -schedule 1 - -kernel execution time: 2673.94 ms -TTM1: 2675.57 - -kernel execution time: 2703.37 ms -TTM2: 2705.31 - -schedule 2 - -kernel execution time: 5.31585 ms -dense: 7.12051 - -kernel execution time: 1724.31 ms -TTM after dense: 1726.36 - -ttm-ttm execution - ------------------------------------------ - -file: /home/min/a/kadhitha/workspace/my_taco/tns/vast-2015-mc1-3d.tns ----------------------------------------------------------------- -/home/min/a/kadhitha/workspace/my_taco/tns/vast-2015-mc1-3d.tns -A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) -B1_dimension: 165427, B2_dimension: 11374, B3_dimension: 2, vals: 26021854 -C1_dimension: 2, C2_dimension: 32, vals: 64 -D1_dimension: 32, D2_dimension: 64, vals: 2048 - - -kernel execution time: 3572.56 ms -fused time: 3575.03 - -reference impl time - -kernel execution time: 2939.46 ms -reference time: 2941.84 - -kernel execution time: 3182.38 ms -reference new time: 3184.81 - -schedule 1 - -kernel execution time: 2731.33 ms -TTM1: 2733.2 - -kernel execution time: 2782.07 ms -TTM2: 2784.32 - -schedule 2 - -kernel execution time: 5.52055 ms -dense: 7.06503 - -kernel execution time: 1729.87 ms -TTM after dense: 1730.87 - -ttm-ttm execution - ------------------------------------------ - -file: /home/min/a/kadhitha/workspace/my_taco/tns/darpa1998.tns - -ttm-ttm execution - ------------------------------------------ - -file: /home/min/a/kadhitha/workspace/my_taco/tns/darpa1998.tns 
----------------------------------------------------------------- -/home/min/a/kadhitha/workspace/my_taco/tns/darpa1998.tns -A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) -B1_dimension: 22476, B2_dimension: 22476, B3_dimension: 23776223, vals: 28421307 -C1_dimension: 23776223, C2_dimension: 32, vals: 760839136 -D1_dimension: 32, D2_dimension: 64, vals: 2048 - - -kernel execution time: 1404.79 ms -fused time: 1406.83 - -reference impl time - -kernel execution time: 28471.3 ms -reference time: 28474.9 - -kernel execution time: 5689.54 ms -reference new time: 5692.1 - -schedule 1 - -kernel execution time: 3526.34 ms -TTM1: 3528.66 - -kernel execution time: 21.5542 ms -TTM2: 23.6182 - -schedule 2 - -kernel execution time: 6069.99 ms -dense: 6071.91 - -kernel execution time: 6163.35 ms -TTM after dense: 6165.73 - -ttm-ttm execution - ------------------------------------------ - -file: /home/min/a/kadhitha/workspace/my_taco/tns/darpa1998.tns ----------------------------------------------------------------- -/home/min/a/kadhitha/workspace/my_taco/tns/darpa1998.tns -A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) -B1_dimension: 22476, B2_dimension: 22476, B3_dimension: 23776223, vals: 28421307 -C1_dimension: 23776223, C2_dimension: 32, vals: 760839136 -D1_dimension: 32, D2_dimension: 64, vals: 2048 - - -kernel execution time: 1390.55 ms -fused time: 1392.48 - -reference impl time - -kernel execution time: 30840.6 ms -reference time: 30843.4 - -kernel execution time: 5638.37 ms -reference new time: 5641.01 - -schedule 1 - -kernel execution time: 3642.19 ms -TTM1: 3644.13 - -kernel execution time: 24.3447 ms -TTM2: 25.6449 - -schedule 2 - -kernel execution time: 6027.41 ms -dense: 6029.82 - -kernel execution time: 6494.21 ms -TTM after dense: 6497.33 - -ttm-ttm execution - ------------------------------------------ - -file: /home/min/a/kadhitha/workspace/my_taco/tns/vast-2015-mc1-3d.tns ----------------------------------------------------------------- 
-/home/min/a/kadhitha/workspace/my_taco/tns/vast-2015-mc1-3d.tns -A(i,j,m) = B(i,j,k) * C(k,l) * D(l,m) -B1_dimension: 165427, B2_dimension: 11374, B3_dimension: 2, vals: 26021854 -C1_dimension: 2, C2_dimension: 32, vals: 64 -D1_dimension: 32, D2_dimension: 64, vals: 2048 - - -kernel execution time: 3727.32 ms -fused time: 3729.78 - -reference impl time - -kernel execution time: 2996.48 ms -reference time: 2999.42 - -kernel execution time: 3216.53 ms -reference new time: 3218.79 - -schedule 1 - -kernel execution time: 2902.94 ms -TTM1: 2904.86 - -kernel execution time: 2722.22 ms -TTM2: 2724.59 - -schedule 2 - -kernel execution time: 5.8157 ms -dense: 7.48208 - -kernel execution time: 1725.24 ms -TTM after dense: 1726.69 diff --git a/test/tests-scheduling-fuse.cpp b/test/tests-scheduling-fuse.cpp index 41fb86f6f..1a941175c 100644 --- a/test/tests-scheduling-fuse.cpp +++ b/test/tests-scheduling-fuse.cpp @@ -5,19 +5,12 @@ #include #include "gtest/gtest.h" #include -#include #define NUM_THREADS_TO_USE 1 // #define NUM_THREADS_TO_USE 32 -void handle_error (int retval) -{ - printf("PAPI error %d: %s\n", retval, PAPI_strerror(retval)); - exit(1); -} - TEST(scheduling_eval, spmvFusedWithSyntheticData) { - if (should_use_CUDA_codegen() || should_use_ISPC_codegen()) { + if (should_use_CUDA_codegen()) { return; } taco_set_num_threads(NUM_THREADS_TO_USE); @@ -146,7 +139,7 @@ TEST(scheduling_eval, spmvFusedWithSyntheticData) { } TEST(scheduling_eval, spmvFused) { - if (should_use_CUDA_codegen() || should_use_ISPC_codegen()) { + if (should_use_CUDA_codegen()) { return; } @@ -333,7 +326,7 @@ TEST(scheduling_eval, spmvFused) { } TEST(scheduling_eval, sddmmFusedWithSyntheticData) { - if (should_use_CUDA_codegen() || should_use_ISPC_codegen()) { + if (should_use_CUDA_codegen()) { return; } @@ -494,7 +487,7 @@ IndexStmt scheduleSDDMMCPU_forfuse(IndexStmt stmt, Tensor B, int CHUNK_S } TEST(scheduling_eval, sddmmFused) { - if (should_use_CUDA_codegen() || should_use_ISPC_codegen()) { 
+ if (should_use_CUDA_codegen()) { return; } @@ -775,7 +768,7 @@ TEST(scheduling_eval, sddmmFused) { TEST(scheduling_eval, hadamardFused) { - if (should_use_CUDA_codegen() || should_use_ISPC_codegen()) { + if (should_use_CUDA_codegen()) { return; } @@ -1047,7 +1040,7 @@ TEST(scheduling_eval, hadamardFused) { TEST(scheduling_eval, mttkrpFusedWithSyntheticData) { - if (should_use_CUDA_codegen() || should_use_ISPC_codegen()) { + if (should_use_CUDA_codegen()) { return; } taco_set_num_threads(NUM_THREADS_TO_USE); @@ -1187,7 +1180,7 @@ TEST(scheduling_eval, mttkrpFusedWithSyntheticData) { TEST(scheduling_eval, mttkrpFused) { - if (should_use_CUDA_codegen() || should_use_ISPC_codegen()) { + if (should_use_CUDA_codegen()) { return; } @@ -1602,13 +1595,6 @@ TEST(scheduling_eval, ttmFused) { return; } - int retval, EventSet = PAPI_NULL; - retval = PAPI_hl_region_begin("dummy"); - if ( retval != PAPI_OK ) handle_error(1); - - retval = PAPI_hl_region_end("dummy"); - if ( retval != PAPI_OK ) handle_error(1); - taco_set_num_threads(NUM_THREADS_TO_USE); ofstream statfile; @@ -1806,9 +1792,7 @@ TEST(scheduling_eval, ttmFused) { // TOOL_BENCHMARK_TIMER(ref.compute(), "\n\nReference ISPC: ", timevalue); std::string sofile_fused = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/ttm_ttm/fused.so"; - retval = PAPI_hl_region_begin("fusedTTM"); if ( retval != PAPI_OK ) handle_error(1); TOOL_BENCHMARK_TIMER(A.compute(statfile, sofile_fused), "\n\nFused TTM->TTM: ", timevalue); - retval = PAPI_hl_region_end("fusedTTM"); if ( retval != PAPI_OK ) handle_error(1); if (statfile.is_open()) { statfile << "fused time: "; statfile << timevalue.mean << std::endl; @@ -1822,9 +1806,7 @@ TEST(scheduling_eval, ttmFused) { statfile << "\nreference impl time \n"; std::string sofile_original = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/ttm_ttm/ttm_original.so"; - retval = PAPI_hl_region_begin("referenceTTM"); if ( retval != PAPI_OK ) handle_error(1); 
TOOL_BENCHMARK_TIMER(ref.compute(statfile, sofile_original), "\n\nReference TTM->TTM: ", timevalue); - retval = PAPI_hl_region_end("referenceTTM"); if ( retval != PAPI_OK ) handle_error(1); if (statfile.is_open()) { statfile << "reference time: "; statfile << timevalue.mean << std::endl; @@ -1836,9 +1818,7 @@ TEST(scheduling_eval, ttmFused) { } std::string sofile_original2 = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/ttm_ttm/ttm_original2.so"; - retval = PAPI_hl_region_begin("ref2TTM"); if ( retval != PAPI_OK ) handle_error(1); TOOL_BENCHMARK_TIMER(refn.compute(statfile, sofile_original2), "\n\nReference new TTM->TTM: ", timevalue); - retval = PAPI_hl_region_end("ref2TTM"); if ( retval != PAPI_OK ) handle_error(1); if (statfile.is_open()) { statfile << "reference new time: "; statfile << timevalue.mean << std::endl; @@ -1852,9 +1832,7 @@ TEST(scheduling_eval, ttmFused) { } std::string sofile_ttm11 = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/ttm_ttm/ttm1_1.so"; - retval = PAPI_hl_region_begin("ttm1_1"); if ( retval != PAPI_OK ) handle_error(1); TOOL_BENCHMARK_TIMER(ref1.compute(statfile, sofile_ttm11), "\n\nTTM1: ", timevalue); - retval = PAPI_hl_region_end("ttm1_1"); if ( retval != PAPI_OK ) handle_error(1); if (statfile.is_open()) { statfile << "TTM1: "; statfile << timevalue.mean << std::endl; @@ -1866,9 +1844,7 @@ TEST(scheduling_eval, ttmFused) { } std::string sofile_ttm2 = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/ttm_ttm/ttm2.so"; - retval = PAPI_hl_region_begin("ttm2"); if ( retval != PAPI_OK ) handle_error(1); TOOL_BENCHMARK_TIMER(ref2.compute(statfile, sofile_ttm2), "\n\nTTM2: ", timevalue); - retval = PAPI_hl_region_end("ttm2"); if ( retval != PAPI_OK ) handle_error(1); if (statfile.is_open()) { statfile << "TTM2: "; statfile << timevalue.mean << std::endl; @@ -1881,9 +1857,7 @@ TEST(scheduling_eval, ttmFused) { statfile << "\nschedule 2\n"; - retval = PAPI_hl_region_begin("gemm"); if ( retval != PAPI_OK ) 
handle_error(1); TOOL_BENCHMARK_TIMER(ref3.compute(statfile), "\n\ndense: ", timevalue); - retval = PAPI_hl_region_end("gemm"); if ( retval != PAPI_OK ) handle_error(1); if (statfile.is_open()) { statfile << "dense: "; statfile << timevalue.mean << std::endl; @@ -1895,9 +1869,7 @@ TEST(scheduling_eval, ttmFused) { } std::string sofile_ttm12 = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/ttm_ttm/ttm1_2.so"; - retval = PAPI_hl_region_begin("ttm1_2"); if ( retval != PAPI_OK ) handle_error(1); TOOL_BENCHMARK_TIMER(ref4.compute(statfile, sofile_ttm12), "\n\nTTM after dense: ", timevalue); - retval = PAPI_hl_region_end("ttm1_2"); if ( retval != PAPI_OK ) handle_error(1); if (statfile.is_open()) { statfile << "TTM after dense: "; statfile << timevalue.mean << std::endl; @@ -1965,7 +1937,7 @@ TEST(scheduling_eval, ttmFused) { TEST(scheduling_eval, spmmFusedWithSyntheticData) { - if (should_use_CUDA_codegen() || should_use_ISPC_codegen()) { + if (should_use_CUDA_codegen()) { return; } @@ -2127,19 +2099,10 @@ TEST(scheduling_eval, spmmFusedWithSyntheticData) { TEST(scheduling_eval, spmmFused) { - if (should_use_CUDA_codegen() || should_use_ISPC_codegen()) { + if (should_use_CUDA_codegen()) { return; } - // int retval, EventSet = PAPI_NULL; - // retval = PAPI_hl_region_begin("dummy"); - // if ( retval != PAPI_OK ) handle_error(1); - - /* Do some computation */ - - // retval = PAPI_hl_region_end("dummy"); - // if ( retval != PAPI_OK ) handle_error(1); - taco_set_num_threads(NUM_THREADS_TO_USE); ofstream statfile; @@ -2391,42 +2354,26 @@ TEST(scheduling_eval, spmmFused) { statfile << "\n--------- 1st pattern computation TTM, GEMM\n"; - // retval = PAPI_hl_region_begin("spmm"); - // if ( retval != PAPI_OK ) handle_error(1); TOOL_BENCHMARK_TIMER(ref1.compute(statfile), "\n\nSpMM Kernel: ", timevalue); - // retval = PAPI_hl_region_end("spmm"); - // if ( retval != PAPI_OK ) handle_error(1); if (statfile.is_open()) { statfile << "SpMM time: "; statfile << timevalue.mean 
<< std::endl; } else { std::cout << " stat file is not open\n"; } - std::string sofile_spmm_template = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/csr_dense_spmm.so"; - // retval = PAPI_hl_region_begin("spmmtemplate"); - // if ( retval != PAPI_OK ) handle_error(1); + std::string sofile_spmm_template = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/sddmm_spmm/csr_dense_spmm.so"; TOOL_BENCHMARK_TIMER(ref1.compute(statfile, sofile_spmm_template), "\n\nSpMM template Kernel: ", timevalue); - // retval = PAPI_hl_region_end("spmmtemplate"); - // if ( retval != PAPI_OK ) handle_error(1); if (statfile.is_open()) { statfile << "SpMM template time: "; statfile << timevalue.mean << std::endl; } else { std::cout << " stat file is not open\n"; } - - // retval = PAPI_hl_region_begin("gemm"); - // if ( retval != PAPI_OK ) handle_error(1); + TOOL_BENCHMARK_TIMER(ref2.compute(statfile), "\n\nGeMM Kernel: ", timevalue); - // retval = PAPI_hl_region_end("gemm"); - // if ( retval != PAPI_OK ) handle_error(1); if (statfile.is_open()) { statfile << "GeMM time: "; statfile << timevalue.mean << std::endl; } else { std::cout << " stat file is not open\n"; } - // retval = PAPI_hl_region_begin("gemmtemplate"); - // if ( retval != PAPI_OK ) handle_error(1); - TOOL_BENCHMARK_TIMER(ref2_2.compute(statfile), "\n\nref GeMM template Kernel: ", timevalue); - // retval = PAPI_hl_region_end("gemmtemplate"); - // if ( retval != PAPI_OK ) handle_error(1); + TOOL_BENCHMARK_TIMER(ref2_2.compute(statfile), "\n\nref GeMM template Kernel: ", timevalue); if (statfile.is_open()) { statfile << "ref 2 GeMM template time: "; statfile << timevalue.mean << std::endl; @@ -2434,21 +2381,13 @@ TEST(scheduling_eval, spmmFused) { // std::string sofile_gemm_template = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/spmm_gemm/spmm_template.so"; statfile << "\n--------- 2nd pattern computation GEMM, SpMM\n"; - // retval = PAPI_hl_region_begin("gemmtemplate2"); - // if ( retval 
!= PAPI_OK ) handle_error(1); - TOOL_BENCHMARK_TIMER(ref3.compute(statfile), "\n\nGeMM template ref3 Kernel: ", timevalue); - // retval = PAPI_hl_region_end("gemmtemplate2"); - // if ( retval != PAPI_OK ) handle_error(1); + TOOL_BENCHMARK_TIMER(ref3.compute(statfile), "\n\nGeMM template ref3 Kernel: ", timevalue); if (statfile.is_open()) { statfile << "ref3 GeMM template time: "; statfile << timevalue.mean << std::endl; } else { std::cout << " stat file is not open\n"; } - // retval = PAPI_hl_region_begin("spmm2"); - // if ( retval != PAPI_OK ) handle_error(1); TOOL_BENCHMARK_TIMER(ref4.compute(statfile, sofile_spmm_template), "\n\nSpMM template Kernel ref4: ", timevalue); - // retval = PAPI_hl_region_end("spmm2"); - // if ( retval != PAPI_OK ) handle_error(1); if (statfile.is_open()) { statfile << "SpMM template time ref4: "; statfile << timevalue.mean << std::endl; @@ -2457,32 +2396,20 @@ TEST(scheduling_eval, spmmFused) { statfile << "\n-------- reference pattern computation\n"; - // retval = PAPI_hl_region_begin("ref"); - // if ( retval != PAPI_OK ) handle_error(1); - TOOL_BENCHMARK_TIMER(ref.compute(statfile), "\n\nReference Kernel: ", timevalue); - // retval = PAPI_hl_region_end("ref"); - // if ( retval != PAPI_OK ) handle_error(1); + TOOL_BENCHMARK_TIMER(ref.compute(statfile), "\n\nReference Kernel: ", timevalue); if (statfile.is_open()) { statfile << "taco reference time: "; statfile << timevalue << std::endl; } else { std::cout << " stat file is not open\n"; } - // retval = PAPI_hl_region_begin("refnew"); - // if ( retval != PAPI_OK ) handle_error(1); - TOOL_BENCHMARK_TIMER(refn.compute(statfile), "\n\nReference new Kernel: ", timevalue); - // retval = PAPI_hl_region_end("refnew"); - // if ( retval != PAPI_OK ) handle_error(1); + TOOL_BENCHMARK_TIMER(refn.compute(statfile), "\n\nReference new Kernel: ", timevalue); if (statfile.is_open()) { statfile << "taco reference new time: "; statfile << timevalue << std::endl; } else { std::cout << " stat file is not 
open\n"; } - // retval = PAPI_hl_region_begin("sparselnr"); - // if ( retval != PAPI_OK ) handle_error(1); TOOL_BENCHMARK_TIMER(A.compute(statfile), "\n\nFused Kernel: ", timevalue); - // retval = PAPI_hl_region_end("sparselnr"); - // if ( retval != PAPI_OK ) handle_error(1); if (statfile.is_open()) { statfile << "fused time: "; statfile << timevalue.mean << std::endl; @@ -2532,44 +2459,6 @@ TEST(scheduling_eval, spmmFused) { statfile.close(); } - - // unsigned int native = 0x0; - - // retval = PAPI_library_init(PAPI_VER_CURRENT); - - // if (retval != PAPI_VER_CURRENT) { - // printf("PAPI library init error!\n"); - // exit(1); - // } else { - // printf("PAPI library init success\n"); - // } - - // if (PAPI_create_eventset(&EventSet) != PAPI_OK) { - // handle_error(1); - // } - - // /* Add the native event */ - // native = () - - // retval = PAPI_hl_region_begin("computation1"); - // if ( retval != PAPI_OK ) - // handle_error(1); - - // /* Do some computation */ - - // retval = PAPI_hl_region_end("computation1"); - // if ( retval != PAPI_OK ) - // handle_error(1); - - // retval = PAPI_hl_region_begin("computation2"); - // if ( retval != PAPI_OK ) - // handle_error(1); - - // /* Do some computation */ - - // retval = PAPI_hl_region_end("computation2"); - // if ( retval != PAPI_OK ) - // handle_error(1); } @@ -2578,7 +2467,7 @@ TEST(scheduling_eval, spmmFused) { TEST(scheduling_eval, sddmmspmmFused) { - if (should_use_CUDA_codegen() || should_use_ISPC_codegen()) { + if (should_use_CUDA_codegen()) { return; } diff --git a/test/tests-scheduling-ispc-eval.cpp b/test/tests-scheduling-ispc-eval.cpp deleted file mode 100644 index 139597f9c..000000000 --- a/test/tests-scheduling-ispc-eval.cpp +++ /dev/null @@ -1,2 +0,0 @@ - - From 0cfa13368ba4f62d40e8413ede296a8e36eccefc Mon Sep 17 00:00:00 2001 From: Adhhitha Dias Date: Wed, 11 May 2022 14:19:55 -0400 Subject: [PATCH 12/16] remove papi and ispc test cases --- CMakeLists.txt | 33 - test/tests-indexstmt.cpp | 1 - 
test/tests-scheduling-eval.cpp | 1543 ++------------------------------ test/tests-scheduling-fuse.cpp | 626 ++++++------- test/util.h | 27 - 5 files changed, 400 insertions(+), 1830 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c9012ca2d..e9ec7be7a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -95,39 +95,6 @@ if(OPENMP) set(C_CXX_FLAGS "-fopenmp ${C_CXX_FLAGS}") endif(OPENMP) -set(PAPI_DIR "/home/min/a/kadhitha/workspace/my_taco/papi/src/install/") - -find_path(PAPI_DIR - NAMES include/papi.h -) - -find_library(PAPI_LIBRARIES - # Pick the static library first for easier run-time linking. - NAMES libpapi.a papi - HINTS ${PAPI_DIR}/lib ${HILTIDEPS}/lib -) - -find_path(PAPI_INCLUDE_DIRS - NAMES papi.h - HINTS ${PAPI_DIR}/include ${HILTIDEPS}/include -) - -include(FindPackageHandleStandardArgs) -find_package_handle_standard_args(PAPI DEFAULT_MSG - PAPI_LIBRARIES - PAPI_INCLUDE_DIRS -) - -mark_as_advanced( - PAPI_PREFIX_DIRS - PAPI_LIBRARIES - PAPI_INCLUDE_DIRS -) - -include_directories(${PAPI_INCLUDE_DIRS}) - -# project (ValgrindExample) - if(COVERAGE) find_program(PATH_TO_GCOVR gcovr REQUIRED) # add coverage tooling to build flags diff --git a/test/tests-indexstmt.cpp b/test/tests-indexstmt.cpp index 123bea3e6..ae80e5493 100644 --- a/test/tests-indexstmt.cpp +++ b/test/tests-indexstmt.cpp @@ -112,7 +112,6 @@ TEST(indexstmt, sddmm) { std::cout << "topologically reordered loops statement: " << spmm << std::endl; Kernel kernel = compile(spmm); - kernel.compute(); } TEST(indexstmt, sddmmPlusSpmm) { diff --git a/test/tests-scheduling-eval.cpp b/test/tests-scheduling-eval.cpp index 29a7e512e..3c5362118 100644 --- a/test/tests-scheduling-eval.cpp +++ b/test/tests-scheduling-eval.cpp @@ -10,14 +10,6 @@ IndexStmt scheduleSpMVCPU(IndexStmt stmt, int CHUNK_SIZE=16) { .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces); } -IndexStmt scheduleSpMVISPC(IndexStmt stmt, int CHUNK_SIZE=16) { - IndexVar i0("i0"), i1("i1"), kpos("kpos"), 
kpos0("kpos0"), kpos1("kpos1"); - // return stmt; - return stmt.split(i, i0, i1, CHUNK_SIZE) - .reorder({i0, i1, j}) - .parallelize(i0, ParallelUnit::CPUSimd, OutputRaceStrategy::NoRaces); -} - IndexStmt scheduleSpMMCPU(IndexStmt stmt, Tensor A, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { IndexVar i0("i0"), i1("i1"), kbounded("kbounded"), k0("k0"), k1("k1"), jpos("jpos"), jpos0("jpos0"), jpos1("jpos1"); return stmt.split(i, i0, i1, CHUNK_SIZE) @@ -28,80 +20,6 @@ IndexStmt scheduleSpMMCPU(IndexStmt stmt, Tensor A, int CHUNK_SIZE=16, i .parallelize(k, ParallelUnit::CPUVector, OutputRaceStrategy::IgnoreRaces); } -IndexStmt scheduleSpMMISPC1(IndexStmt stmt, Tensor A, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { - IndexVar i0("i0"), i1("i1"), kbounded("kbounded"), k0("k0"), k1("k1"), jpos("jpos"), jpos0("jpos0"), jpos1("jpos1"); - return stmt.split(i, i0, i1, CHUNK_SIZE) - .pos(j, jpos, A(i,j)) - .split(jpos, jpos0, jpos1, UNROLL_FACTOR) - .reorder({i0, i1, jpos0, k, jpos1}) - // .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces) - .parallelize(k, ParallelUnit::CPUSimd, OutputRaceStrategy::IgnoreRaces); -} - -IndexStmt scheduleSpMMISPCOMP1(IndexStmt stmt, Tensor A, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { - IndexVar i0("i0"), i1("i1"), kbounded("kbounded"), k0("k0"), k1("k1"), jpos("jpos"), jpos0("jpos0"), jpos1("jpos1"); - return stmt.split(i, i0, i1, CHUNK_SIZE) - .pos(j, jpos, A(i,j)) - .split(jpos, jpos0, jpos1, UNROLL_FACTOR) - .reorder({i0, i1, jpos0, k, jpos1}) - .parallelize(i0, ParallelUnit::CPUSpmd, OutputRaceStrategy::NoRaces) - .parallelize(k, ParallelUnit::CPUSimd, OutputRaceStrategy::IgnoreRaces); -} - -IndexStmt scheduleSpMMISPC1_2(IndexStmt stmt, Tensor A, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { - IndexVar i0("i0"), i1("i1"), kbounded("kbounded"), k0("k0"), k1("k1"), jpos("jpos"), jpos0("jpos0"), jpos1("jpos1"); - return stmt.split(i, i0, i1, CHUNK_SIZE) - .pos(j, jpos, A(i,j)) - .split(jpos, jpos0, jpos1, UNROLL_FACTOR) - .reorder({i0, 
i1, jpos0, k, jpos1}) - // .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces) - .parallelize(i0, ParallelUnit::CPUSimd, OutputRaceStrategy::IgnoreRaces); -} - -IndexStmt scheduleSpMMISPC1_3(IndexStmt stmt, Tensor A, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { - IndexVar i0("i0"), i1("i1"), kbounded("kbounded"), k0("k0"), k1("k1"), jpos("jpos"), jpos0("jpos0"), jpos1("jpos1"); - return stmt.split(i, i0, i1, CHUNK_SIZE) - .pos(j, jpos, A(i,j)) - .split(jpos, jpos0, jpos1, UNROLL_FACTOR) - .reorder({i0, i1, jpos0, k, jpos1}) - // .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces) - .parallelize(i1, ParallelUnit::CPUSimd, OutputRaceStrategy::IgnoreRaces); -} - -IndexStmt scheduleSpMMISPC2(IndexStmt stmt, Tensor A, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { - IndexVar i0("i0"), i1("i1"), kbounded("kbounded"), k0("k0"), k1("k1"), jpos("jpos"), jpos0("jpos0"), jpos1("jpos1"); - return stmt - .parallelize(k, ParallelUnit::CPUSimd, OutputRaceStrategy::IgnoreRaces); -} - -IndexStmt scheduleSpMMISPC2_2(IndexStmt stmt, Tensor A, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { - IndexVar i0("i0"), i1("i1"), kbounded("kbounded"), k0("k0"), k1("k1"), jpos("jpos"), jpos0("jpos0"), jpos1("jpos1"); - return stmt - .parallelize(i, ParallelUnit::CPUSimd, OutputRaceStrategy::IgnoreRaces); -} - -IndexStmt scheduleSpMMISPC3(IndexStmt stmt, Tensor A, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { - IndexVar i0("i0"), i1("i1"), kbounded("kbounded"), k0("k0"), k1("k1"), jpos("jpos"), jpos0("jpos0"), jpos1("jpos1"); - return stmt - // .split(i, i0, i1, CHUNK_SIZE) - // .pos(j, jpos, A(i,j)) - // .split(jpos, jpos0, jpos1, UNROLL_FACTOR) - .reorder({j, k}) - // .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces) - .parallelize(k, ParallelUnit::CPUSimd, OutputRaceStrategy::IgnoreRaces); -} - -IndexStmt scheduleSpMMISPC3_2(IndexStmt stmt, Tensor A, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { - IndexVar i0("i0"), i1("i1"), kbounded("kbounded"), 
k0("k0"), k1("k1"), jpos("jpos"), jpos0("jpos0"), jpos1("jpos1"); - return stmt - // .split(i, i0, i1, CHUNK_SIZE) - // .pos(j, jpos, A(i,j)) - // .split(jpos, jpos0, jpos1, UNROLL_FACTOR) - .reorder({j, k}) - // .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces) - .parallelize(i, ParallelUnit::CPUSimd, OutputRaceStrategy::IgnoreRaces); -} - IndexStmt scheduleSpGEMMCPU(IndexStmt stmt, bool doPrecompute) { Assignment assign = stmt.as().getStmt().as().getStmt() .as().getStmt().as(); @@ -176,47 +94,6 @@ IndexStmt scheduleSDDMM2CPU(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, .parallelize(jpos1, ParallelUnit::CPUVector, OutputRaceStrategy::ParallelReduction); } -IndexStmt scheduleSDDMMISPC(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { - IndexVar i0("i0"), i1("i1"), kpos("kpos"), kpos0("kpos0"), kpos1("kpos1"); - return stmt.split(i, i0, i1, CHUNK_SIZE) - .pos(k, kpos, B(i,k)) - .split(kpos, kpos0, kpos1, UNROLL_FACTOR) - .reorder({i0, i1, kpos0, j, kpos1}) - // .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces); - .parallelize(kpos1, ParallelUnit::CPUSimd, OutputRaceStrategy::ParallelReduction); -} - -IndexStmt scheduleSDDMM2ISPC(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { - IndexVar i0("i0"), i1("i1"), jpos("jpos"), jpos0("jpos0"), jpos1("jpos1"); - return stmt.split(i, i0, i1, CHUNK_SIZE) - .pos(j, jpos, B(i,j)) - .split(jpos, jpos0, jpos1, UNROLL_FACTOR) - .reorder({i0, i1, jpos0, k, jpos1}) - // .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces); - .parallelize(jpos1, ParallelUnit::CPUSimd, OutputRaceStrategy::ParallelReduction); -} - -IndexStmt scheduleSDDMMISPC1(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { - IndexVar i0("i0"), i1("i1"), kpos("kpos"), kpos0("kpos0"), kpos1("kpos1"); - return stmt.split(i, i0, i1, CHUNK_SIZE) - .pos(k, kpos, B(i,k)) - .split(kpos, kpos0, kpos1, UNROLL_FACTOR) - .reorder({i0, i1, kpos0, j, kpos1}) - 
.parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces) - .parallelize(kpos1, ParallelUnit::CPUSimd, OutputRaceStrategy::ParallelReduction); -} - -IndexStmt scheduleSDDMMISPC2(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { - IndexVar i0("i0"), i1("i1"), kpos("kpos"), kpos0("kpos0"), kpos1("kpos1"); - return stmt; - // .split(i, i0, i1, CHUNK_SIZE) - // .pos(k, kpos, B(i,k)) - // .split(kpos, kpos0, kpos1, UNROLL_FACTOR) - // .reorder({i0, i1, kpos0, j, kpos1}) - // .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces) - // .parallelize(kpos1, ParallelUnit::CPUSimd, OutputRaceStrategy::ParallelReduction); -} - IndexStmt scheduleTTVCPU(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16) { IndexVar f("f"), fpos("fpos"), chunk("chunk"), fpos2("fpos2"); return stmt.fuse(i, j, f) @@ -226,16 +103,6 @@ IndexStmt scheduleTTVCPU(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16) { .parallelize(chunk, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces); } -IndexStmt scheduleTTVISPC(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16) { - IndexVar f("f"), fpos("fpos"), chunk("chunk"), fpos2("fpos2"); - // return stmt; - return stmt.fuse(i, j, f) - .pos(f, fpos, B(i,j,k)) - .split(fpos, chunk, fpos2, CHUNK_SIZE) - .reorder({chunk, fpos2, k}) - .parallelize(chunk, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces); -} - IndexStmt scheduleTTVCPUCSR(IndexStmt stmt) { TensorVar result = stmt.as().getStmt().as().getStmt() .as().getStmt().as().getLhs() @@ -252,18 +119,6 @@ IndexStmt scheduleTTVCPUCSR_ST(IndexStmt stmt) { return stmt.assemble(result, AssembleStrategy::Insert); } -IndexStmt scheduleTTVISPCCSR(IndexStmt stmt) { - TensorVar result = stmt.as().getStmt().as().getStmt() - .as().getStmt().as().getLhs() - .getTensorVar(); - return stmt.assemble(result, AssembleStrategy::Insert) - .parallelize(i, ParallelUnit::CPUSimd, OutputRaceStrategy::NoRaces); -} - -IndexStmt scheduleTTVISPCCSR2(IndexStmt stmt) { - return stmt; -} - IndexStmt 
scheduleTTMCPU(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { IndexVar f("f"), fpos("fpos"), chunk("chunk"), fpos2("fpos2"), kpos("kpos"), kpos1("kpos1"), kpos2("kpos2"); return stmt.fuse(i, j, f) @@ -301,18 +156,6 @@ IndexStmt scheduleMTTKRPCPU_ST(IndexStmt stmt, Tensor B, int CHUNK_SIZE= // .parallelize(i1, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces); } -IndexStmt scheduleMTTKRPISPC(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { - IndexVar i1("i1"), i2("i2"); - IndexExpr precomputeExpr = stmt.as().getStmt().as().getStmt() - .as().getStmt().as().getStmt() - .as().getRhs().as().getA(); - TensorVar w("w", Type(Float64, {Dimension(j)}), taco::dense); - return stmt.split(i, i1, i2, CHUNK_SIZE) - .reorder({i1, i2, k, l, j}) - .precompute(precomputeExpr, j, j, w) - .parallelize(j, ParallelUnit::CPUSimd, OutputRaceStrategy::NoRaces); -} - IndexStmt scheduleMTTKRPPrecomputedCPU(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { IndexVar i1("i1"), i2("i2"), j_pre("j_pre"); return stmt.split(i, i1, i2, CHUNK_SIZE) @@ -324,11 +167,6 @@ IndexStmt scheduleMTTKRPPrecomputedCPU_ST(IndexStmt stmt, Tensor B, int return stmt.split(i, i1, i2, CHUNK_SIZE); } -IndexStmt scheduleMTTKRPPrecomputedISPC_ST(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { - IndexVar i1("i1"), i2("i2"), j_pre("j_pre"); - return stmt.parallelize(j, ParallelUnit::CPUSimd, OutputRaceStrategy::NoRaces); -} - IndexStmt scheduleMTTKRP4CPU(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { IndexVar i1("i1"), i2("i2"); return stmt.split(i, i1, i2, CHUNK_SIZE) @@ -342,13 +180,6 @@ IndexStmt scheduleMTTKRP4CPU_ST(IndexStmt stmt, Tensor B, int CHUNK_SIZE .reorder({i1, i2, k, l, m, j}); } -IndexStmt scheduleMTTKRP4ISPC_ST(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { - IndexVar i1("i1"), i2("i2"); - return stmt.split(i, i1, i2, CHUNK_SIZE) - .reorder({i1, i2, k, l, m, j}) - .parallelize(j, 
ParallelUnit::CPUSimd, OutputRaceStrategy::NoRaces); -} - IndexStmt scheduleMTTKRP5CPU(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { IndexVar i1("i1"), i2("i2"); return stmt.split(i, i1, i2, CHUNK_SIZE) @@ -763,92 +594,6 @@ TEST(scheduling_eval, spmmCPU) { ASSERT_TENSOR_EQ(expected, C); } -TEST(scheduling_eval, spmmISPC) { - taco::util::TimeResults timevalue; - bool time = true; - - set_ISPC_codegen_enabled(false); - set_CUDA_codegen_enabled(false); - - int NUM_I = 1021/10; - int NUM_J = 1039/10; - int NUM_K = 128; - float SPARSITY = .1; - Tensor A("A", {NUM_I, NUM_J}, CSR); - Tensor B("B", {NUM_J, NUM_K}, {Dense, Dense}); - Tensor C("C", {NUM_I, NUM_K}, {Dense, Dense}); - - srand(75883); - for (int i = 0; i < NUM_I; i++) { - for (int j = 0; j < NUM_J; j++) { - float rand_float = (float)rand()/(float)(RAND_MAX); - if (rand_float < SPARSITY) { - A.insert({i, j}, (double) ((int) (rand_float*3/SPARSITY))); - } - } - } - - for (int j = 0; j < NUM_J; j++) { - for (int k = 0; k < NUM_K; k++) { - float rand_float = (float)rand()/(float)(RAND_MAX); - B.insert({j, k}, (double) ((int) (rand_float*3/SPARSITY))); - } - } - - A.pack(); - B.pack(); - - set_ISPC_codegen_enabled(true); - C(i, k) = A(i, j) * B(j, k); - - IndexStmt stmt = C.getAssignment().concretize(); - // stmt = scheduleSpMMISPC1(stmt, A); - // stmt = scheduleSpMMISPC1_2(stmt, A); - stmt = scheduleSpMMISPC1_3(stmt, A); - - // stmt = scheduleSpMMISPC2(stmt, A); - // stmt = scheduleSpMMISPC2_2(stmt, A); - - // stmt = scheduleSpMMISPC3(stmt, A); - // stmt = scheduleSpMMISPC3_2(stmt, A); - - //printToFile("spmm_cpu", stmt); - - C.compile(stmt); - C.assemble(); - C.compute(); - - set_ISPC_codegen_enabled(false); - Tensor expected("expected", {NUM_I, NUM_K}, {Dense, Dense}); - expected(i, k) = A(i, j) * B(j, k); - IndexStmt stmt_taco = expected.getAssignment().concretize(); - stmt_taco = scheduleSpMMCPU(stmt_taco, A); - - expected.compile(stmt_taco); - expected.assemble(); - expected.compute(); - 
ASSERT_TENSOR_EQ(expected, C); - - // float ERROR_MARGIN = 0.01; - // ASSERT_TENSOR_VAL(expected, y); - for (int i = 0; i < NUM_I; i++) { - for (int k = 0; k < NUM_K; k++) { - if (expected(i,k) <= C(i,k) + ERROR_MARGIN && expected(i,k) >= C(i,k) - ERROR_MARGIN) { - // std::cout << "matched values: expected -> " << expected(j) << " == " << y(j) << " <- actual\n"; - } - else { - std::cout << "unmatched values: expected -> " << expected(i,k) << " != " << C(i,k) << " <- actual\n"; - ASSERT_TRUE(false); - }; - } - } - - for (int i=0; i<10; i++) { - TOOL_BENCHMARK_TIMER(C.compute(), "Compute ISPC: ", timevalue); - TOOL_BENCHMARK_TIMER(expected.compute(), "Compute TACO: ", timevalue); - } -} - struct spgemm : public TestWithParam> {}; TEST_P(spgemm, scheduling_eval) { @@ -1093,7 +838,7 @@ TEST(scheduling_eval, sddmmCPU) { } TEST(scheduling_eval, sddmmSPMMFusedCPU) { - if (should_use_CUDA_codegen() || should_use_ISPC_codegen()) { + if (should_use_CUDA_codegen()) { return; } @@ -1272,355 +1017,87 @@ TEST(scheduling_eval, sddmm2CPU) { } - -// bin/taco-test --gtest_filter=scheduling_eval.sddmmISPC -TEST(scheduling_eval, sddmmISPC) { - - taco::util::TimeResults timevalue; - bool time = true; - - set_CUDA_codegen_enabled(false); - set_ISPC_codegen_enabled(false); - +TEST(scheduling_eval, spmvCPU) { + if (should_use_CUDA_codegen()) { + return; + } int NUM_I = 1021/10; int NUM_J = 1039/10; - int NUM_K = 1057/10; float SPARSITY = .3; - Tensor A("A", {NUM_I, NUM_K}, {Dense, Dense}); - Tensor B("B", {NUM_I, NUM_K}, CSR); - Tensor C("C", {NUM_I, NUM_J}, {Dense, Dense}); - Tensor D("D", {NUM_J, NUM_K}, {Dense, Dense}); + Tensor A("A", {NUM_I, NUM_J}, CSR); + Tensor x("x", {NUM_J}, Format({Dense})); + Tensor y("y", {NUM_I}, Format({Dense})); - srand(268238); + srand(120); for (int i = 0; i < NUM_I; i++) { for (int j = 0; j < NUM_J; j++) { - float rand_float = (float)rand()/(float)(RAND_MAX); - C.insert({i, j}, (double) ((int) (rand_float*3/SPARSITY))); - } - } - - for (int i = 0; i < 
NUM_I; i++) { - for (int k = 0; k < NUM_K; k++) { float rand_float = (float)rand()/(float)(RAND_MAX); if (rand_float < SPARSITY) { - B.insert({i, k}, (double) ((int) (rand_float*3/SPARSITY))); + A.insert({i, j}, (double) ((int) (rand_float * 3 / SPARSITY))); } } } for (int j = 0; j < NUM_J; j++) { - for (int k = 0; k < NUM_K; k++) { - float rand_float = (float)rand()/(float)(RAND_MAX); - D.insert({j, k}, (double) ((int) (rand_float*3/SPARSITY))); - } + float rand_float = (float)rand()/(float)(RAND_MAX); + x.insert({j}, (double) ((int) (rand_float*3/SPARSITY))); } - B.pack(); - C.pack(); - D.pack(); + x.pack(); + A.pack(); - set_ISPC_codegen_enabled(true); - A(i,k) = B(i,k) * C(i,j) * D(j,k); + y(i) = A(i, j) * x(j); - IndexStmt stmt = A.getAssignment().concretize(); - stmt = scheduleSDDMMISPC(stmt, B); + IndexStmt stmt = y.getAssignment().concretize(); + stmt = scheduleSpMVCPU(stmt); - //printToFile("sddmm_cpu", stmt); + //printToFile("spmv_cpu", stmt); - A.compile(stmt); - A.assemble(); - // A.compute(); + y.compile(stmt); + y.assemble(); + y.compute(); - set_ISPC_codegen_enabled(false); - Tensor expected("expected", {NUM_I, NUM_K}, {Dense, Dense}); - expected(i,k) = B(i,k) * C(i,j) * D(j,k); - IndexStmt stmt_taco = A.getAssignment().concretize(); - stmt_taco = scheduleSDDMMCPU(stmt_taco, B); - expected.compile(stmt_taco); + Tensor expected("expected", {NUM_I}, Format({Dense})); + expected(i) = A(i, j) * x(j); + expected.compile(); expected.assemble(); - // expected.compute(); - - TOOL_BENCHMARK_TIMER(A.compute(), "Compute ISPC: ", timevalue); - TOOL_BENCHMARK_TIMER(expected.compute(), "Compute TACO: ", timevalue); - - ASSERT_TENSOR_EQ(expected, A); + expected.compute(); + ASSERT_TENSOR_EQ(expected, y); +} +TEST(scheduling_eval, ttvCPU) { + if (should_use_CUDA_codegen()) { + return; + } + int NUM_I = 1021/10; + int NUM_J = 1039/10; + int NUM_K = 1057/10; + float SPARSITY = .3; + Tensor A("A", {NUM_I, NUM_J}, {Dense, Dense}); // TODO: change to sparse outputs + 
Tensor B("B", {NUM_I, NUM_J, NUM_K}, {Sparse, Sparse, Sparse}); + Tensor c("c", {NUM_K}, Format({Dense})); - // float ERROR_MARGIN = 0.01; - // ASSERT_TENSOR_VAL(expected, y); + srand(9536); for (int i = 0; i < NUM_I; i++) { - for (int k = 0; k < NUM_K; k++) { - if (expected(i,k) <= A(i,k) + ERROR_MARGIN && expected(i,k) >= A(i,k) - ERROR_MARGIN) { - // std::cout << "matched values: expected -> " << expected(j) << " == " << y(j) << " <- actual\n"; + for (int j = 0; j < NUM_J; j++) { + for (int k = 0; k < NUM_K; k++) { + float rand_float = (float) rand() / (float) (RAND_MAX); + if (rand_float < SPARSITY) { + B.insert({i, j, k}, (double) ((int) (rand_float * 3 / SPARSITY))); + } } - else { - std::cout << "unmatched values: expected -> " << expected(i,k) << " != " << A(i,k) << " <- actual\n"; - ASSERT_TRUE(false); - }; } } - std::cout << "test scheduling_eval.sddmmISPC passed\n"; -} + for (int k = 0; k < NUM_K; k++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + c.insert({k}, (double) ((int) (rand_float*3))); + } + B.pack(); + c.pack(); -// bin/taco-test --gtest_filter=scheduling_eval.sddmmISPC -TEST(scheduling_eval, sddmm2ISPC) { - - taco::util::TimeResults timevalue; - bool time = true; - - set_CUDA_codegen_enabled(false); - set_ISPC_codegen_enabled(false); - - int NUM_I = 1021/10; - int NUM_K = 1039/10; - int NUM_J = 1021/10; - float SPARSITY = .3; - Tensor A("A", {NUM_I, NUM_J}, {Dense, Dense}); - Tensor B("B", {NUM_I, NUM_J}, CSR); - Tensor C("C", {NUM_I, NUM_K}, {Dense, Dense}); - - srand(268238); - for (int i = 0; i < NUM_I; i++) { - for (int k = 0; k < NUM_K; k++) { - float rand_float = (float)rand()/(float)(RAND_MAX); - C.insert({i, k}, (double) ((int) (rand_float*3/SPARSITY))); - } - } - - for (int i = 0; i < NUM_I; i++) { - for (int j = 0; j < NUM_J; j++) { - float rand_float = (float)rand()/(float)(RAND_MAX); - if (rand_float < SPARSITY) { - B.insert({i, j}, (double) ((int) (rand_float*3/SPARSITY))); - } - } - } - - B.pack(); - C.pack(); - - 
set_ISPC_codegen_enabled(true); - A(i,j) = B(i,j) * C(i,k) * C(j,k); - - IndexStmt stmt = A.getAssignment().concretize(); - stmt = scheduleSDDMM2ISPC(stmt, B); - - //printToFile("sddmm_cpu", stmt); - - A.compile(stmt); - A.assemble(); - // A.compute(); - - set_ISPC_codegen_enabled(false); - Tensor expected("expected", {NUM_I, NUM_J}, {Dense, Dense}); - expected(i,j) = B(i,j) * C(i,k) * C(j,k); - IndexStmt stmt_taco = A.getAssignment().concretize(); - stmt_taco = scheduleSDDMM2CPU(stmt_taco, B); - expected.compile(stmt_taco); - expected.assemble(); - // expected.compute(); - - TOOL_BENCHMARK_TIMER(A.compute(), "Compute ISPC: ", timevalue); - TOOL_BENCHMARK_TIMER(expected.compute(), "Compute TACO: ", timevalue); - - ASSERT_TENSOR_EQ(expected, A); - - - // float ERROR_MARGIN = 0.01; - // ASSERT_TENSOR_VAL(expected, y); - for (int i = 0; i < NUM_I; i++) { - for (int j = 0; j < NUM_J; j++) { - if (expected(i,j) <= A(i,j) + ERROR_MARGIN && expected(i,j) >= A(i,j) - ERROR_MARGIN) { - // std::cout << "matched values: expected -> " << expected(j) << " == " << y(j) << " <- actual\n"; - } - else { - std::cout << "unmatched values: expected -> " << expected(i,j) << " != " << A(i,j) << " <- actual\n"; - ASSERT_TRUE(false); - }; - } - } - std::cout << "test scheduling_eval.sddmmISPC passed\n"; - -} - - -TEST(scheduling_eval, spmvCPU) { - if (should_use_CUDA_codegen()) { - return; - } - int NUM_I = 1021/10; - int NUM_J = 1039/10; - float SPARSITY = .3; - Tensor A("A", {NUM_I, NUM_J}, CSR); - Tensor x("x", {NUM_J}, Format({Dense})); - Tensor y("y", {NUM_I}, Format({Dense})); - - srand(120); - for (int i = 0; i < NUM_I; i++) { - for (int j = 0; j < NUM_J; j++) { - float rand_float = (float)rand()/(float)(RAND_MAX); - if (rand_float < SPARSITY) { - A.insert({i, j}, (double) ((int) (rand_float * 3 / SPARSITY))); - } - } - } - - for (int j = 0; j < NUM_J; j++) { - float rand_float = (float)rand()/(float)(RAND_MAX); - x.insert({j}, (double) ((int) (rand_float*3/SPARSITY))); - } - - 
x.pack(); - A.pack(); - - y(i) = A(i, j) * x(j); - - IndexStmt stmt = y.getAssignment().concretize(); - stmt = scheduleSpMVCPU(stmt); - - //printToFile("spmv_cpu", stmt); - - y.compile(stmt); - y.assemble(); - y.compute(); - - Tensor expected("expected", {NUM_I}, Format({Dense})); - expected(i) = A(i, j) * x(j); - expected.compile(); - expected.assemble(); - expected.compute(); - ASSERT_TENSOR_EQ(expected, y); -} - - -TEST(scheduling_eval, spmvISPC) { - - taco::util::TimeResults timevalue; - bool time = true; - - set_ISPC_codegen_enabled(false); - set_CUDA_codegen_enabled(false); - - int NUM_I = 200021/10; - int NUM_J = 200039/10; - float SPARSITY = .2; - Tensor A("A", {NUM_I, NUM_J}, CSR); - Tensor x("x", {NUM_J}, Format({Dense})); - Tensor y("y", {NUM_I}, Format({Dense})); - - srand(120); - for (int i = 0; i < NUM_I; i++) { - for (int j = 0; j < NUM_J; j++) { - float rand_float = (float)rand()/(float)(RAND_MAX); - if (rand_float < SPARSITY) { - A.insert({i, j}, (double) ((int) (rand_float * 3 / SPARSITY))); - } - } - } - - for (int j = 0; j < NUM_J; j++) { - float rand_float = (float)rand()/(float)(RAND_MAX); - x.insert({j}, (double) ((int) (rand_float*3/SPARSITY))); - } - - x.pack(); - A.pack(); - - set_ISPC_codegen_enabled(true); - - y(i) = A(i, j) * x(j); - - IndexStmt stmt = y.getAssignment().concretize(); - // stmt = scheduleSpMVISPC(stmt); - - printToFile("spmv_cpu", stmt); - - y.compile(stmt); - y.assemble(); - // y.compile(); - - set_ISPC_codegen_enabled(false); - - // Tensor expected("expected", {NUM_I}, Format({Dense})); - // expected(i) = A(i, j) * x(j); - // expected.compile(); - // expected.assemble(); - // expected.compute(); - - - Tensor expected("expected", {NUM_I}, Format({Dense})); - expected(i) = A(i, j) * x(j); - IndexStmt stmt_taco = expected.getAssignment().concretize(); - stmt_taco = scheduleSpMVCPU(stmt_taco); - - expected.compile(stmt_taco); - expected.assemble(); - // expected.compile(); - - - TOOL_BENCHMARK_TIMER(y.compute(), "Compute 
ISPC: ", timevalue); - TOOL_BENCHMARK_TIMER(expected.compute(), "Compute TACO: ", timevalue); - - - ASSERT_TENSOR_EQ(expected, y); - - // float ERROR_MARGIN = 0.01; - // ASSERT_TENSOR_VAL(expected, y); - for (int j = 0; j < NUM_J; j++) { - if (expected(j) <= y(j) + ERROR_MARGIN && expected(j) >= y(j) - ERROR_MARGIN) { - // std::cout << "matched values: expected -> " << expected(j) << " == " << y(j) << " <- actual\n"; - } - else { - std::cout << "unmatched values: expected -> " << expected(j) << " != " << y(j) << " <- actual\n"; - ASSERT_TRUE(false); - }; - } - - std::cout << "test scheduling_eval.spmvISPC passed\n"; - - for (int i=0; i<10; i++) { - TOOL_BENCHMARK_TIMER(y.compute(), "Compute ISPC: ", timevalue); - TOOL_BENCHMARK_TIMER(expected.compute(), "Compute TACO: ", timevalue); - } - - -} - -TEST(scheduling_eval, ttvCPU) { - if (should_use_CUDA_codegen()) { - return; - } - int NUM_I = 1021/10; - int NUM_J = 1039/10; - int NUM_K = 1057/10; - float SPARSITY = .3; - Tensor A("A", {NUM_I, NUM_J}, {Dense, Dense}); // TODO: change to sparse outputs - Tensor B("B", {NUM_I, NUM_J, NUM_K}, {Sparse, Sparse, Sparse}); - Tensor c("c", {NUM_K}, Format({Dense})); - - srand(9536); - for (int i = 0; i < NUM_I; i++) { - for (int j = 0; j < NUM_J; j++) { - for (int k = 0; k < NUM_K; k++) { - float rand_float = (float) rand() / (float) (RAND_MAX); - if (rand_float < SPARSITY) { - B.insert({i, j, k}, (double) ((int) (rand_float * 3 / SPARSITY))); - } - } - } - } - - for (int k = 0; k < NUM_K; k++) { - float rand_float = (float)rand()/(float)(RAND_MAX); - c.insert({k}, (double) ((int) (rand_float*3))); - } - - B.pack(); - c.pack(); - - A(i,j) = B(i,j,k) * c(k); + A(i,j) = B(i,j,k) * c(k); IndexStmt stmt = A.getAssignment().concretize(); stmt = scheduleTTVCPU(stmt, B); @@ -1640,64 +1117,6 @@ TEST(scheduling_eval, ttvCPU) { } -TEST(scheduling_eval, ttvISPC) { - if (should_use_CUDA_codegen()) { - return; - } - set_CUDA_codegen_enabled(false); - set_ISPC_codegen_enabled(false); - int 
NUM_I = 1021/10; - int NUM_J = 1039/10; - int NUM_K = 1057/10; - float SPARSITY = .3; - Tensor A("A", {NUM_I, NUM_J}, {Dense, Dense}); // TODO: change to sparse outputs - Tensor B("B", {NUM_I, NUM_J, NUM_K}, {Sparse, Sparse, Sparse}); - Tensor c("c", {NUM_K}, Format({Dense})); - - srand(9536); - for (int i = 0; i < NUM_I; i++) { - for (int j = 0; j < NUM_J; j++) { - for (int k = 0; k < NUM_K; k++) { - float rand_float = (float) rand() / (float) (RAND_MAX); - if (rand_float < SPARSITY) { - B.insert({i, j, k}, (double) ((int) (rand_float * 3 / SPARSITY))); - } - } - } - } - - for (int k = 0; k < NUM_K; k++) { - float rand_float = (float)rand()/(float)(RAND_MAX); - c.insert({k}, (double) ((int) (rand_float*3))); - } - - B.pack(); - c.pack(); - - set_ISPC_codegen_enabled(true); - A(i,j) = B(i,j,k) * c(k); - - IndexStmt stmt = A.getAssignment().concretize(); - stmt = scheduleTTVISPC(stmt, B); - - printToFile("ttv_ispc", "__ttv_ispc", stmt); - - A.compile(stmt); - A.assemble(); - A.compute(); - - set_ISPC_codegen_enabled(false); - Tensor expected("expected", {NUM_I, NUM_J}, {Dense, Dense}); - expected(i,j) = B(i,j,k) * c(k); - IndexStmt stmt_taco = expected.getAssignment().concretize(); - stmt_taco = scheduleTTVCPU(stmt_taco, B); - expected.compile(); - expected.assemble(); - expected.compute(); - ASSERT_TENSOR_EQ(expected, A); -} - - TEST(scheduling_eval, ttvCPU_CSR) { if (should_use_CUDA_codegen()) { return; @@ -1750,81 +1169,6 @@ TEST(scheduling_eval, ttvCPU_CSR) { ASSERT_TENSOR_EQ(expected, A); } -TEST(scheduling_eval, ttvISPC_CSR) { - if (should_use_CUDA_codegen()) { - return; - } - - int NUM_I = 10000; - int NUM_J = 1039/10; - int NUM_K = 128; - float SPARSITY = .3; - Tensor A("A", {NUM_I, NUM_J}, {Dense, Sparse}); - Tensor B("B", {NUM_I, NUM_J, NUM_K}, {Dense, Sparse, Sparse}); - Tensor c("c", {NUM_K}, Format({Dense})); - - srand(9536); - for (int i = 0; i < NUM_I; i++) { - for (int j = 0; j < NUM_J; j++) { - for (int k = 0; k < NUM_K; k++) { - float rand_float = 
(float) rand() / (float) (RAND_MAX); - if (rand_float < SPARSITY) { - B.insert({i, j, k}, (double) ((int) (rand_float * 3 / SPARSITY))); - } - } - } - } - - for (int k = 0; k < NUM_K; k++) { - float rand_float = (float)rand()/(float)(RAND_MAX); - c.insert({k}, (double) ((int) (rand_float*3))); - } - - B.pack(); - c.pack(); - - set_ISPC_codegen_enabled(true); - A(i,j) = B(i,j,k) * c(k); - - IndexStmt stmt = A.getAssignment().concretize(); - stmt = scheduleTTVISPCCSR(stmt); - printToFile("ttv_ispc_csr", "__ttv_ispc_csr", stmt); - - A.compile(stmt); - A.assemble(); - A.compute(); - - set_ISPC_codegen_enabled(false); - Tensor expected("expected", {NUM_I, NUM_J}, {Dense, Sparse}); - expected(i,j) = B(i,j,k) * c(k); - IndexStmt taco_stmt = expected.getAssignment().concretize(); - taco_stmt = scheduleTTVCPUCSR_ST(taco_stmt); - expected.compile(taco_stmt); - expected.assemble(); - expected.compute(); - ASSERT_TENSOR_EQ(expected, A); - - Tensor A2("A2", {NUM_I, NUM_J}, {Dense, Sparse}); - set_ISPC_codegen_enabled(true); - A2(i,j) = B(i,j,k) * c(k); - - IndexStmt stmt2 = A2.getAssignment().concretize(); - - A2.compile(stmt2); - A2.assemble(); - A2.compute(); - - taco::util::TimeResults timevalue; - bool time = true; - - for (int i=0; i<3; i++) { - TOOL_BENCHMARK_TIMER(expected.compute(), "Compute TACO1: ", timevalue); - TOOL_BENCHMARK_TIMER(A.compute(), "Compute ISPC1: ", timevalue); - TOOL_BENCHMARK_TIMER(A2.compute(), "Compute ISPC2: ", timevalue); - } - - -} TEST(scheduling_eval, ttmCPU) { if (should_use_CUDA_codegen()) { @@ -1866,320 +1210,41 @@ TEST(scheduling_eval, ttmCPU) { IndexStmt stmt = A.getAssignment().concretize(); stmt = scheduleTTMCPU(stmt, B); - //printToFile("ttm_cpu", stmt); - - A.compile(stmt); - A.assemble(); - A.compute(); - - Tensor expected("expected", {NUM_I, NUM_J, NUM_L}, {Dense, Dense, Dense}); - expected(i,j,l) = B(i,j,k) * C(k,l); - expected.compile(); - expected.assemble(); - expected.compute(); - ASSERT_TENSOR_EQ(expected, A); -} - 
-TEST(scheduling_eval, ttmISPC) { - if (should_use_CUDA_codegen()) { - return; - } - int NUM_I = 1021/40; - int NUM_J = 1039/40; - int NUM_K = 1057/40; - int NUM_L = 1232/40; - float SPARSITY = .1; - Tensor A("A", {NUM_I, NUM_J, NUM_L}, {Dense, Dense, Dense}); // TODO: change to sparse outputs - Tensor B("B", {NUM_I, NUM_J, NUM_K}, {Sparse, Sparse, Sparse}); - Tensor C("C", {NUM_K, NUM_L}, {Dense, Dense}); - - srand(935); - for (int i = 0; i < NUM_I; i++) { - for (int j = 0; j < NUM_J; j++) { - for (int k = 0; k < NUM_K; k++) { - float rand_float = (float) rand() / (float) (RAND_MAX); - if (rand_float < SPARSITY) { - B.insert({i, j, k}, (double) ((int) (rand_float * 3 / SPARSITY))); - } - } - } - } - - for (int k = 0; k < NUM_K; k++) { - for (int l = 0; l < NUM_L; l++) { - float rand_float = (float)rand()/(float)(RAND_MAX); - C.insert({k, l}, (double) ((int) (rand_float*3))); - } - } - - B.pack(); - C.pack(); - - A(i,j,l) = B(i,j,k) * C(k,l); - - IndexStmt stmt = A.getAssignment().concretize(); - stmt = scheduleTTMCPU(stmt, B); - - //printToFile("ttm_cpu", stmt); - - A.compile(stmt); - A.assemble(); - A.compute(); - - Tensor expected("expected", {NUM_I, NUM_J, NUM_L}, {Dense, Dense, Dense}); - expected(i,j,l) = B(i,j,k) * C(k,l); - expected.compile(); - expected.assemble(); - expected.compute(); - ASSERT_TENSOR_EQ(expected, A); -} - -TEST(scheduling_eval, mttkrpCPU) { - if (should_use_CUDA_codegen()) { - return; - } - int NUM_I = 1021/20; - int NUM_J = 1039/20; - int NUM_K = 1057/20; - int NUM_L = 1232/20; - float SPARSITY = .1; - Tensor A("A", {NUM_I, NUM_J}, {Dense, Dense}); - Tensor B("B", {NUM_I, NUM_K, NUM_L}, {Dense, Sparse, Sparse}); - Tensor C("C", {NUM_K, NUM_J}, {Dense, Dense}); - Tensor D("D", {NUM_L, NUM_J}, {Dense, Dense}); - - srand(549694); - for (int i = 0; i < NUM_I; i++) { - for (int k = 0; k < NUM_K; k++) { - for (int l = 0; l < NUM_L; l++) { - float rand_float = (float) rand() / (float) (RAND_MAX); - if (rand_float < SPARSITY) { - B.insert({i, 
k, l}, (double) ((int) (rand_float * 3 / SPARSITY))); - } - } - } - } - - for (int k = 0; k < NUM_K; k++) { - for (int j = 0; j < NUM_J; j++) { - float rand_float = (float)rand()/(float)(RAND_MAX); - C.insert({k, j}, (double) ((int) (rand_float*3))); - } - } - - for (int l = 0; l < NUM_L; l++) { - for (int j = 0; j < NUM_J; j++) { - float rand_float = (float)rand()/(float)(RAND_MAX); - D.insert({l, j}, (double) ((int) (rand_float*3))); - } - } - - B.pack(); - C.pack(); - D.pack(); - - A(i,j) = B(i,k,l) * C(k,j) * D(l,j); - - IndexStmt stmt = A.getAssignment().concretize(); - stmt = scheduleMTTKRPCPU(stmt, B); - //printToFile("mttkrp_cpu", stmt); - - A.compile(stmt); - A.assemble(); - A.compute(); - - Tensor expected("expected", {NUM_I, NUM_J}, {Dense, Dense}); - expected(i,j) = B(i,k,l) * C(k,j) * D(l,j); - expected.compile(); - expected.assemble(); - expected.compute(); - ASSERT_TENSOR_EQ(expected, A); -} - -TEST(scheduling_eval, temp) { - if (should_use_CUDA_codegen() || should_use_ISPC_codegen()) { - return; - } - std::default_random_engine gen(0); - std::uniform_real_distribution unif(0.0, 1.0); - // Predeclare the storage formats that the inputs and output will be stored as. - // To define a format, you must specify whether each dimension is dense or sparse - // and (optionally) the order in which dimensions should be stored. The formats - // declared below correspond to doubly compressed sparse row (dcsr), row-major - // dense (rm), and column-major dense (dm). - Format dcsr({Sparse,Sparse}); - Format rm({Dense,Dense}); - Format cm({Dense,Dense}, {1,0}); - - // Load a sparse matrix from file (stored in the Matrix Market format) and - // store it as a doubly compressed sparse row matrix. Matrices correspond to - // order-2 tensors in taco. 
The matrix in this example can be download from: - // https://www.cise.ufl.edu/research/sparse/MM/Williams/webbase-1M.tar.gz - Tensor B = read("/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx", dcsr); - // Generate a random dense matrix and store it in row-major (dense) format. - Tensor C({B.getDimension(0), 1000}, rm); - for (int i = 0; i < C.getDimension(0); ++i) { - for (int j = 0; j < C.getDimension(1); ++j) { - C.insert({i,j}, unif(gen)); - } - } - C.pack(); - - // Generate another random dense matrix and store it in column-major format. - Tensor D({1000, B.getDimension(1)}, cm); - for (int i = 0; i < D.getDimension(0); ++i) { - for (int j = 0; j < D.getDimension(1); ++j) { - D.insert({i,j}, unif(gen)); - } - } - D.pack(); - - // Declare the output matrix to be a sparse matrix with the same dimensions as - // input matrix B, to be also stored as a doubly compressed sparse row matrix. - Tensor A(B.getDimensions(), dcsr); - - // Define the SDDMM computation using index notation. - IndexVar i, j, k; - A(i,j) = B(i,j) * C(i,k) * D(k,j); - - // At this point, we have defined how entries in the output matrix should be - // computed from entries in the input matrices but have not actually performed - // the computation yet. To do so, we must first tell taco to generate code that - // can be executed to compute the SDDMM operation. - A.compile(); - // We can now call the functions taco generated to assemble the indices of the - // output matrix and then actually compute the SDDMM. - A.assemble(); - A.compute(); - // Write the output of the computation to file (stored in the Matrix Market format). 
- write("A.mtx", A); -} - -TEST(scheduling_eval, mttkrpISPC) { - if (should_use_CUDA_codegen()) { - return; - } - set_ISPC_codegen_enabled(false); - set_CUDA_codegen_enabled(false); - int NUM_I = 10000; // 1021/20; - int NUM_J = 256; - int NUM_K = 1057/20; - int NUM_L = 1232/20; - float SPARSITY = .1; - Tensor B("B", {NUM_I, NUM_K, NUM_L}, {Dense, Sparse, Sparse}); - Tensor C("C", {NUM_K, NUM_J}, {Dense, Dense}); - Tensor D("D", {NUM_L, NUM_J}, {Dense, Dense}); - - srand(549694); - for (int i = 0; i < NUM_I; i++) { - for (int k = 0; k < NUM_K; k++) { - for (int l = 0; l < NUM_L; l++) { - float rand_float = (float) rand() / (float) (RAND_MAX); - if (rand_float < SPARSITY) { - B.insert({i, k, l}, (double) ((int) (rand_float * 3 / SPARSITY))); - } - } - } - } - - for (int k = 0; k < NUM_K; k++) { - for (int j = 0; j < NUM_J; j++) { - float rand_float = (float)rand()/(float)(RAND_MAX); - C.insert({k, j}, (double) ((int) (rand_float*3))); - } - } - - for (int l = 0; l < NUM_L; l++) { - for (int j = 0; j < NUM_J; j++) { - float rand_float = (float)rand()/(float)(RAND_MAX); - D.insert({l, j}, (double) ((int) (rand_float*3))); - } - } - - B.pack(); - C.pack(); - D.pack(); + //printToFile("ttm_cpu", stmt); - set_ISPC_codegen_enabled(true); - - Tensor A1("A1", {NUM_I, NUM_J}, {Dense, Dense}); - A1(i,j) = B(i,k,l) * C(k,j) * D(l,j); - IndexStmt stmt1 = A1.getAssignment().concretize(); - stmt1 = scheduleMTTKRPISPC(stmt1, B); - // printToFile("mttkrp1_cpu_ispc", stmt1); - A1.compile(stmt1); - A1.assemble(); - A1.compute(); - - set_ISPC_codegen_enabled(false); - Tensor expected1("expected1", {NUM_I, NUM_J}, {Dense, Dense}); - expected1(i,j) = B(i,k,l) * C(k,j) * D(l,j); - IndexStmt taco_stmt1 = expected1.getAssignment().concretize(); - taco_stmt1 = scheduleMTTKRPCPU(taco_stmt1, B); - expected1.compile(taco_stmt1); - expected1.assemble(); - expected1.compute(); - ASSERT_TENSOR_EQ(expected1, A1); - - set_ISPC_codegen_enabled(true); - Tensor A2("A2", {NUM_I, NUM_J}, {Dense, 
Dense}); - A2(i,j) = B(i,k,l) * C(k,j) * D(l,j); - IndexStmt stmt2 = A1.getAssignment().concretize(); - stmt2 = scheduleMTTKRPPrecomputedISPC_ST(stmt2, B); - // printToFile("mttkrp_cpu_ispc", stmt); - A2.compile(stmt2); - A2.assemble(); - A2.compute(); - ASSERT_TENSOR_EQ(expected1, A2); - - set_ISPC_codegen_enabled(false); - Tensor expected2("expected2", {NUM_I, NUM_J}, {Dense, Dense}); - expected2(i,j) = B(i,k,l) * C(k,j) * D(l,j); - IndexStmt taco_stmt2 = expected2.getAssignment().concretize(); - taco_stmt2 = scheduleMTTKRPPrecomputedCPU_ST(taco_stmt2, B); - expected2.compile(taco_stmt2); - expected2.assemble(); - expected2.compute(); - ASSERT_TENSOR_EQ(expected1, expected2); - - taco::util::TimeResults timevalue; - bool time = true; - - for (int i=0; i<3; i++) { - TOOL_BENCHMARK_TIMER(expected1.compute(), "Compute TACO1: ", timevalue); - TOOL_BENCHMARK_TIMER(A1.compute(), "Compute ISPC1: ", timevalue); - TOOL_BENCHMARK_TIMER(expected2.compute(), "Compute TACO2: ", timevalue); - TOOL_BENCHMARK_TIMER(A2.compute(), "Compute ISPC2: ", timevalue); - } -} + A.compile(stmt); + A.assemble(); + A.compute(); + Tensor expected("expected", {NUM_I, NUM_J, NUM_L}, {Dense, Dense, Dense}); + expected(i,j,l) = B(i,j,k) * C(k,l); + expected.compile(); + expected.assemble(); + expected.compute(); + ASSERT_TENSOR_EQ(expected, A); +} -TEST(scheduling_eval, mttkrp4ISPC) { +TEST(scheduling_eval, mttkrpCPU) { if (should_use_CUDA_codegen()) { return; } - set_ISPC_codegen_enabled(false); - set_CUDA_codegen_enabled(false); - int NUM_I = 1000; // 1021/20; - int NUM_J = 16; + int NUM_I = 1021/20; + int NUM_J = 1039/20; int NUM_K = 1057/20; int NUM_L = 1232/20; - int NUM_M = 1124/20; float SPARSITY = .1; - Tensor B("B", {NUM_I, NUM_K, NUM_L, NUM_M}, {Dense, Sparse, Sparse, Sparse}); + Tensor A("A", {NUM_I, NUM_J}, {Dense, Dense}); + Tensor B("B", {NUM_I, NUM_K, NUM_L}, {Dense, Sparse, Sparse}); Tensor C("C", {NUM_K, NUM_J}, {Dense, Dense}); Tensor D("D", {NUM_L, NUM_J}, {Dense, Dense}); - 
Tensor E("E", {NUM_M, NUM_J}, {Dense, Dense}); srand(549694); for (int i = 0; i < NUM_I; i++) { for (int k = 0; k < NUM_K; k++) { for (int l = 0; l < NUM_L; l++) { - for (int m = 0; m < NUM_M; m++) { - float rand_float = (float) rand() / (float) (RAND_MAX); - if (rand_float < SPARSITY) { - B.insert({i, k, l, m}, (double) ((int) (rand_float * 3 / SPARSITY))); - } + float rand_float = (float) rand() / (float) (RAND_MAX); + if (rand_float < SPARSITY) { + B.insert({i, k, l}, (double) ((int) (rand_float * 3 / SPARSITY))); } } } @@ -2199,83 +1264,27 @@ TEST(scheduling_eval, mttkrp4ISPC) { } } - for (int m = 0; m < NUM_M; m++) { - for (int j = 0; j < NUM_J; j++) { - float rand_float = (float)rand()/(float)(RAND_MAX); - E.insert({m, j}, (double) ((int) (rand_float*3))); - } - } - B.pack(); C.pack(); D.pack(); - E.pack(); - - set_ISPC_codegen_enabled(true); - Tensor A1("A1", {NUM_I, NUM_J}, {Dense, Dense}); - A1(i,j) = B(i,k,l,m) * C(k,j) * D(l,j) * E(m,j); - IndexStmt stmt1 = A1.getAssignment().concretize(); - stmt1 = scheduleMTTKRP4ISPC_ST(stmt1, B); - // printToFile("mttkrp1_cpu_ispc", stmt1); - A1.compile(stmt1); - A1.assemble(); - A1.compute(); - - set_ISPC_codegen_enabled(false); - Tensor expected1("expected1", {NUM_I, NUM_J}, {Dense, Dense}); - expected1(i,j) = B(i,k,l,m) * C(k,j) * D(l,j) * E(m,j); - IndexStmt taco_stmt1 = expected1.getAssignment().concretize(); - taco_stmt1 = scheduleMTTKRP4CPU_ST(taco_stmt1, B); - expected1.compile(taco_stmt1); - expected1.assemble(); - expected1.compute(); - ASSERT_TENSOR_EQ(expected1, A1); - - // set_ISPC_codegen_enabled(true); - // Tensor A2("A2", {NUM_I, NUM_J}, {Dense, Dense}); - // A2(i,j) = B(i,k,l) * C(k,j) * D(l,j); - // IndexStmt stmt2 = A1.getAssignment().concretize(); - // stmt2 = scheduleMTTKRPPrecomputedISPC_ST(stmt2, B); - // // printToFile("mttkrp_cpu_ispc", stmt); - // A2.compile(stmt2); - // A2.assemble(); - // A2.compute(); - // ASSERT_TENSOR_EQ(expected1, A2); - - set_ISPC_codegen_enabled(false); - Tensor 
expected2("expected2", {NUM_I, NUM_J}, {Dense, Dense}); - expected2(i,j) = B(i,k,l,m) * C(k,j) * D(l,j) * E(m,j); - - IndexExpr BE = B(i,k,l,m) * E(m,j); - IndexExpr BDE = BE * D(l, j); - expected2(i,j) = BDE * C(k,j); - IndexStmt taco_stmt2 = expected2.getAssignment().concretize(); - TensorVar BE_workspace("BE_workspace", Type(Float64, {Dimension(j)}), taco::dense); - TensorVar BDE_workspace("BDE_workspace", Type(Float64, {Dimension(j)}), taco::dense); - - IndexStmt precomputed_stmt = forall(i, forall(k, - where(forall(j, expected2(i,j) += BDE_workspace(j) * C(k,j)), - forall(l, where(forall(j, BDE_workspace(j) += BE_workspace(j) * D(l,j)), - forall(m, forall(j, BE_workspace(j) += B(i,k,l,m) * E(m,j)))))))); - - // IndexStmt scheduled2 = scheduleMTTKRPPrecomputedCPU(precomputed_stmt, B, 64); - // expected2.compile(scheduled2); - // expected2.assemble(); - // expected2.compute(); - // ASSERT_TENSOR_EQ(expected1, expected2); - - taco::util::TimeResults timevalue; - bool time = true; - - for (int i=0; i<3; i++) { - TOOL_BENCHMARK_TIMER(expected1.compute(), "Compute TACO1: ", timevalue); - TOOL_BENCHMARK_TIMER(A1.compute(), "Compute ISPC1: ", timevalue); - // TOOL_BENCHMARK_TIMER(expected2.compute(), "Compute TACO2: ", timevalue); - // TOOL_BENCHMARK_TIMER(A2.compute(), "Compute ISPC2: ", timevalue); - } -} + A(i,j) = B(i,k,l) * C(k,j) * D(l,j); + + IndexStmt stmt = A.getAssignment().concretize(); + stmt = scheduleMTTKRPCPU(stmt, B); + //printToFile("mttkrp_cpu", stmt); + + A.compile(stmt); + A.assemble(); + A.compute(); + Tensor expected("expected", {NUM_I, NUM_J}, {Dense, Dense}); + expected(i,j) = B(i,k,l) * C(k,j) * D(l,j); + expected.compile(); + expected.assemble(); + expected.compute(); + ASSERT_TENSOR_EQ(expected, A); +} TEST(scheduling_eval, spmvGPU) { if (!should_use_CUDA_codegen()) { @@ -2656,334 +1665,6 @@ TEST(scheduling_eval, mttkrpGPU) { ASSERT_TENSOR_EQ(expected, A); } -TEST(generate_evaluation_files, ispc) { - std::cout << "Hi Adhitha!\n" << std::endl 
; - set_CUDA_codegen_enabled(false); - set_ISPC_codegen_enabled(true); - - vector> spmv_parameters = {{32}}; - vector> spmspv_parameters = {{8}}; - - // 4 to 512 and 4, 8, 16 - vector> spmm_dcsr_parameters = {{16, 8}}; - vector> spmm_parameters = {{16,4}}; - - vector> mttkrp_parameters = {}; - mttkrp_parameters.push_back({64,0}); - - vector> sddmm_parameters = {{8, 8}}; - vector> ttv_parameters = {{32}}; - - int NUM_I = 100; - int NUM_J = 100; - int NUM_K = 100; - int NUM_L = 100; - - string c_file_ending = ".h"; - string file_ending = ".ispc"; - string file_path = "eval_prepared_ispc/"; - mkdir(file_path.c_str(), 0777); - - // spmv - { - stringstream source1; - stringstream source2; - std::shared_ptr codegen = ir::CodeGen::init_default(source1, source2, ir::CodeGen::ImplementationGen); - Tensor A("A", {NUM_I, NUM_J}, CSR); - Tensor x("x", {NUM_J}, {Dense}); - Tensor y("y", {NUM_I}, {Dense}); - y(i) = A(i, j) * x(j); - std::cout << "concretizing the assignment statement\n"; - IndexStmt stmt = y.getAssignment().concretize(); - std::cout << "Printing the original IndexStmt: " << stmt << std::endl; - - for (auto paramSet : spmv_parameters) { - std::cout << "param set: " << paramSet[0] << std::endl; - IndexStmt scheduled = scheduleSpMVISPC(stmt, paramSet[0]); - std::cout << "scheduled IndexStmt: " << scheduled << std::endl; - ir::Stmt compute = lower(scheduled, string("compute_") + util::join(paramSet, "_"), false, true); - std::cout << "computed statement: \n" << compute << std::endl; - codegen->compile(compute, false); - } - ofstream source_file; - source_file.open(file_path + "spmv_csr_ispc_taco" + c_file_ending); - source_file << source1.str(); - source_file.close(); - - ofstream ispc_source_file; - ispc_source_file.open(file_path + "__spmv_csr_ispc_taco" + file_ending); - ispc_source_file << source2.str(); - ispc_source_file.close(); - - } - - // spmm - { - stringstream source1; - stringstream source2; - std::shared_ptr codegen = ir::CodeGen::init_default(source1, 
source2, ir::CodeGen::ImplementationGen); - Tensor A("A", {NUM_I, NUM_J}, CSR); - Tensor X("X", {NUM_J, NUM_K}, {Dense, Dense}); - Tensor Y("Y", {NUM_I, NUM_K}, {Dense, Dense}); - Y(i, k) = A(i, j) * X(j, k); - IndexStmt stmt = Y.getAssignment().concretize(); - bool isFirst = true; - for (auto paramSet : spmm_parameters) { - IndexStmt scheduled = scheduleSpMMISPC1(stmt, A, paramSet[0], paramSet[1]); - ir::Stmt compute = lower(scheduled, string("compute1_") + util::join(paramSet, "_"), false, true); - codegen->compile(compute, isFirst); - isFirst = false; - } - ofstream source_file; - source_file.open(file_path + "spmm_csr_ispc_taco1" + c_file_ending); - source_file << source1.str(); - source_file.close(); - - ofstream ispc_source_file; - ispc_source_file.open(file_path + "__spmm_csr_ispc_taco1" + file_ending); - ispc_source_file << source2.str(); - ispc_source_file.close(); - } - - // spmm omp - { - stringstream source1; - stringstream source2; - std::shared_ptr codegen = ir::CodeGen::init_default(source1, source2, ir::CodeGen::ImplementationGen); - Tensor A("A", {NUM_I, NUM_J}, CSR); - Tensor X("X", {NUM_J, NUM_K}, {Dense, Dense}); - Tensor Y("Y", {NUM_I, NUM_K}, {Dense, Dense}); - Y(i, k) = A(i, j) * X(j, k); - IndexStmt stmt = Y.getAssignment().concretize(); - bool isFirst = true; - for (auto paramSet : spmm_parameters) { - IndexStmt scheduled = scheduleSpMMISPCOMP1(stmt, A, paramSet[0], paramSet[1]); - ir::Stmt compute = lower(scheduled, string("compute1_") + util::join(paramSet, "_"), false, true); - codegen->compile(compute, isFirst); - isFirst = false; - } - ofstream source_file; - source_file.open(file_path + "spmm_omp_ispc_taco1" + c_file_ending); - source_file << source1.str(); - source_file.close(); - - ofstream ispc_source_file; - ispc_source_file.open(file_path + "__spmm_omp_ispc_taco1" + file_ending); - ispc_source_file << source2.str(); - ispc_source_file.close(); - } - - // spmm2 - { - stringstream source1; - stringstream source2; - std::shared_ptr 
codegen = ir::CodeGen::init_default(source1, source2, ir::CodeGen::ImplementationGen); - Tensor A("A", {NUM_I, NUM_J}, CSR); - Tensor X("X", {NUM_J, NUM_K}, {Dense, Dense}); - Tensor Y("Y", {NUM_I, NUM_K}, {Dense, Dense}); - Y(i, k) = A(i, j) * X(j, k); - IndexStmt stmt = Y.getAssignment().concretize(); - bool isFirst = true; - for (auto paramSet : spmm_parameters) { - IndexStmt scheduled = scheduleSpMMISPC2(stmt, A, paramSet[0], paramSet[1]); - ir::Stmt compute = lower(scheduled, string("compute2_") + util::join(paramSet, "_"), false, true); - codegen->compile(compute, isFirst); - isFirst = false; - } - ofstream source_file; - source_file.open(file_path + "spmm_csr_ispc_taco2" + c_file_ending); - source_file << source1.str(); - source_file.close(); - - ofstream ispc_source_file; - ispc_source_file.open(file_path + "__spmm_csr_ispc_taco2" + file_ending); - ispc_source_file << source2.str(); - ispc_source_file.close(); - } - - // spmm - { - stringstream source1; - stringstream source2; - std::shared_ptr codegen = ir::CodeGen::init_default(source1, source2, ir::CodeGen::ImplementationGen); - Tensor A("A", {NUM_I, NUM_J}, CSR); - Tensor X("X", {NUM_J, NUM_K}, {Dense, Dense}); - Tensor Y("Y", {NUM_I, NUM_K}, {Dense, Dense}); - Y(i, k) = A(i, j) * X(j, k); - IndexStmt stmt = Y.getAssignment().concretize(); - bool isFirst = true; - for (auto paramSet : spmm_parameters) { - IndexStmt scheduled = scheduleSpMMISPC3(stmt, A, paramSet[0], paramSet[1]); - ir::Stmt compute = lower(scheduled, string("compute3_") + util::join(paramSet, "_"), false, true); - codegen->compile(compute, isFirst); - isFirst = false; - } - ofstream source_file; - source_file.open(file_path + "spmm_csr_ispc_taco3" + c_file_ending); - source_file << source1.str(); - source_file.close(); - - ofstream ispc_source_file; - ispc_source_file.open(file_path + "__spmm_csr_ispc_taco3" + file_ending); - ispc_source_file << source2.str(); - ispc_source_file.close(); - } - - // ttv - { - stringstream source; - 
stringstream source2; - std::shared_ptr codegen = ir::CodeGen::init_default(source, source2, ir::CodeGen::ImplementationGen); - Tensor A("A", {NUM_I, NUM_J}, {Dense, Dense}); // TODO: change to sparse outputs - Tensor B("B", {NUM_I, NUM_J, NUM_K}, {Sparse, Sparse, Sparse}); - Tensor c("c", {NUM_K}, Format({Dense})); - A(i,j) = B(i,j,k) * c(k); - IndexStmt stmt = A.getAssignment().concretize(); - bool isFirst = true; - for (auto paramSet : ttv_parameters) { - IndexStmt scheduled = scheduleTTVCPU(stmt, B, paramSet[0]); - ir::Stmt compute = lower(scheduled, string("compute_") + util::join(paramSet, "_"), false, true); - codegen->compile(compute, isFirst); - isFirst = false; - } - ofstream source_file; - source_file.open(file_path + "ttv_cpu" + c_file_ending); - source_file << source.str(); - source_file.close(); - - ofstream ispc_source_file; - ispc_source_file.open(file_path + "__ttv_cpu" + file_ending); - ispc_source_file << source2.str(); - ispc_source_file.close(); - } - - - // mttkrp3 - { - stringstream source; - stringstream source2; - std::shared_ptr codegen = ir::CodeGen::init_default(source, source2, ir::CodeGen::ImplementationGen); - Tensor A("A", {NUM_I, NUM_J}, {Dense, Dense}); - Tensor B("B", {NUM_I, NUM_K, NUM_L}, {Dense, Sparse, Sparse}); - Tensor C("C", {NUM_K, NUM_J}, {Dense, Dense}); - Tensor D("D", {NUM_L, NUM_J}, {Dense, Dense}); - A(i,j) = B(i,k,l) * C(k,j) * D(l,j); - IndexStmt stmt = A.getAssignment().concretize(); - bool isFirst = true; - for (auto paramSet : mttkrp_parameters) { - IndexStmt scheduled = scheduleMTTKRPCPU(stmt, B, paramSet[0], paramSet[1]); - ir::Stmt compute = lower(scheduled, string("compute_") + util::join(paramSet, "_"), false, true); - codegen->compile(compute, isFirst); - isFirst = false; - } - ofstream source_file; - source_file.open(file_path + "mttkrp3_cpu" + c_file_ending); - source_file << source.str(); - source_file.close(); - - ofstream ispc_source_file; - ispc_source_file.open(file_path + "__mttkrp3_cpu" + 
file_ending); - ispc_source_file << source2.str(); - ispc_source_file.close(); - } - - - return; -} - - - -TEST(generate_ispc_sddmm_evaluation_files, ispc) { - std::cout << "Hi Adhitha!\n" << std::endl ; - set_CUDA_codegen_enabled(false); - set_ISPC_codegen_enabled(true); - - vector> spmv_parameters = {{32}}; - vector> spmspv_parameters = {{8}}; - - // 4 to 512 and 4, 8, 16 - vector> spmm_dcsr_parameters = {{16, 8}}; - vector> spmm_parameters = {{16,4}}; - - vector> mttkrp_parameters = {}; - mttkrp_parameters.push_back({64,0}); - - vector> sddmm_parameters = {{8, 8}}; - vector> ttv_parameters = {{32}}; - - int NUM_I = 100; - int NUM_J = 100; - int NUM_K = 100; - - string c_file_ending = ".h"; - string file_ending = ".ispc"; - string file_path = "eval_prepared_ispc/sddmm/"; - mkdir(file_path.c_str(), 0777); - - // sddmm - { - stringstream source1; - stringstream source2; - std::shared_ptr codegen = ir::CodeGen::init_default(source1, source2, ir::CodeGen::ImplementationGen); - Tensor A("A", {NUM_I, NUM_K}, {Dense, Dense}); - Tensor B("B", {NUM_I, NUM_K}, CSR); - Tensor C("C", {NUM_I, NUM_J}, {Dense, Dense}); - Tensor D("D", {NUM_J, NUM_K}, {Dense, Dense}); - A(i,k) = B(i,k) * C(i,j) * D(j,k); - IndexStmt stmt = A.getAssignment().concretize(); - bool isFirst = true; - for (auto paramSet : sddmm_parameters) { - IndexStmt scheduled = scheduleSDDMMISPC1(stmt, B, paramSet[0], paramSet[1]); - ir::Stmt compute = lower(scheduled, string("compute1_") + util::join(paramSet, "_"), false, true); - codegen->compile(compute, isFirst); - isFirst = false; - } - ofstream source_file; - source_file.open(file_path + "sddmm_cpu_ispc_taco1" + file_ending); - source_file << source1.str(); - source_file.close(); - - ofstream ispc_source_file; - ispc_source_file.open(file_path + "__sddmm_cpu_ispc_taco1" + file_ending); - ispc_source_file << source2.str(); - ispc_source_file.close(); - } - - - // sddmm - { - stringstream source1; - stringstream source2; - std::shared_ptr codegen = 
ir::CodeGen::init_default(source1, source2, ir::CodeGen::ImplementationGen); - Tensor Y("Y", {NUM_I, NUM_K}, {Dense, Dense}); - Tensor A("A", {NUM_I, NUM_K}, CSR); - Tensor X("X", {NUM_I, NUM_J}, {Dense, Dense}); - Y(i,j) = A(i,j) * X(i,k) * X(j,k); - IndexStmt stmt = Y.getAssignment().concretize(); - bool isFirst = true; - for (auto paramSet : sddmm_parameters) { - IndexStmt scheduled = scheduleSDDMMISPC2(stmt, A, paramSet[0], paramSet[1]); - ir::Stmt compute = lower(scheduled, string("compute2_") + util::join(paramSet, "_"), false, true); - codegen->compile(compute, isFirst); - isFirst = false; - } - ofstream source_file; - source_file.open(file_path + "sddmm_cpu_ispc_taco2" + file_ending); - source_file << source1.str(); - source_file.close(); - - ofstream ispc_source_file; - ispc_source_file.open(file_path + "__sddmm_cpu_ispc_taco2" + file_ending); - ispc_source_file << source2.str(); - ispc_source_file.close(); - } - - - return; -} - - - TEST(generate_evaluation_files, cpu) { if (should_use_CUDA_codegen()) { @@ -3301,61 +1982,11 @@ TEST(generate_evaluation_files, cpu) { } } -TEST(generate_evaluation_files, spmv_ispc) { - set_CUDA_codegen_enabled(false); - set_ISPC_codegen_enabled(true); - - std::cout << "executing generate_evaluation_file.ispc\n"; - - int NUM_I = 100; - int NUM_J = 100; - - vector> spmv_parameters = {}; // {NNZ_PER_THREAD, BLOCK_SIZE} - for (int i = 3; i <= 20; i++) { - spmv_parameters.push_back({i, 512}); - } - - string file_ending_c = ".c"; - string file_ending_ispc = ".ispc"; - string file_path = "eval_prepared_ispc/spmv/"; - mkdir(file_path.c_str(), 0777); - - // spmv - { - stringstream source1; - stringstream source2; - std::shared_ptr codegen = ir::CodeGen::init_default(source1, source2, ir::CodeGen::ImplementationGen); - Tensor A("A", {NUM_I, NUM_J}, CSR); - Tensor x("x", {NUM_J}, Format({Dense})); - Tensor y("y", {NUM_I}, Format({Dense})); - IndexExpr precomputed = A(i, j) * x(j); - y(i) = precomputed; - IndexStmt stmt = 
y.getAssignment().concretize(); - bool isFirst = true; - for (auto paramSet : spmv_parameters) { - IndexStmt scheduled = scheduleSpMVCPU(stmt); - ir::Stmt compute = lower(scheduled, string("compute_") + util::join(paramSet, "_"), false, true); - codegen->compile(compute, isFirst); - isFirst = false; - } - ofstream source_file1; - source_file1.open(file_path + "spmv_ispc" + file_ending_c); - source_file1 << source1.str(); - source_file1.close(); - - ofstream source_file2; - source_file2.open(file_path + "__spmv_ispc" + file_ending_ispc); - source_file2 << source2.str(); - source_file2.close(); - } -} - TEST(generate_evaluation_files, gpu) { // if (!should_use_CUDA_codegen()) { // return; // } set_CUDA_codegen_enabled(true); - set_ISPC_codegen_enabled(false); std::cout << "executing generate_evaluation_file.gpu\n"; diff --git a/test/tests-scheduling-fuse.cpp b/test/tests-scheduling-fuse.cpp index 1a941175c..2fbececfe 100644 --- a/test/tests-scheduling-fuse.cpp +++ b/test/tests-scheduling-fuse.cpp @@ -9,321 +9,321 @@ #define NUM_THREADS_TO_USE 1 // #define NUM_THREADS_TO_USE 32 -TEST(scheduling_eval, spmvFusedWithSyntheticData) { - if (should_use_CUDA_codegen()) { - return; - } - taco_set_num_threads(NUM_THREADS_TO_USE); - - std::default_random_engine gen(0); - std::uniform_real_distribution unif(0.0, 1.0); - - Format csr({dense, sparse}); - Format rm({dense}); - - // uncomment this for reading the csr matrix saved in mtx file - std::cout << "reading B mat mtx\n"; - - int NUM_I = 5; // 1021/10; - int NUM_J = 5; // 1039/10; - int NUM_K = 8; - float SPARSITY = .3; - Tensor B("B", {NUM_I, NUM_J}, csr); - srand(75883); - for (int i = 0; i < NUM_I; i++) { - for (int j = 0; j < NUM_J; j++) { - float rand_float = (float)rand()/(float)(RAND_MAX); - if (rand_float < SPARSITY) { - B.insert({i, j}, (double) ((int) (rand_float*3/SPARSITY))); - } - } - } - B.pack(); - - - std::cout << "B dim0: " << B.getDimension(0) << ", dim1: " << B.getDimension(1) << std::endl; - std::cout << 
"adding c mat\n"; - Tensor C("C", {NUM_J, NUM_K}, csr); - for (int i = 0; i < C.getDimension(0); ++i) { - for (int j = 0; j < C.getDimension(1); ++j) { - C.insert({i,j}, unif(gen)); - } - } - std::cout << "packing C mat\n"; - C.pack(); - - Tensor v("v", {NUM_K}, rm); - for (int i = 0; i < v.getDimension(0); ++i) { - v.insert({i}, unif(gen)); - } - std::cout << "packing D mat\n"; - v.pack(); - - Tensor A("A", {NUM_I}, rm); - Tensor ref("ref", {NUM_I}, rm); - IndexVar i, j, k, l, m; - A(i) = B(i,j) * C(j,k) * v(k); - - // IndexStmt stmt = A.getAssignment().concretize(); - IndexStmt stmt = makeReductionNotation(A.getAssignment()); - stmt = makeConcreteNotation(stmt); - printToFile("SpMVfused", stmt); - stmt = reorderLoopsTopologically(stmt); - stmt = loopFusionOverFission(stmt, A.getAssignment(), "f", 1); - stmt = insertTemporaries(stmt); - stmt = parallelizeOuterLoop(stmt); - - A.compile(stmt); - // We can now call the functions taco generated to assemble the indices of the - // output matrix and then actually compute the MTTKRP. 
- A.assemble(); - - - // ref(i) = B(i,j) * C(j,k) * v(k); - // IndexStmt refStmt = makeReductionNotation(ref.getAssignment()); - // refStmt = makeConcreteNotation(refStmt); - // refStmt = insertTemporaries(refStmt); - // refStmt = parallelizeOuterLoop(refStmt); - // ref.compile(refStmt); - // ref.assemble(); - - // Tensor ref1({NUM_J}, rm); - // Tensor ref2({NUM_I}, rm); - // ref1(j) = C(j,k) * v(k); - // ref2(i) = B(i,j) * ref1(j); - - // IndexStmt ref1Stmt = makeReductionNotation(ref1.getAssignment()); - // ref1Stmt = makeConcreteNotation(ref1Stmt); - // ref1Stmt = insertTemporaries(ref1Stmt); - // ref1Stmt = parallelizeOuterLoop(ref1Stmt); - // ref1.compile(ref1Stmt); - // ref1.assemble(); - - // IndexStmt ref2Stmt = makeReductionNotation(ref2.getAssignment()); - // ref2Stmt = makeConcreteNotation(ref2Stmt); - // ref2Stmt = insertTemporaries(ref2Stmt); - // ref2Stmt = parallelizeOuterLoop(ref2Stmt); - // ref2.compile(ref2Stmt); - // ref2.assemble(); - - std::cout << "compute start\n"; - taco::util::TimeResults timevalue; - bool time = true; - // TOOL_BENCHMARK_TIMER(ref.compute(), "\n\nReference Kernel: ", timevalue); - TOOL_BENCHMARK_TIMER(A.compute(), "\n\nFused Kernel: ", timevalue); - // ASSERT_TENSOR_EQ(ref, A); - - // // check results - // for (int q = 0; q < A.getDimension(0); ++q) { - // if ( abs(A(q) - ref(q))/abs(ref(q)) > ERROR_MARGIN) { - // std::cout << "error: results don't match A("<< q << "): " - // << A(q) << ", ref: " << ref(q) << std::endl; - // ASSERT_TRUE(false); - // } - // } - // // ASSERT_TENSOR_EQ(A, ref); - // TOOL_BENCHMARK_TIMER(ref1.compute(), "\n\nSDDMM Kernel: ", timevalue); - // TOOL_BENCHMARK_TIMER(ref2.compute(), "\n\nSpMM Kernel: ", timevalue); - // ASSERT_TENSOR_EQ(ref, ref2); - - // for (int q = 0; q < ref2.getDimension(0); ++q) { - // for (int w = 0; w < ref2.getDimension(1); ++w) { - // if ( abs(ref2(q,w) - ref(q,w))/abs(ref(q,w)) > ERROR_MARGIN) { - // std::cout << "error: results don't match A("<< q << "," << w << "): " - 
// << ref2(q,w) << ", ref: " << ref(q,w) << std::endl; - // ASSERT_TRUE(false); - // } - // } - // } - -} - -TEST(scheduling_eval, spmvFused) { - if (should_use_CUDA_codegen()) { - return; - } - - ofstream statfile; - statfile.open( - "/home/min/a/kadhitha/workspace/my_taco/taco/test/stats/spmv-spmv.txt", std::ios::app); - if (statfile.is_open()) { - statfile << "\nspmv-spmv execution\n"; - statfile << "\n-----------------------------------------\n"; - } - taco_set_num_threads(NUM_THREADS_TO_USE); - - std::default_random_engine gen(0); - std::uniform_real_distribution unif(0.0, 1.0); - - Format csr({dense, sparse}); - Format rm({dense}); - - - - int filenum = 1; - - std::vector matfiles = { - "/home/min/a/kadhitha/ispc-examples/data/suitesparse/synthetic/synthetic.mtx", - "/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx", - "/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx", - "/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx", - "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rma10/rma10.mtx", - "/home/min/a/kadhitha/ispc-examples/data/suitesparse/cant/cant.mtx", // 5 - "/home/min/a/kadhitha/ispc-examples/data/suitesparse/consph/consph.mtx", - "/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx", - "/home/min/a/kadhitha/ispc-examples/data/suitesparse/shipsec1/shipsec1.mtx", // 8 - "/home/min/a/kadhitha/ispc-examples/data/suitesparse/scircuit/scircuit.mtx", - "/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx", // 10 - "/home/min/a/kadhitha/ispc-examples/data/suitesparse/wtk/pwtk.mtx", - "/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx", // 12 - "/home/min/a/kadhitha/ispc-examples/data/suitesparse/wiki-Talk/wiki-Talk.mtx", // 13 - "/home/min/a/kadhitha/ispc-examples/data/suitesparse/com-Orkut/com-Orkut.mtx", - "/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx", // 15 - 
"/home/min/a/kadhitha/workspace/my_taco/FusedMM/dataset/harvard.mtx", - "/home/min/a/kadhitha/ispc-examples/data/suitesparse/twitter7/twitter7.mtx" - }; - std::vector matfilesrw = { - "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/synthetic.mtx", - "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/cage3.mtx", - "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/bcsstk17.mtx", - "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/pdb1HYS.mtx", - "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/rma10.mtx", - "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/cant.mtx", - "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/consph.mtx", - "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/cop20k_A.mtx", - "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/shipsec1.mtx", - "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/scircuit.mtx", - "/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx", - "/home/min/a/kadhitha/ispc-examples/data/suitesparse/wtk/pwtk.mtx", - "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/webbase-1M.mtx", - "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/wiki-Talk.mtx", - "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/com-Orkut.mtx", - "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/circuit5M.mtx", - "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/harvard.mtx", - "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/twitter7.mtx" - }; - - // uncomment this for reading the csr matrix saved in mtx file - std::cout << "reading B mat mtx\n"; - - - int kDim = 8; - float SPARSITY = .3; - std::string matfile = matfiles[filenum]; - std::cout << "reading B mat mtx\n"; - Tensor B = read(matfile, csr, true); - B.setName("B"); - B.pack(); - - std::cout << "B dim0: " << B.getDimension(0) << ", dim1: " << B.getDimension(1) << std::endl; - std::cout << "adding c mat\n"; - - std::cout << "reading B mat mtx\n"; - Tensor C = 
read(matfile, csr, true); - C.setName("C"); - C.pack(); - - - Tensor v("v", {C.getDimension(1)}, rm); - for (int i = 0; i < v.getDimension(0); ++i) { - v.insert({i}, unif(gen)); - } - std::cout << "packing D mat\n"; - v.pack(); - - if (statfile.is_open()) { - statfile - << "A(i) = B(i,j) * C(j,k) * v(k);" << std::endl - << "B1_dimension: " << B.getDimension(0) << ", B2_dimension: " << B.getDimension(1) << ", vals: " << B.getStorage().getValues().getSize() << std::endl - << "C1_dimension: " << C.getDimension(0) << ", C2_dimension: " << C.getDimension(1) << ", vals: " << C.getStorage().getValues().getSize() << std::endl - << "D1_dimension: " << v.getDimension(0) << ", vals: " << v.getStorage().getValues().getSize() << std::endl - << std::endl; - } - - Tensor A("A", {B.getDimension(0)}, rm); - Tensor ref("ref", {B.getDimension(0)}, rm); - IndexVar i, j, k, l, m; - A(i) = B(i,j) * C(j,k) * v(k); - - ref(i) = B(i,j) * C(j,k) * v(k); - IndexStmt refStmt = makeReductionNotation(ref.getAssignment()); - refStmt = makeConcreteNotation(refStmt); - refStmt = insertTemporaries(refStmt); - refStmt = parallelizeOuterLoop(refStmt); - ref.compile(refStmt); - ref.assemble(); - - // IndexStmt stmt = A.getAssignment().concretize(); - IndexStmt stmt = makeReductionNotation(A.getAssignment()); - stmt = makeConcreteNotation(stmt); - printToFile("SpMVfused", stmt); - stmt = reorderLoopsTopologically(stmt); - stmt = loopFusionOverFission(stmt, A.getAssignment(), "f", 1); - stmt = insertTemporaries(stmt); - stmt = parallelizeOuterLoop(stmt); - A.compile(stmt); - A.assemble(); - - - // Tensor ref1({NUM_J}, rm); - // Tensor ref2({NUM_I}, rm); - // ref1(j) = C(j,k) * v(k); - // ref2(i) = B(i,j) * ref1(j); - - // IndexStmt ref1Stmt = makeReductionNotation(ref1.getAssignment()); - // ref1Stmt = makeConcreteNotation(ref1Stmt); - // ref1Stmt = insertTemporaries(ref1Stmt); - // ref1Stmt = parallelizeOuterLoop(ref1Stmt); - // ref1.compile(ref1Stmt); - // ref1.assemble(); - - // IndexStmt ref2Stmt = 
makeReductionNotation(ref2.getAssignment()); - // ref2Stmt = makeConcreteNotation(ref2Stmt); - // ref2Stmt = insertTemporaries(ref2Stmt); - // ref2Stmt = parallelizeOuterLoop(ref2Stmt); - // ref2.compile(ref2Stmt); - // ref2.assemble(); - - std::cout << "compute start\n"; - taco::util::TimeResults timevalue; - bool time = true; - std::string sofused = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/spmv_spmv/spmv_fused.so"; - - TOOL_BENCHMARK_TIMER(ref.compute(statfile, sofused), "\n\nReference Kernel: ", timevalue); +// TEST(scheduling_eval, spmvFusedWithSyntheticData) { +// if (should_use_CUDA_codegen()) { +// return; +// } +// taco_set_num_threads(NUM_THREADS_TO_USE); + +// std::default_random_engine gen(0); +// std::uniform_real_distribution unif(0.0, 1.0); + +// Format csr({dense, sparse}); +// Format rm({dense}); + +// // uncomment this for reading the csr matrix saved in mtx file +// std::cout << "reading B mat mtx\n"; + +// int NUM_I = 5; // 1021/10; +// int NUM_J = 5; // 1039/10; +// int NUM_K = 8; +// float SPARSITY = .3; +// Tensor B("B", {NUM_I, NUM_J}, csr); +// srand(75883); +// for (int i = 0; i < NUM_I; i++) { +// for (int j = 0; j < NUM_J; j++) { +// float rand_float = (float)rand()/(float)(RAND_MAX); +// if (rand_float < SPARSITY) { +// B.insert({i, j}, (double) ((int) (rand_float*3/SPARSITY))); +// } +// } +// } +// B.pack(); + + +// std::cout << "B dim0: " << B.getDimension(0) << ", dim1: " << B.getDimension(1) << std::endl; +// std::cout << "adding c mat\n"; +// Tensor C("C", {NUM_J, NUM_K}, csr); +// for (int i = 0; i < C.getDimension(0); ++i) { +// for (int j = 0; j < C.getDimension(1); ++j) { +// C.insert({i,j}, unif(gen)); +// } +// } +// std::cout << "packing C mat\n"; +// C.pack(); + +// Tensor v("v", {NUM_K}, rm); +// for (int i = 0; i < v.getDimension(0); ++i) { +// v.insert({i}, unif(gen)); +// } +// std::cout << "packing D mat\n"; +// v.pack(); + +// Tensor A("A", {NUM_I}, rm); +// Tensor ref("ref", {NUM_I}, rm); +// 
IndexVar i, j, k, l, m; +// A(i) = B(i,j) * C(j,k) * v(k); + +// // IndexStmt stmt = A.getAssignment().concretize(); +// IndexStmt stmt = makeReductionNotation(A.getAssignment()); +// stmt = makeConcreteNotation(stmt); +// printToFile("SpMVfused", stmt); +// stmt = reorderLoopsTopologically(stmt); +// stmt = loopFusionOverFission(stmt, A.getAssignment(), "f", 1); +// stmt = insertTemporaries(stmt); +// stmt = parallelizeOuterLoop(stmt); + +// A.compile(stmt); +// // We can now call the functions taco generated to assemble the indices of the +// // output matrix and then actually compute the MTTKRP. +// A.assemble(); + + +// // ref(i) = B(i,j) * C(j,k) * v(k); +// // IndexStmt refStmt = makeReductionNotation(ref.getAssignment()); +// // refStmt = makeConcreteNotation(refStmt); +// // refStmt = insertTemporaries(refStmt); +// // refStmt = parallelizeOuterLoop(refStmt); +// // ref.compile(refStmt); +// // ref.assemble(); + +// // Tensor ref1({NUM_J}, rm); +// // Tensor ref2({NUM_I}, rm); +// // ref1(j) = C(j,k) * v(k); +// // ref2(i) = B(i,j) * ref1(j); + +// // IndexStmt ref1Stmt = makeReductionNotation(ref1.getAssignment()); +// // ref1Stmt = makeConcreteNotation(ref1Stmt); +// // ref1Stmt = insertTemporaries(ref1Stmt); +// // ref1Stmt = parallelizeOuterLoop(ref1Stmt); +// // ref1.compile(ref1Stmt); +// // ref1.assemble(); + +// // IndexStmt ref2Stmt = makeReductionNotation(ref2.getAssignment()); +// // ref2Stmt = makeConcreteNotation(ref2Stmt); +// // ref2Stmt = insertTemporaries(ref2Stmt); +// // ref2Stmt = parallelizeOuterLoop(ref2Stmt); +// // ref2.compile(ref2Stmt); +// // ref2.assemble(); + +// std::cout << "compute start\n"; +// taco::util::TimeResults timevalue; +// bool time = true; +// // TOOL_BENCHMARK_TIMER(ref.compute(), "\n\nReference Kernel: ", timevalue); +// TOOL_BENCHMARK_TIMER(A.compute(), "\n\nFused Kernel: ", timevalue); +// // ASSERT_TENSOR_EQ(ref, A); + +// // // check results +// // for (int q = 0; q < A.getDimension(0); ++q) { +// // if ( 
abs(A(q) - ref(q))/abs(ref(q)) > ERROR_MARGIN) { +// // std::cout << "error: results don't match A("<< q << "): " +// // << A(q) << ", ref: " << ref(q) << std::endl; +// // ASSERT_TRUE(false); +// // } +// // } +// // // ASSERT_TENSOR_EQ(A, ref); +// // TOOL_BENCHMARK_TIMER(ref1.compute(), "\n\nSDDMM Kernel: ", timevalue); +// // TOOL_BENCHMARK_TIMER(ref2.compute(), "\n\nSpMM Kernel: ", timevalue); +// // ASSERT_TENSOR_EQ(ref, ref2); + +// // for (int q = 0; q < ref2.getDimension(0); ++q) { +// // for (int w = 0; w < ref2.getDimension(1); ++w) { +// // if ( abs(ref2(q,w) - ref(q,w))/abs(ref(q,w)) > ERROR_MARGIN) { +// // std::cout << "error: results don't match A("<< q << "," << w << "): " +// // << ref2(q,w) << ", ref: " << ref(q,w) << std::endl; +// // ASSERT_TRUE(false); +// // } +// // } +// // } + +// } + +// TEST(scheduling_eval, spmvFused) { +// if (should_use_CUDA_codegen()) { +// return; +// } + +// ofstream statfile; +// statfile.open( +// "/home/min/a/kadhitha/workspace/my_taco/taco/test/stats/spmv-spmv.txt", std::ios::app); +// if (statfile.is_open()) { +// statfile << "\nspmv-spmv execution\n"; +// statfile << "\n-----------------------------------------\n"; +// } +// taco_set_num_threads(NUM_THREADS_TO_USE); + +// std::default_random_engine gen(0); +// std::uniform_real_distribution unif(0.0, 1.0); + +// Format csr({dense, sparse}); +// Format rm({dense}); + + + +// int filenum = 1; + +// std::vector matfiles = { +// "/home/min/a/kadhitha/ispc-examples/data/suitesparse/synthetic/synthetic.mtx", +// "/home/min/a/kadhitha/ispc-examples/data/suitesparse/cage3/cage3.mtx", +// "/home/min/a/kadhitha/ispc-examples/data/suitesparse/bcsstk17/bcsstk17.mtx", +// "/home/min/a/kadhitha/ispc-examples/data/suitesparse/pdb1HYS/pdb1HYS.mtx", +// "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rma10/rma10.mtx", +// "/home/min/a/kadhitha/ispc-examples/data/suitesparse/cant/cant.mtx", // 5 +// "/home/min/a/kadhitha/ispc-examples/data/suitesparse/consph/consph.mtx", 
+// "/home/min/a/kadhitha/ispc-examples/data/suitesparse/cop20k_A/cop20k_A.mtx", +// "/home/min/a/kadhitha/ispc-examples/data/suitesparse/shipsec1/shipsec1.mtx", // 8 +// "/home/min/a/kadhitha/ispc-examples/data/suitesparse/scircuit/scircuit.mtx", +// "/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx", // 10 +// "/home/min/a/kadhitha/ispc-examples/data/suitesparse/wtk/pwtk.mtx", +// "/home/min/a/kadhitha/ispc-examples/data/ufl/webbase-1M/webbase-1M.mtx", // 12 +// "/home/min/a/kadhitha/ispc-examples/data/suitesparse/wiki-Talk/wiki-Talk.mtx", // 13 +// "/home/min/a/kadhitha/ispc-examples/data/suitesparse/com-Orkut/com-Orkut.mtx", +// "/home/min/a/kadhitha/ispc-examples/data/suitesparse/circuit5M/circuit5M.mtx", // 15 +// "/home/min/a/kadhitha/workspace/my_taco/FusedMM/dataset/harvard.mtx", +// "/home/min/a/kadhitha/ispc-examples/data/suitesparse/twitter7/twitter7.mtx" +// }; +// std::vector matfilesrw = { +// "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/synthetic.mtx", +// "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/cage3.mtx", +// "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/bcsstk17.mtx", +// "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/pdb1HYS.mtx", +// "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/rma10.mtx", +// "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/cant.mtx", +// "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/consph.mtx", +// "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/cop20k_A.mtx", +// "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/shipsec1.mtx", +// "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/scircuit.mtx", +// "/home/min/a/kadhitha/ispc-examples/data/suitesparse/mac_econ_fwd500/mac_econ_fwd500.mtx", +// "/home/min/a/kadhitha/ispc-examples/data/suitesparse/wtk/pwtk.mtx", +// "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/webbase-1M.mtx", +// 
"/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/wiki-Talk.mtx", +// "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/com-Orkut.mtx", +// "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/circuit5M.mtx", +// "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/harvard.mtx", +// "/home/min/a/kadhitha/ispc-examples/data/suitesparse/rw/twitter7.mtx" +// }; + +// // uncomment this for reading the csr matrix saved in mtx file +// std::cout << "reading B mat mtx\n"; + + +// int kDim = 8; +// float SPARSITY = .3; +// std::string matfile = matfiles[filenum]; +// std::cout << "reading B mat mtx\n"; +// Tensor B = read(matfile, csr, true); +// B.setName("B"); +// B.pack(); + +// std::cout << "B dim0: " << B.getDimension(0) << ", dim1: " << B.getDimension(1) << std::endl; +// std::cout << "adding c mat\n"; + +// std::cout << "reading B mat mtx\n"; +// Tensor C = read(matfile, csr, true); +// C.setName("C"); +// C.pack(); + + +// Tensor v("v", {C.getDimension(1)}, rm); +// for (int i = 0; i < v.getDimension(0); ++i) { +// v.insert({i}, unif(gen)); +// } +// std::cout << "packing D mat\n"; +// v.pack(); + +// if (statfile.is_open()) { +// statfile +// << "A(i) = B(i,j) * C(j,k) * v(k);" << std::endl +// << "B1_dimension: " << B.getDimension(0) << ", B2_dimension: " << B.getDimension(1) << ", vals: " << B.getStorage().getValues().getSize() << std::endl +// << "C1_dimension: " << C.getDimension(0) << ", C2_dimension: " << C.getDimension(1) << ", vals: " << C.getStorage().getValues().getSize() << std::endl +// << "D1_dimension: " << v.getDimension(0) << ", vals: " << v.getStorage().getValues().getSize() << std::endl +// << std::endl; +// } + +// Tensor A("A", {B.getDimension(0)}, rm); +// Tensor ref("ref", {B.getDimension(0)}, rm); +// IndexVar i, j, k, l, m; +// A(i) = B(i,j) * C(j,k) * v(k); + +// ref(i) = B(i,j) * C(j,k) * v(k); +// IndexStmt refStmt = makeReductionNotation(ref.getAssignment()); +// refStmt = makeConcreteNotation(refStmt); +// refStmt = 
insertTemporaries(refStmt); +// refStmt = parallelizeOuterLoop(refStmt); +// ref.compile(refStmt); +// ref.assemble(); + +// // IndexStmt stmt = A.getAssignment().concretize(); +// IndexStmt stmt = makeReductionNotation(A.getAssignment()); +// stmt = makeConcreteNotation(stmt); +// printToFile("SpMVfused", stmt); +// stmt = reorderLoopsTopologically(stmt); +// stmt = loopFusionOverFission(stmt, A.getAssignment(), "f", 1); +// stmt = insertTemporaries(stmt); +// stmt = parallelizeOuterLoop(stmt); +// A.compile(stmt); +// A.assemble(); + + +// // Tensor ref1({NUM_J}, rm); +// // Tensor ref2({NUM_I}, rm); +// // ref1(j) = C(j,k) * v(k); +// // ref2(i) = B(i,j) * ref1(j); + +// // IndexStmt ref1Stmt = makeReductionNotation(ref1.getAssignment()); +// // ref1Stmt = makeConcreteNotation(ref1Stmt); +// // ref1Stmt = insertTemporaries(ref1Stmt); +// // ref1Stmt = parallelizeOuterLoop(ref1Stmt); +// // ref1.compile(ref1Stmt); +// // ref1.assemble(); + +// // IndexStmt ref2Stmt = makeReductionNotation(ref2.getAssignment()); +// // ref2Stmt = makeConcreteNotation(ref2Stmt); +// // ref2Stmt = insertTemporaries(ref2Stmt); +// // ref2Stmt = parallelizeOuterLoop(ref2Stmt); +// // ref2.compile(ref2Stmt); +// // ref2.assemble(); + +// std::cout << "compute start\n"; +// taco::util::TimeResults timevalue; +// bool time = true; +// std::string sofused = "/home/min/a/kadhitha/workspace/my_taco/taco/test/kernels/spmv_spmv/spmv_fused.so"; + +// TOOL_BENCHMARK_TIMER(ref.compute(statfile, sofused), "\n\nReference Kernel: ", timevalue); - std::cout << "b1 dim: " << B.getTacoTensorT()->dimensions[1] << std::endl; - // TOOL_BENCHMARK_TIMER(ref.compute(statfile, sofused), "\n\nFused Kernel: ", timevalue); - // ASSERT_TENSOR_EQ(ref, A); - - // // check results - // for (int q = 0; q < A.getDimension(0); ++q) { - // if ( abs(A(q) - ref(q))/abs(ref(q)) > ERROR_MARGIN) { - // std::cout << "error: results don't match A("<< q << "): " - // << A(q) << ", ref: " << ref(q) << std::endl; - // 
ASSERT_TRUE(false); - // } - // } - // // ASSERT_TENSOR_EQ(A, ref); - // TOOL_BENCHMARK_TIMER(ref1.compute(), "\n\nSDDMM Kernel: ", timevalue); - // TOOL_BENCHMARK_TIMER(ref2.compute(), "\n\nSpMM Kernel: ", timevalue); - // ASSERT_TENSOR_EQ(ref, ref2); - - // for (int q = 0; q < ref2.getDimension(0); ++q) { - // for (int w = 0; w < ref2.getDimension(1); ++w) { - // if ( abs(ref2(q,w) - ref(q,w))/abs(ref(q,w)) > ERROR_MARGIN) { - // std::cout << "error: results don't match A("<< q << "," << w << "): " - // << ref2(q,w) << ", ref: " << ref(q,w) << std::endl; - // ASSERT_TRUE(false); - // } - // } - // } - - if (statfile.is_open()) { - statfile.close(); - } - -} +// std::cout << "b1 dim: " << B.getTacoTensorT()->dimensions[1] << std::endl; +// // TOOL_BENCHMARK_TIMER(ref.compute(statfile, sofused), "\n\nFused Kernel: ", timevalue); +// // ASSERT_TENSOR_EQ(ref, A); + +// // // check results +// // for (int q = 0; q < A.getDimension(0); ++q) { +// // if ( abs(A(q) - ref(q))/abs(ref(q)) > ERROR_MARGIN) { +// // std::cout << "error: results don't match A("<< q << "): " +// // << A(q) << ", ref: " << ref(q) << std::endl; +// // ASSERT_TRUE(false); +// // } +// // } +// // // ASSERT_TENSOR_EQ(A, ref); +// // TOOL_BENCHMARK_TIMER(ref1.compute(), "\n\nSDDMM Kernel: ", timevalue); +// // TOOL_BENCHMARK_TIMER(ref2.compute(), "\n\nSpMM Kernel: ", timevalue); +// // ASSERT_TENSOR_EQ(ref, ref2); + +// // for (int q = 0; q < ref2.getDimension(0); ++q) { +// // for (int w = 0; w < ref2.getDimension(1); ++w) { +// // if ( abs(ref2(q,w) - ref(q,w))/abs(ref(q,w)) > ERROR_MARGIN) { +// // std::cout << "error: results don't match A("<< q << "," << w << "): " +// // << ref2(q,w) << ", ref: " << ref(q,w) << std::endl; +// // ASSERT_TRUE(false); +// // } +// // } +// // } + +// if (statfile.is_open()) { +// statfile.close(); +// } + +// } TEST(scheduling_eval, sddmmFusedWithSyntheticData) { if (should_use_CUDA_codegen()) { diff --git a/test/util.h b/test/util.h index f96087ba1..0f8b633e6 
100644 --- a/test/util.h +++ b/test/util.h @@ -4,7 +4,6 @@ #include #include #include -#include #include #include #include @@ -59,7 +58,6 @@ using namespace taco; static void printToCout(IndexStmt stmt); static void printToFile(string filename, IndexStmt stmt); -static void printToFile(string filename, string additional_filename, IndexStmt stmt); static void printToCout(IndexStmt stmt) { @@ -85,29 +83,4 @@ void printToFile(string filename, IndexStmt stmt) { source_file.close(); } -void printToFile(string filename, string additional_filename, IndexStmt stmt) { - stringstream source1; - stringstream source2; - - string file_path = "eval_generated/"; - mkdir(file_path.c_str(), 0777); - - std::shared_ptr codegen = ir::CodeGen::init_default(source1, source2, ir::CodeGen::ImplementationGen); - ir::Stmt compute = lower(stmt, "compute", false, true); - codegen->compile(compute, true); - - ofstream source_file; - string file_ending = should_use_CUDA_codegen() ? ".cu" : ".c"; - source_file.open(file_path+filename+file_ending); - source_file << source1.str(); - source_file.close(); - - ofstream additional_source_file; - string additional_file_ending = ".ispc"; - additional_source_file.open(file_path+additional_filename+additional_file_ending); - additional_source_file << source2.str(); - additional_source_file.close(); - -} - #endif // __SCHEDULE_UTIL_HH__ \ No newline at end of file From 79ce4e7367d12acfc6fb6e3c9aac74bb1d4b54cf Mon Sep 17 00:00:00 2001 From: Adhhitha Dias Date: Wed, 11 May 2022 16:39:00 -0400 Subject: [PATCH 13/16] task: remove ispc related content partially --- CMakeLists.txt | 6 +- include/taco/lower/lowerer_impl_imperative.h | 3 - include/taco/util/strings.h | 21 --- src/lower/lowerer_impl_imperative.cpp | 168 +------------------ test/CMakeLists.txt | 1 - tools/taco.cpp | 65 +------ 6 files changed, 13 insertions(+), 251 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index e9ec7be7a..bbc678c72 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ 
-104,8 +104,7 @@ if(COVERAGE) message("-- Code coverage analysis (gcovr) enabled") endif(COVERAGE) -set(C_CXX_FLAGS "${C_CXX_FLAGS} -I/${PAPI_DIR}/include -L/${PAPI_DIR}/lib") -# set(C_CXX_FLAGS "${C_CXX_FLAGS}") +set(C_CXX_FLAGS "${C_CXX_FLAGS}") set(CMAKE_C_FLAGS "${C_CXX_FLAGS}") set(CMAKE_CXX_FLAGS "${C_CXX_FLAGS} -std=c++14") @@ -118,9 +117,6 @@ set(TACO_INCLUDE_DIR ${TACO_PROJECT_DIR}/include) enable_testing() include_directories(${TACO_INCLUDE_DIR}) -# include_directories("/home/min/a/kadhitha/workspace/my_taco/valgrind") -# project (ValgrindExample) -# include (CTest) set(TACO_LIBRARY_DIR ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}) diff --git a/include/taco/lower/lowerer_impl_imperative.h b/include/taco/lower/lowerer_impl_imperative.h index d743f5875..65f069fda 100644 --- a/include/taco/lower/lowerer_impl_imperative.h +++ b/include/taco/lower/lowerer_impl_imperative.h @@ -499,13 +499,10 @@ class LowererImplImperative : public LowererImpl { bool emitUnderivedGuards = true; - int loopDepth = 0; int inParallelLoopDepth = 0; std::map parallelUnitSizes; std::map parallelUnitIndexVars; - std::map forUnits; // - std::map whereTempsWithLoopDepth; /// Keep track of what IndexVars have already been defined std::set definedIndexVars; diff --git a/include/taco/util/strings.h b/include/taco/util/strings.h index a3c3d863f..35d2c3949 100644 --- a/include/taco/util/strings.h +++ b/include/taco/util/strings.h @@ -9,8 +9,6 @@ #include #include -#include "taco/type.h" - // To get the value of a compiler macro variable #define STRINGIFY(x) #x #define TO_STRING(x) STRINGIFY(x) @@ -18,25 +16,6 @@ namespace taco { namespace util { -// /// Turn anything except floating points that can be written to a stream -// /// into a string. 
-// template -// typename std::enable_if::value, std::string>::type -// toStringISPC(const T &val) { - -// std::stringstream sstream; -// if (val == Int32) { -// sstream << "int32"; -// } -// else if (val == Int64) { -// sstream << "int64"; -// } -// else { -// sstream << val; -// } -// return sstream.str(); -// } - /// Turn anything except floating points that can be written to a stream /// into a string. template diff --git a/src/lower/lowerer_impl_imperative.cpp b/src/lower/lowerer_impl_imperative.cpp index 1355c80a1..eed0c4174 100644 --- a/src/lower/lowerer_impl_imperative.cpp +++ b/src/lower/lowerer_impl_imperative.cpp @@ -28,7 +28,6 @@ class LowererImplImperative::Visitor : public IndexNotationVisitorStrict { public: Visitor(LowererImplImperative* impl) : impl(impl) {} Stmt lower(IndexStmt stmt) { - // std::cout << "lowering IndexStmt to ir:Stmt - IndexStmt: " << stmt << std::endl; this->stmt = Stmt(); impl->accessibleIterators.scope(); IndexStmtVisitorStrict::visit(stmt); @@ -138,7 +137,6 @@ static bool returnsTrue(IndexExpr expr) { } void visit(const CastNode* op) { - std::cout << "visiting cast node\n"; expr = rewrite(op->a); } @@ -204,7 +202,6 @@ static std::set hasSparseInserts(IndexStmt stmt, Iterators iterators, return ret; } - Stmt LowererImplImperative::lower(IndexStmt stmt, string name, bool assemble, bool compute, bool pack, bool unpack) @@ -419,7 +416,6 @@ LowererImplImperative::lower(IndexStmt stmt, string name, Stmt LowererImplImperative::lowerAssignment(Assignment assignment) { - // std::cout << "\n\n converting assignment IndexStmt============================================ Assignment\n"; taco_iassert(generateAssembleCode() || generateComputeCode()); Stmt computeStmt; @@ -427,7 +423,6 @@ Stmt LowererImplImperative::lowerAssignment(Assignment assignment) Expr var = getTensorVar(result); const bool needComputeAssign = util::contains(needCompute, result); - // std::cout << "does assignment need compute assign: " << needComputeAssign << 
std::endl; Expr rhs; if (needComputeAssign) { rhs = lower(assignment.getRhs()); @@ -435,51 +430,20 @@ Stmt LowererImplImperative::lowerAssignment(Assignment assignment) // Assignment to scalar variables. if (isScalar(result.getType())) { - // std::cout << "assignment to scalar variables\n"; if (needComputeAssign) { - // std::cout << "compute assign\n"; if (!assignment.getOperator().defined()) { - // std::cout << "assignment operator is not defined\n"; - // std::cout << "var: " << var << ", rhs, : " << rhs << std::endl; computeStmt = Assign::make(var, rhs); } else { taco_iassert(isa(assignment.getOperator())); - - // std::cout << "assignment depth -- loopDepth: " << loopDepth << std::endl; - // std::cout << "is markAssignsAtomicDepth > 0: " << (markAssignsAtomicDepth > 0) << std::endl; - // for (auto &tensors_ : whereTemps) { - // // std::cout << tensors_ << ", "; - // } - // std::cout << std::endl; - // std::cout << result << std::endl; - int tempVarInitLoopDepth = whereTempsWithLoopDepth.find(result)->second; - // std::cout << "tempInitLoopDepth: " << tempVarInitLoopDepth << std::endl; - - bool reduction = false; - std::map::iterator itr; - for (itr = forUnits.begin(); itr!=forUnits.end(); ++itr) { - if (itr->first<=loopDepth && itr->first>tempVarInitLoopDepth && itr->second == ParallelUnit::CPUSimd) { - reduction = true; - } - // std::cout << itr->first << "\t" << ParallelUnit_NAMES[(int) itr->second] << std::endl; - } - - // less than or equal to loopDepth but greater than temp variable initialized loop depth - bool useAtomics = markAssignsAtomicDepth > 0 && (!util::contains(whereTemps, result) || reduction); - // std::cout << "whereTemps and result: " << !util::contains(whereTemps, result) << std::endl; - // std::cout << "assignment to scalar variables useAtomics: " << useAtomics << std::endl; + bool useAtomics = markAssignsAtomicDepth > 0 && + !util::contains(whereTemps, result); computeStmt = compoundAssign(var, rhs, useAtomics, atomicParallelUnit); - // 
std::cout << "computeStatment: " << computeStmt << std::endl; } } - else { - // std::cout << "not compute assign\n"; - } } // Assignments to tensor variables (non-scalar). else { - // std::cout << "assignment to tensor variables\n"; Expr values = getValuesArray(result); Expr loc = generateValueLocExpr(assignment.getLhs()); @@ -513,7 +477,6 @@ Stmt LowererImplImperative::lowerAssignment(Assignment assignment) } if (needComputeAssign && values.defined()) { - // std::cout << "assign compute statement\n"; if (!assignment.getOperator().defined()) { computeStmt = Store::make(values, loc, rhs); } @@ -624,39 +587,20 @@ LowererImplImperative::splitAppenderAndInserters(const vector& results } -// important function -/* -* This is the for loop lowering part -*/ - Stmt LowererImplImperative::lowerForall(Forall forall) { - loopDepth++; - forUnits.insert(std::pair(loopDepth,forall.getParallelUnit())); - // std::cout << "doing lowerForall: " << forall << std::endl; bool hasExactBound = provGraph.hasExactBound(forall.getIndexVar()); bool forallNeedsUnderivedGuards = !hasExactBound && emitUnderivedGuards; - - // std::cout << "printing temporary variables with their atomic depths\n"; - map::iterator itr; - for (itr = whereTempsWithLoopDepth.begin(); itr != whereTempsWithLoopDepth.end(); ++itr) { - // std::cout << itr->first << "\t" << itr->second << "\n"; - } - - if (!ignoreVectorize && forallNeedsUnderivedGuards && (forall.getParallelUnit() == ParallelUnit::CPUVector || forall.getUnrollFactor() > 0)) { - // std::cout << "calling lowerForallCloned(forall)\n"; return lowerForallCloned(forall); } - // std::cout << "inParallelLoopDepth: " << inParallelLoopDepth << "========================\n"; if (forall.getParallelUnit() != ParallelUnit::NotParallel) { inParallelLoopDepth++; } - // std::cout << "inParallelLoopDepth: " << inParallelLoopDepth << "========================\n"; // Recover any available parents that were not recoverable previously vector recoverySteps; @@ -844,23 +788,19 @@ 
Stmt LowererImplImperative::lowerForall(Forall forall) } if (!isWhereProducer && hasPosDescendant && underivedAncestors.size() > 1 && provGraph.isPosVariable(iterator.getIndexVar()) && posDescendant == forall.getIndexVar()) { - // std::cout << "calling lowerForallFusedPosition(forall\n"; loops = lowerForallFusedPosition(forall, iterator, locators, inserters, appenders, reducedAccesses, recoveryStmt); } else if (canAccelWithSparseIteration) { - // std::cout << "calling lowerForallDenseAcceleration(forall\n"; loops = lowerForallDenseAcceleration(forall, locators, inserters, appenders, reducedAccesses, recoveryStmt); } // Emit dimension coordinate iteration loop else if (iterator.isDimensionIterator()) { - // std::cout << "calling lowerForallDimension(forall\n"; loops = lowerForallDimension(forall, point.locators(), inserters, appenders, reducedAccesses, recoveryStmt); } // Emit position iteration loop else if (iterator.hasPosIter()) { - // std::cout << "calling lowerForallPosition(forall\n"; loops = lowerForallPosition(forall, iterator, locators, inserters, appenders, reducedAccesses, recoveryStmt); } @@ -878,12 +818,7 @@ Stmt LowererImplImperative::lowerForall(Forall forall) loops = lowerMergeLattice(lattice, underivedAncestors[0], forall.getStmt(), reducedAccesses); } - - // std::cout << "printing loops ----------------------------------------------------------------------------------------------\n"; - // std::cout << loops << std::endl; - // std::cout << "loops printed -----------------------------------------------------------------------------------------------\n"; // taco_iassert(loops.defined()); - if (!generateComputeCode() && !hasStores(loops)) { // If assembly loop does not modify output arrays, then it can be safely // omitted. 
@@ -898,9 +833,7 @@ Stmt LowererImplImperative::lowerForall(Forall forall) parallelUnitIndexVars.erase(forall.getParallelUnit()); parallelUnitSizes.erase(forall.getParallelUnit()); } - - forUnits.erase(loopDepth); - loopDepth--; + return Block::blanks(preInitValues, temporaryValuesInitFree[0], loops, @@ -1205,22 +1138,13 @@ Stmt LowererImplImperative::lowerForallDimension(Forall forall, set reducedAccesses, ir::Stmt recoveryStmt) { - // std::cout << "1 Stmt LowererImplImperative::lowerForallDimension\n"; - // std::cout << "1 Stmt LowererImplImperative::lowerForallDimension markAssignsAtomicDepth: " << markAssignsAtomicDepth << std::endl; Expr coordinate = getCoordinateVar(forall.getIndexVar()); if (forall.getParallelUnit() != ParallelUnit::NotParallel && forall.getOutputRaceStrategy() == OutputRaceStrategy::Atomics) { markAssignsAtomicDepth++; - // std::cout << "1 Stmt LowererImplImperative::lowerForallDimension getParallelUnit() is Not NotParallel and outputRaceStrategy is Atomics\n"; - // std::cout << "markAssignsAtomicDepth: " << markAssignsAtomicDepth << std::endl; atomicParallelUnit = forall.getParallelUnit(); } - else { - // std::cout << "1 Stmt LowererImplImperative::lowerForallDimension getParallelUnit() is NotParallel or outputRaceStrategy is not Atomics\n"; - } - // std::cout << "original forall : " << forall << std::endl; - // std::cout << "inside IndexStmt: " << forall.getStmt() << std::endl; Stmt body = lowerForallBody(coordinate, forall.getStmt(), locators, inserters, appenders, reducedAccesses); @@ -1236,18 +1160,7 @@ Stmt LowererImplImperative::lowerForallDimension(Forall forall, std::vector bounds = provGraph.deriveIterBounds(forall.getIndexVar(), definedIndexVarsOrdered, underivedBounds, indexVarToExprMap, iterators); LoopKind kind = LoopKind::Serial; - if (should_use_ISPC_codegen()) { - // std::cout << "Foreach compatible loop\n"; - if (forall.getParallelUnit() == ParallelUnit::CPUSimd) { - kind = LoopKind::Foreach; - } - else if 
(forall.getParallelUnit() == ParallelUnit::CPUSpmd - && forall.getOutputRaceStrategy() != OutputRaceStrategy::ParallelReduction - ) { - kind = LoopKind::Mul_Thread; - } - } - else if (forall.getParallelUnit() == ParallelUnit::CPUVector && !ignoreVectorize) { + if (forall.getParallelUnit() == ParallelUnit::CPUVector && !ignoreVectorize) { kind = LoopKind::Vectorized; } else if (forall.getParallelUnit() != ParallelUnit::NotParallel @@ -1255,7 +1168,6 @@ Stmt LowererImplImperative::lowerForallDimension(Forall forall, kind = LoopKind::Runtime; } - // std::cout << "2 Stmt LowererImplImperative::lowerForallDimension\n"; return Block::blanks(For::make(coordinate, bounds[0], bounds[1], 1, body, kind, ignoreVectorize ? ParallelUnit::NotParallel : forall.getParallelUnit(), ignoreVectorize ? 0 : forall.getUnrollFactor()), @@ -1269,7 +1181,6 @@ Stmt LowererImplImperative::lowerForallDimension(Forall forall, set reducedAccesses, ir::Stmt recoveryStmt) { - // std::cout << "1 Stmt LowererImplImperative::lowerForallDenseAcceleration\n"; taco_iassert(locators.size() == 1) << "Optimizing a dense workspace is only supported when the consumer is the only RHS tensor"; taco_iassert(provGraph.isFullyDerived(forall.getIndexVar())) << "Sparsely accelerating a dense workspace only works with fully derived index vars"; taco_iassert(forall.getParallelUnit() == ParallelUnit::NotParallel) << "Sparsely accelerating a dense workspace only works within serial loops"; @@ -1295,8 +1206,6 @@ Stmt LowererImplImperative::lowerForallDimension(Forall forall, } Stmt declareVar = VarDecl::make(coordinate, Load::make(indexList, loopVar)); - // std::cout << "original forall : " << forall << std::endl; - // std::cout << "inside IndexStmt: " << forall.getStmt() << std::endl; Stmt body = lowerForallBody(coordinate, forall.getStmt(), locators, inserters, appenders, reducedAccesses); Stmt resetGuard = ir::Store::make(bitGuard, coordinate, ir::Literal::make(false), markAssignsAtomicDepth > 0, atomicParallelUnit); 
@@ -1309,12 +1218,7 @@ Stmt LowererImplImperative::lowerForallDimension(Forall forall, Stmt posAppend = generateAppendPositions(appenders); LoopKind kind = LoopKind::Serial; - if (should_use_ISPC_codegen()) { - if (forall.getParallelUnit() == ParallelUnit::CPUSimd) { - kind = LoopKind::Foreach; - } - } - else if (forall.getParallelUnit() == ParallelUnit::CPUVector && !ignoreVectorize) { + if (forall.getParallelUnit() == ParallelUnit::CPUVector && !ignoreVectorize) { kind = LoopKind::Vectorized; } else if (forall.getParallelUnit() != ParallelUnit::NotParallel @@ -1322,7 +1226,6 @@ Stmt LowererImplImperative::lowerForallDimension(Forall forall, kind = LoopKind::Runtime; } - // std::cout << "2 Stmt LowererImplImperative::lowerForallDenseAcceleration\n"; return Block::blanks(For::make(loopVar, 0, indexListSize, 1, body, kind, ignoreVectorize ? ParallelUnit::NotParallel : forall.getParallelUnit(), ignoreVectorize ? 0 : forall.getUnrollFactor()), @@ -1346,8 +1249,6 @@ Stmt LowererImplImperative::lowerForallPosition(Forall forall, Iterator iterator set reducedAccesses, ir::Stmt recoveryStmt) { - // std::cout << "1 Stmt LowererImplImperative::lowerForallPosition\n" << std::endl; - Expr coordinate = getCoordinateVar(forall.getIndexVar()); Stmt declareCoordinate = Stmt(); Stmt strideGuard = Stmt(); @@ -1379,11 +1280,6 @@ Stmt LowererImplImperative::lowerForallPosition(Forall forall, Iterator iterator markAssignsAtomicDepth++; } - // see we are inside a forall. 
ex: forall(i, forall(j, y(i) += A(i,j) * x(j))) - // when you call forall.getStmt it returns forall(j, y(i) += A(i,j) * x(j)) which is the - // IndexStmt inside the forall IndexStmt - // std::cout << "original forall : " << forall << std::endl; - // std::cout << "inside IndexStmt: " << forall.getStmt() << std::endl; Stmt body = lowerForallBody(coordinate, forall.getStmt(), locators, inserters, appenders, reducedAccesses); @@ -1445,7 +1341,6 @@ Stmt LowererImplImperative::lowerForallPosition(Forall forall, Iterator iterator kind = LoopKind::Runtime; } - // std::cout << "2 Stmt LowererImplImperative::lowerForallPosition\n" << std::endl; // Loop with preamble and postamble return Block::blanks( boundsCompute, @@ -1464,7 +1359,6 @@ Stmt LowererImplImperative::lowerForallFusedPosition(Forall forall, Iterator ite set reducedAccesses, ir::Stmt recoveryStmt) { - // std::cout << "1 Stmt LowererImplImperative::lowerForallFusedPosition\n" << std::endl; Expr coordinate = getCoordinateVar(forall.getIndexVar()); Stmt declareCoordinate = Stmt(); if (provGraph.isCoordVariable(forall.getIndexVar())) { @@ -1555,8 +1449,6 @@ Stmt LowererImplImperative::lowerForallFusedPosition(Forall forall, Iterator ite markAssignsAtomicDepth++; } - // std::cout << "original forall : " << forall << std::endl; - // std::cout << "inside IndexStmt: " << forall.getStmt() << std::endl; Stmt body = lowerForallBody(coordinate, forall.getStmt(), locators, inserters, appenders, reducedAccesses); @@ -1614,7 +1506,6 @@ Stmt LowererImplImperative::lowerForallFusedPosition(Forall forall, Iterator ite kind = LoopKind::Runtime; } - // std::cout << "2 Stmt LowererImplImperative::lowerForallFusedPosition\n" << std::endl; // Loop with preamble and postamble return Block::blanks(boundsCompute, Block::make(Block::make(searchForUnderivedStart), @@ -1715,7 +1606,6 @@ Stmt LowererImplImperative::lowerMergePoint(MergeLattice pointLattice, ir::Assign::make(indexSetIter.getCoordVar(), indexSetIter.getPosVar()) ); // Code to 
increment both iterator variables. - std::cout << "some casting stuff happening\n"; auto incr = ir::Block::make( compoundAssign(iter.getIteratorVar(), ir::Cast::make(Eq::make(iter.getCoordVar(), setMatch), iter.getIteratorVar().type())), compoundAssign(indexSetIter.getIteratorVar(), ir::Cast::make(Eq::make(indexSetIter.getCoordVar(), setMatch), indexSetIter.getIteratorVar().type())), @@ -1878,8 +1768,6 @@ Stmt LowererImplImperative::lowerForallBody(Expr coordinate, IndexStmt stmt, vector inserters, vector appenders, const set& reducedAccesses) { - - // std::cout << "lowering a forall body----------------------------------------------------\n"; Stmt initVals = resizeAndInitValues(appenders, reducedAccesses); @@ -1896,7 +1784,6 @@ Stmt LowererImplImperative::lowerForallBody(Expr coordinate, IndexStmt stmt, // Code of loop body statement Stmt body = lower(stmt); - // std::cout << "\nBefore: [" << stmt << "]\nAfter : [" << body << "]\n"; // Code to append coordinates Stmt appendCoords = appendCoordinate(appenders, coordinate); @@ -1914,12 +1801,10 @@ Expr LowererImplImperative::getTemporarySize(Where where) { TensorVar temporary = where.getTemporary(); Dimension temporarySize = temporary.getType().getShape().getDimension(0); Access temporaryAccess = getResultAccesses(where.getProducer()).first[0]; - std::cout << "temporaryAccess: " << temporaryAccess; std::vector indexVars = temporaryAccess.getIndexVars(); if(util::all(indexVars, [&](const IndexVar& var) { return provGraph.isUnderived(var);})) { // All index vars underived then use tensor properties to get tensor size - std::cout << "All index vars underived then use tensor properties to get tensor size\n"; taco_iassert(util::contains(dimensions, indexVars[0])) << "Missing " << indexVars[0]; ir::Expr size = dimensions.at(indexVars[0]); for(size_t i = 1; i < indexVars.size(); ++i) { @@ -1930,19 +1815,16 @@ Expr LowererImplImperative::getTemporarySize(Where where) { } if (temporarySize.isFixed()) { - std::cout << 
"temporary is fixed\n" ; return ir::Literal::make(temporarySize.getSize()); } if (temporarySize.isIndexVarSized()) { - std::cout << "temporary is index var sized\n"; IndexVar var = temporarySize.getIndexVarSize(); vector bounds = provGraph.deriveIterBounds(var, definedIndexVarsOrdered, underivedBounds, indexVarToExprMap, iterators); return ir::Sub::make(bounds[1], bounds[0]); } - std::cout << "should this be an error\n"; taco_ierror; // TODO return Expr(); } @@ -2213,10 +2095,8 @@ vector LowererImplImperative::codeToInitializeTemporaryParallel(Where wher vector LowererImplImperative::codeToInitializeTemporary(Where where) { TensorVar temporary = where.getTemporary(); - cout << "temporary found: " << temporary << std::endl; const bool accelerateDense = canAccelerateDenseTemp(where).first; - cout << "accelerateDense: " << accelerateDense << std::endl; Stmt freeTemporary = Stmt(); Stmt initializeTemporary = Stmt(); @@ -2227,7 +2107,6 @@ vector LowererImplImperative::codeToInitializeTemporary(Where where) { initializeTemporary = Block::make(initializeTemporary, initTempSet); tempToBitGuard[temporary] = tempSet; } else { - cout << "higher order temporary found: " << temporary << std::endl; // TODO: Need to support keeping track of initialized elements for // temporaries that don't have sparse accelerator taco_iassert(!util::contains(guardedTemps, temporary) || accelerateDense); @@ -2245,32 +2124,17 @@ vector LowererImplImperative::codeToInitializeTemporary(Where where) { needComputeValues(where, temporary)) { values = ir::Var::make(temporary.getName(), temporary.getType().getDataType(), true, false); - std::cout << "values: " << values << std::endl; - std::cout << "dataType: " << values.type() << std::endl; - - // taco_iassert(temporary.getType().getOrder() == 1) - // << " Temporary order was " << temporary.getType().getOrder(); // TODO - Expr size = getTemporarySize(where); - std::cout << "temporarySize: " << size << std::endl; - // no decl needed for shared memory 
Stmt decl = Stmt(); if ((isa(where.getProducer()) && inParallelLoopDepth == 0) || !should_use_CUDA_codegen()) { decl = VarDecl::make(values, ir::Literal::make(0)); - std::cout << "decl statement: " << decl << std::endl; } Stmt allocate = Allocate::make(values, size); - std::cout << "allocate stmt: " << allocate << std::endl; freeTemporary = Block::make(freeTemporary, Free::make(values)); - std::cout << "free temp: " << freeTemporary << std::endl; initializeTemporary = Block::make(decl, initializeTemporary, allocate); - std::cout << "initializeTemporary: " << initializeTemporary << std::endl; - - // taco_iassert(temporary.getType().getOrder() == 1) - // << " Temporary order was " << temporary.getType().getOrder(); // TODO } /// Make a struct object that lowerAssignment and lowerAccess can read @@ -2283,7 +2147,6 @@ vector LowererImplImperative::codeToInitializeTemporary(Where where) { } Stmt LowererImplImperative::lowerWhere(Where where) { - // std::cout << "\n--------------------------------------- lowering where statement: " << where << "\n\n\n"; TensorVar temporary = where.getTemporary(); bool accelerateDenseWorkSpace, sortAccelerator; std::tie(accelerateDenseWorkSpace, sortAccelerator) = @@ -2320,7 +2183,6 @@ Stmt LowererImplImperative::lowerWhere(Where where) { }) ); - // std::cout << "\ninitiating lowering of where consumer: " << where.getConsumer() << std::endl; Stmt consumer = lower(where.getConsumer()); if (accelerateDenseWorkSpace && sortAccelerator) { // We need to sort the indices array @@ -2344,13 +2206,11 @@ Stmt LowererImplImperative::lowerWhere(Where where) { true, false); Expr size = getTemporarySize(where); Stmt zeroInit = Store::make(values, p, ir::Literal::zero(temporary.getType().getDataType())); - // std::cout << "Stmt LowererImplImperative::lowerWhere\n"; Stmt loopInit = For::make(p, 0, size, 1, zeroInit, LoopKind::Serial); initializeTemporary = Block::make(initializeTemporary, loopInit); } whereConsumers.push_back(consumer); - // std::cout << 
"\nwhere temporaries: " << where.getTemporary() << std::endl; whereTemps.push_back(where.getTemporary()); captureNextLocatePos = true; @@ -2361,9 +2221,6 @@ Stmt LowererImplImperative::lowerWhere(Where where) { restoreAtomicDepth = true; } - whereTempsWithLoopDepth.insert(std::pair(where.getTemporary(), loopDepth)); - - // std::cout << "\ninitiating lowering of where producer: " << where.getConsumer() << std::endl; Stmt producer = lower(where.getProducer()); if (accelerateDenseWorkSpace) { const Expr indexListSizeExpr = tempToIndexListSize.at(temporary); @@ -2371,8 +2228,6 @@ Stmt LowererImplImperative::lowerWhere(Where where) { initializeTemporary = Block::make(indexListSizeDecl, initializeTemporary); } - whereTempsWithLoopDepth.erase(where.getTemporary()); - if (restoreAtomicDepth) { markAssignsAtomicDepth++; } @@ -2482,7 +2337,6 @@ Stmt LowererImplImperative::lowerAssemble(Assemble assemble) { resultModeOrdering[iter.getMode().getLevel() - 1]); Expr pos = iter.getPosVar(); Stmt initPos = VarDecl::make(pos, iter.locate(locateCoords)[0]); - // std::cout << "Stmt LowererImplImperative::lowerAssemble\n"; insertEdgeLoop = For::make(coords.back(), 0, dim, 1, Block::make(initPos, insertEdgeLoop)); } else { @@ -2520,7 +2374,7 @@ Stmt LowererImplImperative::lowerAssemble(Assemble assemble) { initAssembleStmts.push_back(initValues); } } else if (zeroInit) { - initAssembleStmts.push_back(zeroInitValues(resultTensorVar, 0, prevSize)); // init values + initAssembleStmts.push_back(zeroInitValues(resultTensorVar, 0, prevSize)); } } Stmt initAssemble = Block::make(initAssembleStmts); @@ -2564,7 +2418,6 @@ Stmt LowererImplImperative::lowerMulti(Multi multi) { } Stmt LowererImplImperative::lowerSuchThat(SuchThat suchThat) { - // std::cout << "lowering such that statement\n"; Stmt stmt = lower(suchThat.getStmt()); return Block::make(stmt); } @@ -2678,7 +2531,6 @@ Expr LowererImplImperative::lowerSqrt(Sqrt sqrt) { Expr LowererImplImperative::lowerCast(Cast cast) { - std::cout << 
"casting: " << cast.getA() << ", dataType: " << cast.getDataType() << std::endl; return ir::Cast::make(lower(cast.getA()), cast.getDataType()); } @@ -3046,7 +2898,7 @@ Stmt LowererImplImperative::initResultArrays(IndexVar var, vector writes util::contains(reducedAccesses, write)) { // Zero-initialize values array if might not assign to every element // in values array during compute - result.push_back(zeroInitValues(tensor, resultParentPos, stride)); // init values + result.push_back(zeroInitValues(tensor, resultParentPos, stride)); } } } @@ -3093,7 +2945,6 @@ Stmt LowererImplImperative::resizeAndInitValues(const std::vector& app Stmt LowererImplImperative::zeroInitValues(Expr tensor, Expr begin, Expr size) { - // std::cout << "1 Stmt LowererImplImperative::zeroInitValues\n"; Expr lower = simplify(ir::Mul::make(begin, size)); Expr upper = simplify(ir::Mul::make(ir::Add::make(begin, 1), size)); Expr p = Var::make("p" + util::toString(tensor), Int()); @@ -3106,11 +2957,6 @@ Stmt LowererImplImperative::zeroInitValues(Expr tensor, Expr begin, Expr size) { return ir::VarDecl::make(ir::Var::make("status", Int()), ir::Call::make("cudaMemset", {values, ir::Literal::make(0, Int()), ir::Mul::make(ir::Sub::make(upper, lower), ir::Literal::make(values.type().getNumBytes()))}, Int())); } - // std::cout << "2 Stmt LowererImplImperative::zeroInitValues\n"; - // if generating ispc code, we will keep the LoopKind as Init so that we can initializa it if tasks are used - if (should_use_ISPC_codegen()) { - return For::make(p, lower, upper, 1, zeroInit, LoopKind::Init); - } return For::make(p, lower, upper, 1, zeroInit, parallel); } diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index f4d848de0..02464ce26 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -11,7 +11,6 @@ add_executable(taco-test ${TEST_SOURCES} ${TEST_HEADERS}) target_link_libraries(taco-test taco-gtest) target_link_libraries(taco-test pthread) target_link_libraries(taco-test taco) 
-target_link_libraries(taco-test papi) if(${CMAKE_VERSION} VERSION_LESS "3.9.0") add_test(NAME taco-test COMMAND taco-test) diff --git a/tools/taco.cpp b/tools/taco.cpp index 7384874ec..1c22fc368 100644 --- a/tools/taco.cpp +++ b/tools/taco.cpp @@ -21,7 +21,6 @@ #include "taco/lower/lower.h" #include "taco/codegen/module.h" #include "codegen/codegen_c.h" -#include "codegen/codegen_ispc.h" #include "codegen/codegen_cuda.h" #include "codegen/codegen.h" #include "taco/util/strings.h" @@ -190,8 +189,6 @@ static void printUsageInfo() { cout << endl; printFlag("print-nocolor", "Print without colors."); cout << endl; - printFlag("ispc", "Generate ISPC code for Intel CPUs"); - cout << endl; printFlag("cuda", "Generate CUDA code for NVIDIA GPUs"); cout << endl; printFlag("schedule", "Specify parallel execution schedule"); @@ -266,7 +263,7 @@ static void printSchedulingHelp() { "an output race strategy `strat`. Since the other transformations " "expect serial code, parallelize must come last in a series of " "transformations. Possible parallel hardware units are: " - "NotParallel, GPUBlock, GPUWarp, GPUThread, CPUThread, CPUVector, CPUSimd, CPUSimd. " + "NotParallel, GPUBlock, GPUWarp, GPUThread, CPUThread, CPUVector. " "Possible output race strategies are: " "IgnoreRaces, NoRaces, Atomics, Temporary, ParallelReduction."); } @@ -283,8 +280,6 @@ static void printVersionInfo() { cout << "Built with Python support." << endl; if(TACO_FEATURE_CUDA) cout << "Built with CUDA support." << endl; - if(TACO_FEATURE_ISPC) - cout << "Built with ISPC support." 
<< endl; cout << endl; cout << "Built on: " << TACO_BUILD_DATE << endl; cout << "CMake build type: " << TACO_BUILD_TYPE << endl; @@ -317,7 +312,6 @@ static void printCommandLine(ostream& os, int argc, char* argv[]) { static int setSchedulingCommands(vector> scheduleCommands, parser::Parser& parser, IndexStmt& stmt, Assignment assignment) { - std::cout << "setting scheduling commands\n"; auto findVar = [&stmt](string name) { ProvenanceGraph graph(stmt); for (auto v : graph.getAllIndexVars()) { @@ -330,15 +324,9 @@ static int setSchedulingCommands(vector> scheduleCommands, abort(); // to silence a warning: control reaches end of non-void function }; - int isGPU = 0; - int isISPC = 0; + bool isGPU = false; for(vector scheduleCommand : scheduleCommands) { - std::cout << "running schedluing command: "; - for (auto &command : scheduleCommand) { - std::cout << command << " "; - } - std::cout << std::endl; string command = scheduleCommand[0]; scheduleCommand.erase(scheduleCommand.begin()); @@ -561,13 +549,6 @@ static int setSchedulingCommands(vector> scheduleCommands, parallel_unit = ParallelUnit::CPUThread; } else if (unit == "CPUVector") { parallel_unit = ParallelUnit::CPUVector; - } else if (unit == "CPUSimd") { - isISPC = true; - parallel_unit = ParallelUnit::CPUSimd; - } - else if (unit == "CPUSpmd") { - parallel_unit = ParallelUnit::CPUSpmd; - isISPC = true; } else { taco_uerror << "Parallel hardware not defined."; @@ -590,8 +571,6 @@ static int setSchedulingCommands(vector> scheduleCommands, goto end; } - std::cout << "stmt before parallelizing the statement: " << stmt << endl; - std::cout << "ParallelUnit: " << ParallelUnit_NAMES[(int) parallel_unit] << ", outputRaceStrategy: " << OutputRaceStrategy_NAMES[(int) output_race_strategy] << std::endl; stmt = stmt.parallelize(findVar(i), parallel_unit, output_race_strategy); } else if (command == "assemble") { @@ -647,13 +626,7 @@ static int setSchedulingCommands(vector> scheduleCommands, end:; } - if (isGPU) { - return 
1; - } - else if (isISPC) { - return 2; - } - return 0; + return isGPU; } int main(int argc, char* argv[]) { @@ -682,7 +655,6 @@ int main(int argc, char* argv[]) { bool color = true; bool readKernels = false; bool cuda = false; - bool ispc = false; bool setSchedule = false; @@ -991,10 +963,6 @@ int main(int argc, char* argv[]) { else if ("-cuda" == argName) { cuda = true; } - else if ("-ispc" == argName) { - std::cout << "ispc true\n"; - ispc = true; - } else if ("-schedule" == argName) { vector descriptor = util::split(argValue, ","); if (descriptor.size() > 2 || descriptor.empty()) { @@ -1047,8 +1015,6 @@ int main(int argc, char* argv[]) { } } - std::cout << "cuda: " << cuda << ", ispc: " << ispc << std::endl; - // Print compute is the default if nothing else was asked for if (!printAssemble && !printEvaluate && !printIterationGraph && !writeCompute && !writeAssemble && !writeKernels && !readKernels && @@ -1176,10 +1142,7 @@ int main(int argc, char* argv[]) { std::cout << "topologically reordered loops statement: " << stmt << std::endl; if (setSchedule) { - int val = setSchedulingCommands(scheduleCommands, parser, stmt, tensor.getAssignment()); - // stmt = loopFusionOverFission(stmt, tensor.getAssignment()); - cuda |= (val==1); - ispc |= (val==2); + cuda |= setSchedulingCommands(scheduleCommands, parser, stmt, tensor.getAssignment()); } else { // stmt = loopFusionOverFission(stmt, tensor.getAssignment()); @@ -1194,18 +1157,9 @@ int main(int argc, char* argv[]) { return reportError("TACO must be built for CUDA (cmake -DCUDA=ON ..) to benchmark", 2); } set_CUDA_codegen_enabled(true); - set_ISPC_codegen_enabled(false); - } - else if (ispc) { - if (!ISPC_BUILT && benchmark) { - return reportError("TACO must be built for ISPC (cmake -DISPC=ON .. 
to benchmark", 2); - } - set_CUDA_codegen_enabled(false); - set_ISPC_codegen_enabled(true); } else { set_CUDA_codegen_enabled(false); - set_ISPC_codegen_enabled(false); } std::cout << "running scalar promote\n" << std::endl; // @@ -1216,7 +1170,6 @@ int main(int argc, char* argv[]) { cout << stmt << endl; } - // lower index statement to ir statement Kernel kernel; if (benchmark) { if (time) cout << endl; @@ -1299,15 +1252,9 @@ int main(int argc, char* argv[]) { } } else { - std::cout << "lowering stmt: " << stmt << std::endl; compute = lower(stmt, prefix+"compute", computeWithAssemble, true); assemble = lower(stmt, prefix+"assemble", true, false); evaluate = lower(stmt, prefix+"evaluate", true, true); - - std::cout << "\n\ncompute kernel\n------------\n" << compute << std::endl << std::endl; - // compute kernel is the most basic kernel after lowering phase - - std::cout << "\n\nevaluate kernel\n------------\n" << evaluate << std::endl << std::endl; } string packComment = @@ -1362,7 +1309,6 @@ int main(int argc, char* argv[]) { } bool hasPrinted = false; - std::shared_ptr codegen = ir::CodeGen::init_default(cout, ir::CodeGen::ImplementationGen); codegen->setColor(color); if (printAssemble) { @@ -1383,7 +1329,6 @@ int main(int argc, char* argv[]) { } if (compute.defined()) { - std::cout << "Code generation\n"; codegen->compile(compute, false); } else { @@ -1441,7 +1386,7 @@ int main(int argc, char* argv[]) { } IterationGraph iterationGraph; - if (printIterationGraph) { // print iteration graph + if (printIterationGraph) { iterationGraph = IterationGraph::make(tensor.getAssignment()); } From 30c2ecf4460e887d6975d93404a5acfa26ef0a7e Mon Sep 17 00:00:00 2001 From: Adhhitha Dias Date: Wed, 11 May 2022 17:22:24 -0400 Subject: [PATCH 14/16] remove ispc related files --- include/taco/codegen/module.h | 1 - include/taco/cuda.h | 12 - src/codegen/codegen.cpp | 7 - src/codegen/codegen_ispc.cpp | 1097 ------------------------ src/codegen/codegen_ispc.h | 68 -- 
src/codegen/module.cpp | 66 +- src/cuda.cpp | 19 - src/index_notation/transformations.cpp | 11 - src/ir/ir_printer.cpp | 807 ++++++----------- src/tensor.cpp | 1 - 10 files changed, 246 insertions(+), 1843 deletions(-) delete mode 100644 src/codegen/codegen_ispc.cpp delete mode 100644 src/codegen/codegen_ispc.h diff --git a/include/taco/codegen/module.h b/include/taco/codegen/module.h index 4db5fcdaf..44431ef46 100644 --- a/include/taco/codegen/module.h +++ b/include/taco/codegen/module.h @@ -77,7 +77,6 @@ class Module { private: std::stringstream source; - std::stringstream additional_source; std::stringstream header; std::string libname; std::string tmpdir; diff --git a/include/taco/cuda.h b/include/taco/cuda.h index 9c4a7aae9..aad6b5229 100644 --- a/include/taco/cuda.h +++ b/include/taco/cuda.h @@ -9,19 +9,7 @@ #define CUDA_BUILT false #endif -#ifndef ISPC_BUILT - #define ISPC_BUILT false -#endif - namespace taco { - -/// Functions used by taco to interface with ISPC -bool should_use_ISPC_codegen(); -void set_ISPC_codegen_enabled(bool enabled); -bool is_ISPC_code_stream_enabled(); -void set_ISPC_code_stream_enabled(bool enabled); - - /// Functions used by taco to interface with CUDA (especially unified memory) /// Check if should use CUDA codegen bool should_use_CUDA_codegen(); diff --git a/src/codegen/codegen.cpp b/src/codegen/codegen.cpp index 6ec54a2f8..696d4971a 100644 --- a/src/codegen/codegen.cpp +++ b/src/codegen/codegen.cpp @@ -2,7 +2,6 @@ #include "taco/cuda.h" #include "codegen_cuda.h" #include "codegen_c.h" -#include "codegen_ispc.h" #include #include @@ -27,9 +26,6 @@ shared_ptr CodeGen::init_default(std::ostream &dest, OutputKind outputK if (should_use_CUDA_codegen()) { return make_shared(dest, outputKind); } - else if (should_use_ISPC_codegen()) { - return make_shared(dest, outputKind); - } else { return make_shared(dest, outputKind); } @@ -39,9 +35,6 @@ shared_ptr CodeGen::init_default(std::ostream &dest, std::ostream &dest if 
(should_use_CUDA_codegen()) { return make_shared(dest, outputKind); } - else if (should_use_ISPC_codegen()) { - return make_shared(dest, dest2, outputKind); - } else { return make_shared(dest, outputKind); } diff --git a/src/codegen/codegen_ispc.cpp b/src/codegen/codegen_ispc.cpp deleted file mode 100644 index d4f428ccf..000000000 --- a/src/codegen/codegen_ispc.cpp +++ /dev/null @@ -1,1097 +0,0 @@ -#include -#include -#include -#include -#include -#include - -#include "taco/cuda.h" -#include "taco/ir/ir_printer.h" -#include "taco/ir/ir_visitor.h" -#include "taco/ir/ir_rewriter.h" -#include "taco/ir/simplify.h" - -#include "codegen_c.h" -#include "codegen_ispc.h" -#include "taco/error.h" -#include "taco/util/strings.h" -#include "taco/util/collections.h" - -using namespace std; - -namespace taco { -namespace ir { - -// Some helper functions -namespace { - -// Include stdio.h for printf -// stdlib.h for malloc/realloc -// math.h for sqrt -// MIN preprocessor macro -// This *must* be kept in sync with taco_tensor_t.h -const string cHeaders = - "#ifndef TACO_C_HEADERS\n" - "#define TACO_C_HEADERS\n" - "#include \n" - "#include \n" - "#include \n" - "#include \n" - "#include \n" - "#include \n" - "#include \n" - "#if _OPENMP\n" - "#include \n" - "#endif\n" - "#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b))\n" - "#define TACO_MAX(_a,_b) ((_a) > (_b) ? 
(_a) : (_b))\n" - "#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a)\n" - "#ifndef TACO_TENSOR_T_DEFINED\n" - "#define TACO_TENSOR_T_DEFINED\n" - "typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t;\n" - "typedef struct {\n" - " int32_t order; // tensor order (number of modes)\n" - " int32_t* dimensions; // tensor dimensions\n" - " int32_t csize; // component size\n" - " int32_t* mode_ordering; // mode storage ordering\n" - " taco_mode_t* mode_types; // mode storage types\n" - " uint8_t*** indices; // tensor index data (per mode)\n" - " uint8_t* vals; // tensor values\n" - " int32_t vals_size; // values array size\n" - "} taco_tensor_t;\n" - "#endif\n" - "#if !_OPENMP\n" - "int omp_get_thread_num() { return 0; }\n" - "int omp_get_max_threads() { return 1; }\n" - "#endif\n" - "int cmp(const void *a, const void *b) {\n" - " return *((const int*)a) - *((const int*)b);\n" - "}\n" - "int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) {\n" - " if (array[arrayStart] >= target) {\n" - " return arrayStart;\n" - " }\n" - " int lowerBound = arrayStart; // always < target\n" - " int upperBound = arrayEnd; // always >= target\n" - " while (upperBound - lowerBound > 1) {\n" - " int mid = (upperBound + lowerBound) / 2;\n" - " int midValue = array[mid];\n" - " if (midValue < target) {\n" - " lowerBound = mid;\n" - " }\n" - " else if (midValue > target) {\n" - " upperBound = mid;\n" - " }\n" - " else {\n" - " return mid;\n" - " }\n" - " }\n" - " return upperBound;\n" - "}\n" - "int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) {\n" - " if (array[arrayEnd] <= target) {\n" - " return arrayEnd;\n" - " }\n" - " int lowerBound = arrayStart; // always <= target\n" - " int upperBound = arrayEnd; // always > target\n" - " while (upperBound - lowerBound > 1) {\n" - " int mid = (upperBound + lowerBound) / 2;\n" - " int midValue = array[mid];\n" - " if (midValue < target) {\n" - " lowerBound = mid;\n" - " }\n" - 
" else if (midValue > target) {\n" - " upperBound = mid;\n" - " }\n" - " else {\n" - " return mid;\n" - " }\n" - " }\n" - " return lowerBound;\n" - "}\n" - "taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize,\n" - " int32_t* dimensions, int32_t* mode_ordering,\n" - " taco_mode_t* mode_types) {\n" - " taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t));\n" - " t->order = order;\n" - " t->dimensions = (int32_t *) malloc(order * sizeof(int32_t));\n" - " t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t));\n" - " t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t));\n" - " t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***));\n" - " t->csize = csize;\n" - " for (int32_t i = 0; i < order; i++) {\n" - " t->dimensions[i] = dimensions[i];\n" - " t->mode_ordering[i] = mode_ordering[i];\n" - " t->mode_types[i] = mode_types[i];\n" - " switch (t->mode_types[i]) {\n" - " case taco_mode_dense:\n" - " t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **));\n" - " break;\n" - " case taco_mode_sparse:\n" - " t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **));\n" - " break;\n" - " }\n" - " }\n" - " return t;\n" - "}\n" - "void deinit_taco_tensor_t(taco_tensor_t* t) {\n" - " for (int i = 0; i < t->order; i++) {\n" - " free(t->indices[i]);\n" - " }\n" - " free(t->indices);\n" - " free(t->dimensions);\n" - " free(t->mode_ordering);\n" - " free(t->mode_types);\n" - " free(t);\n" - "}\n" - "#endif\n"; - -const string ispcHeaders = - "#define __TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b))\n" - "#define __TACO_MAX(_a,_b) ((_a) > (_b) ? 
(_a) : (_b))\n" - "#define __TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a)\n" - "int __cmp(const void *a, const void *b) {\n" - " return *((const int*)a) - *((const int*)b);\n" - "}\n" - "int __taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) {\n" - " if (array[arrayStart] >= target) {\n" - " return arrayStart;\n" - " }\n" - " int lowerBound = arrayStart; // always < target\n" - " int upperBound = arrayEnd; // always >= target\n" - " while (upperBound - lowerBound > 1) {\n" - " int mid = (upperBound + lowerBound) / 2;\n" - " int midValue = array[mid];\n" - " if (midValue < target) {\n" - " lowerBound = mid;\n" - " }\n" - " else if (midValue > target) {\n" - " upperBound = mid;\n" - " }\n" - " else {\n" - " return mid;\n" - " }\n" - " }\n" - " return upperBound;\n" - "}\n" - "int __taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) {\n" - " if (array[arrayEnd] <= target) {\n" - " return arrayEnd;\n" - " }\n" - " int lowerBound = arrayStart; // always <= target\n" - " int upperBound = arrayEnd; // always > target\n" - " while (upperBound - lowerBound > 1) {\n" - " int mid = (upperBound + lowerBound) / 2;\n" - " int midValue = array[mid];\n" - " if (midValue < target) {\n" - " lowerBound = mid;\n" - " }\n" - " else if (midValue > target) {\n" - " upperBound = mid;\n" - " }\n" - " else {\n" - " return mid;\n" - " }\n" - " }\n" - " return lowerBound;\n" - "}\n\n\n"; - -} // anonymous namespace - - - -// find variables for generating declarations -// generates a single var for each GetProperty -class CodeGen_ISPC::FindVars : public IRVisitor { -public: - map varMap; - - // the variables for which we need to add declarations - map varDecls; - - vector localVars; - - // this maps from tensor, property, mode, index to the unique var - map, string> canonicalPropertyVar; - - // this is for convenience, recording just the properties unpacked - // from the output tensor so we can re-save them at the end - map, string> 
outputProperties; - - // TODO: should replace this with an unordered set - vector outputTensors; - vector inputTensors; - - CodeGen_ISPC *codeGen; - - // copy inputs and outputs into the map - FindVars(vector inputs, vector outputs, CodeGen_ISPC *codeGen) - : codeGen(codeGen) { - for (auto v: inputs) { - auto var = v.as(); - taco_iassert(var) << "Inputs must be vars in codegen"; - taco_iassert(varMap.count(var)==0) << "Duplicate input found in codegen"; - inputTensors.push_back(v); - varMap[var] = var->name; - } - for (auto v: outputs) { - auto var = v.as(); - taco_iassert(var) << "Outputs must be vars in codegen"; - taco_iassert(varMap.count(var)==0) << "Duplicate output found in codegen"; - outputTensors.push_back(v); - varMap[var] = var->name; - } - } - -protected: - using IRVisitor::visit; - - virtual void visit(const Var *op) { - if (varMap.count(op) == 0) { - varMap[op] = op->is_ptr? op->name : codeGen->genUniqueName(op->name); - } - } - - virtual void visit(const VarDecl *op) { - if (!util::contains(localVars, op->var)) { - localVars.push_back(op->var); - } - op->var.accept(this); - op->rhs.accept(this); - } - - virtual void visit(const For *op) { - if (!util::contains(localVars, op->var)) { - localVars.push_back(op->var); - } - op->var.accept(this); - op->start.accept(this); - op->end.accept(this); - op->increment.accept(this); - op->contents.accept(this); - } - - virtual void visit(const GetProperty *op) { - if (!util::contains(inputTensors, op->tensor) && - !util::contains(outputTensors, op->tensor)) { - // Don't create header unpacking code for temporaries - return; - } - - if (varMap.count(op) == 0) { - auto key = - tuple(op->tensor,op->property, - (size_t)op->mode, - (size_t)op->index); - if (canonicalPropertyVar.count(key) > 0) { - varMap[op] = canonicalPropertyVar[key]; - } else { - auto unique_name = codeGen->genUniqueName(op->name); - canonicalPropertyVar[key] = unique_name; - varMap[op] = unique_name; - varDecls[op] = unique_name; - if 
(util::contains(outputTensors, op->tensor)) { - outputProperties[key] = unique_name; - } - } - } - } -}; - - -// Finds all for loops tagged with accelerator and adds statements to deviceFunctions -// Also tracks scope of when device function is called and -// tracks which variables must be passed to function. -class CodeGen_ISPC::FunctionCollector : public IRVisitor { -public: - vector threadFors; // contents is device function - vector initFors; // for loops to initialize statements - map scopeMap; - - // the variables to pass to each device function - vector>> functionParameters; - vector> currentParameters; // keep as vector so code generation is deterministic - set currentParameterSet; - - set variablesDeclaredInKernel; - - vector> threadIDVars; - vector> blockIDVars; - vector> warpIDVars; - vector numThreads; - vector numWarps; - - CodeGen_ISPC *codeGen; - // copy inputs and outputs into the map - FunctionCollector(vector inputs, vector outputs, CodeGen_ISPC *codeGen) : codeGen(codeGen) { - inDeviceFunction = false; - for (auto v: inputs) { - auto var = v.as(); - taco_iassert(var) << "Inputs must be vars in codegen"; - taco_iassert(scopeMap.count(var) == 0) << - "Duplicate input found in codegen"; - scopeMap[var] = var->name; - } - for (auto v: outputs) { - auto var = v.as(); - taco_iassert(var) << "Outputs must be vars in codegen"; - taco_iassert(scopeMap.count(var) == 0) << - "Duplicate output found in codegen"; - - scopeMap[var] = var->name; - } - } - -protected: - bool inDeviceFunction; - using IRVisitor::visit; - - virtual void visit(const For *op) { - if (op->parallel_unit == ParallelUnit::CPUSpmd) { - std::cout << "ParallelUnit::CPUSpmd directive found\n"; - - inDeviceFunction = false; - op->var.accept(this); - inDeviceFunction = true; - - threadFors.push_back(op); - std::cout << "scopeMap: [" << scopeMap[op->var] << "], varExpr: [" << op->var << "]\n"; - threadIDVars.push_back(pair(scopeMap[op->var], op->var)); - Expr blockSize = 
ir::simplify(ir::Div::make(ir::Sub::make(op->end, op->start), op->increment)); - numThreads.push_back(blockSize); - - } - else if (op->parallel_unit == ParallelUnit::CPUSimd) { - std::cout << "************************************************************************** CPUSimd For node\n"; - } - else if (op->kind == LoopKind::Init) { - std::cout << "************************************************************************* Init loop kind found\n"; - initFors.push_back(op); - } - else{ - op->var.accept(this); - } - op->start.accept(this); - op->end.accept(this); - op->increment.accept(this); - op->contents.accept(this); - } - - virtual void visit(const Var *op) { - if (scopeMap.count(op) == 0) { - string name = codeGen->genUniqueName(op->name); - if (!inDeviceFunction) { - scopeMap[op] = name; - } - } - else if (scopeMap.count(op) == 1 && inDeviceFunction && currentParameterSet.count(op) == 0 - && (threadIDVars.empty() || op != threadIDVars.back().second) - && !variablesDeclaredInKernel.count(op)) { - currentParameters.push_back(pair(scopeMap[op], op)); - currentParameterSet.insert(op); - } - } - - virtual void visit(const VarDecl *op) { - if (inDeviceFunction) { - variablesDeclaredInKernel.insert(op->var); - } - op->var.accept(this); - op->rhs.accept(this); - } - - virtual void visit(const GetProperty *op) { - if (scopeMap.count(op->tensor) == 0 && !inDeviceFunction) { - auto key = - tuple(op->tensor,op->property, - (size_t)op->mode, - (size_t)op->index); - auto unique_name = codeGen->genUniqueName(op->name); - scopeMap[op->tensor] = unique_name; - } - else if (scopeMap.count(op->tensor) == 1 && inDeviceFunction && currentParameterSet.count(op->tensor) == 0) { - currentParameters.push_back(pair(op->tensor.as()->name, op->tensor)); - currentParameterSet.insert(op->tensor); - } - } -}; - - -CodeGen_ISPC::CodeGen_ISPC(std::ostream &dest, OutputKind outputKind, bool simplify) - : CodeGen_C(dest, dest, outputKind, simplify) {} - -CodeGen_ISPC::CodeGen_ISPC(std::ostream 
&dest, std::ostream &dest2, OutputKind outputKind, bool simplify) - : CodeGen_C(dest, dest2, outputKind, simplify) {} - -CodeGen_ISPC::~CodeGen_ISPC() {} - -void CodeGen_ISPC::compile(Stmt stmt, bool isFirst) { - varMap = {}; - localVars = {}; - - if (isFirst) { - // output the headers - out << cHeaders; - - if (&out != &out2) { - out2 << ispcHeaders; - } - } - out << endl; - // generate code for the Stmt - std::cout << "Compiling the code\n"; - stmt.accept(this); -} - - - -string CodeGen_ISPC::printCallISPCFunc(const std::string& funcName, map varMap, - vector &sortedProps) { - std::stringstream ret; - ret << " "; - unordered_set propsAlreadyGenerated; - - ret << "__" << funcName << "("; - - - for (unsigned long i=0; i < sortedProps.size(); i++) { - ret << varMap[sortedProps[i]]; - if (i != sortedProps.size()-1) { - ret << ", "; - } - propsAlreadyGenerated.insert(varMap[sortedProps[i]]); - } - - ret << ");\n"; - return ret.str(); -} - -// varMap is already sorted <- make sure to pass the sorted varMap -void CodeGen_ISPC::printISPCFunc(const Function *func, map varMap, - vector &sortedProps) { - - FunctionCollector functionCollector(func->inputs, func->outputs, this); - func->body.accept(&functionCollector); - - vector inputs = func->inputs; - vector outputs = func->outputs; - unordered_set propsAlreadyGenerated; - - for (unsigned long i=0; i < sortedProps.size(); i++) { - auto prop = sortedProps[i]; - bool isOutputProp = (find(outputs.begin(), outputs.end(), - prop->tensor) != outputs.end()); - - auto var = prop->tensor.as(); - if (var->is_parameter) { - if (isOutputProp) { - funcVariables << " " << printTensorProperty(varMap[prop], prop, false) << ";" << endl; - } else { - break; - } - } else { - funcVariables << getUnpackedTensorArgument(varMap[prop], prop, isOutputProp); - } - propsAlreadyGenerated.insert(varMap[prop]); - - if (i!=sortedProps.size()-1) { - funcVariables << ", "; - } - if (i%2==0) { - funcVariables << "\n\t"; - } - } - - 
resetUniqueNameCounters(); - - // threadFors code generation - for (size_t i = 0; i < functionCollector.threadFors.size(); i++) { - - const For *threadloop = to(functionCollector.threadFors[i]); - taco_iassert(threadloop->parallel_unit == ParallelUnit::CPUSpmd); - Stmt function = threadloop->contents; - std::cout << "threadloop function: " << function << std::endl; - - out2 << "\nstatic task void __" << func->name << "__ ("; - out2 << funcVariables.str(); - out2 << "\n) {\n\n"; - - indent++; - // output body of the threadloop - taskCode = true; - print(threadloop); - indent--; - out2 << "}\n\n"; - - } - - taskCode = false; - out2 << "export void __" << func->name << " ("; - out2 << funcVariables.str(); - out2 << "\n) {\n\n"; - - indent++; - // output body - print(func->body); - indent--; - out2 << "}\n"; - -} - -void CodeGen_ISPC::sendToStream(std::stringstream &stream) { - if (is_ISPC_code_stream_enabled()) { - this->out2 << stream.str(); - } - else { - CodeGen_C::sendToStream(stream); - } -} - -void CodeGen_ISPC::visit(const Function* func) { - set_ISPC_code_stream_enabled(false); - - // if generating a header, protect the function declaration with a guard - if (func->name == "assemble") { - if (outputKind == HeaderGen) { - out << "#ifndef TACO_GENERATED_" << func->name << "\n"; - out << "#define TACO_GENERATED_" << func->name << "\n"; - } - - int numYields = countYields(func); - emittingCoroutine = (numYields > 0); - funcName = func->name; - labelCount = 0; - - resetUniqueNameCounters(); - FindVars inputVarFinder(func->inputs, {}, this); - func->body.accept(&inputVarFinder); - FindVars outputVarFinder({}, func->outputs, this); - func->body.accept(&outputVarFinder); - - // output function declaration - doIndent(); - out << printFuncName(func, inputVarFinder.varDecls, outputVarFinder.varDecls); - - // if we're just generating a header, this is all we need to do - if (outputKind == HeaderGen) { - out << ";\n"; - out << "#endif\n"; - return; - } - - out << " {\n"; - 
- indent++; - - // find all the vars that are not inputs or outputs and declare them - resetUniqueNameCounters(); - FindVars varFinder(func->inputs, func->outputs, this); - func->body.accept(&varFinder); - varMap = varFinder.varMap; - localVars = varFinder.localVars; - - // Print variable declarations - out << printDecls(varFinder.varDecls, func->inputs, func->outputs) << endl; - - if (emittingCoroutine) { - out << printContextDeclAndInit(varMap, localVars, numYields, func->name) - << endl; - } - - // output body - print(func->body); - - // output repack only if we allocated memory - if (checkForAlloc(func)) - out << endl << printPack(varFinder.outputProperties, func->outputs); - - if (emittingCoroutine) { - out << printCoroutineFinish(numYields, funcName); - } - - doIndent(); - out << "return 0;\n"; - indent--; - - doIndent(); - out << "}\n"; - return; - - } - - - if (outputKind == HeaderGen) { - out << "#ifndef TACO_GENERATED_" << func->name << "\n"; - out << "#define TACO_GENERATED_" << func->name << "\n"; - } - - int numYields = countYields(func); - emittingCoroutine = (numYields > 0); - funcName = func->name; - labelCount = 0; - - resetUniqueNameCounters(); - FindVars inputVarFinder(func->inputs, {}, this); - func->body.accept(&inputVarFinder); - FindVars outputVarFinder({}, func->outputs, this); - func->body.accept(&outputVarFinder); - - // output function declaration - doIndent(); - out << printFuncName(func, inputVarFinder.varDecls, outputVarFinder.varDecls); - - // if we're just generating a header, this is all we need to do - if (outputKind == HeaderGen) { - out << ";\n"; - out << "#endif\n"; - return; - } - - out << " {\n"; - - indent++; - - // find all the vars that are not inputs or outputs and declare them - resetUniqueNameCounters(); - FindVars varFinder(func->inputs, func->outputs, this); - func->body.accept(&varFinder); - varMap = varFinder.varMap; - localVars = varFinder.localVars; - - // Print variable declarations - out << 
printDecls(varFinder.varDecls, func->inputs, func->outputs) << endl; - - sortedProps = {}; - vector inputs = func->inputs; - vector outputs = func->outputs; - getSortedProps(varFinder.varDecls, sortedProps, inputs, outputs); - out << printCallISPCFunc(func->name, varFinder.varDecls, sortedProps); - - if (emittingCoroutine) { - out << printContextDeclAndInit(varMap, localVars, numYields, func->name) - << endl; - } - - // output repack only if we allocated memory - if (checkForAlloc(func)) - out << endl << printPack(varFinder.outputProperties, func->outputs); - - if (emittingCoroutine) { - out << printCoroutineFinish(numYields, funcName); - } - - doIndent(); - out << "return 0;\n"; - indent--; - - doIndent(); - out << "}\n\n"; - - set_ISPC_code_stream_enabled(true); - printISPCFunc(func, varFinder.varDecls, sortedProps); - set_ISPC_code_stream_enabled(false); - -} - -void CodeGen_ISPC::visit(const VarDecl* op) { - // std::stringstream stream; - if (is_ISPC_code_stream_enabled()) { - if (emittingCoroutine) { - doIndent(); - op->var.accept(this); - parentPrecedence = Precedence::TOP; - stream2 << " = "; - op->rhs.accept(this); - stream2 << ";"; - stream2 << endl; - } else { - IRPrinter::visit(op); - } - } - else { - CodeGen_C::visit(op); - } - - // sendToStream(stream); -} - -void CodeGen_ISPC::visit(const Yield* op) { - printYield(op, localVars, varMap, labelCount, funcName); -} - -// For Vars, we replace their names with the generated name, -// since we match by reference (not name) -void CodeGen_ISPC::visit(const Var* op) { - if (is_ISPC_code_stream_enabled()) { - taco_iassert(varMap.count(op) > 0) << - "Var " << op->name << " not found in varMap"; - if (emittingCoroutine) { - // out << "TACO_DEREF("; - } - out2 << varMap[op]; - if (emittingCoroutine) { - // out << ")"; - } - } - else { - CodeGen_C::visit(op); - } -} - -static string genVectorizePragma(int width) { - stringstream ret; - ret << "#pragma clang loop interleave(enable) "; - if (!width) - ret << 
"vectorize(enable)"; - else - ret << "vectorize_width(" << width << ")"; - - return ret.str(); -} - -// static string getParallelizePragma(LoopKind kind) { -// stringstream ret; -// ret << "#pragma omp parallel for schedule"; -// switch (kind) { -// case LoopKind::Static: -// ret << "(static, 1)"; -// break; -// case LoopKind::Dynamic: -// ret << "(dynamic, 1)"; -// break; -// case LoopKind::Runtime: -// ret << "(runtime)"; -// break; -// case LoopKind::Static_Chunked: -// ret << "(static)"; -// break; -// default: -// break; -// } -// return ret.str(); -// } - -// static string getUnrollPragma(size_t unrollFactor) { -// return "#pragma unroll " + std::to_string(unrollFactor); -// } - -static string getAtomicPragma() { - return "#pragma omp atomic"; -} - -// The next two need to output the correct pragmas depending -// on the loop kind (Serial, Static, Dynamic, Vectorized) -// -// Docs for vectorization pragmas: -// http://clang.llvm.org/docs/LanguageExtensions.html#extensions-for-loop-hint-optimizations -void CodeGen_ISPC::visit(const For* op) { - if (!is_ISPC_code_stream_enabled()) { - CodeGen_C::visit(op); - return; - } - doIndent(); - - if (op->kind == LoopKind::Mul_Thread) { - if (!taskCode) { - out2 << "launch[4] " << printCallISPCFunc(funcName+"__", varMap, sortedProps) << "\n"; - return; - } - stream2 << "uniform unsigned int chunk_size = ("; - op->end.accept(this); - stream2 << " - "; - op->start.accept(this); - stream2 << ") / taskCount;\n"; - stream2 << " uniform unsigned int modulo = ("; - op->end.accept(this); - stream2 << " - "; - op->start.accept(this); - stream2 << ") % taskCount;\n"; - - stream2 << " uniform unsigned int start = "; - op->start.accept(this); - stream2 << " + chunk_size * taskIndex;\n"; - - stream2 << " if (taskIndex != 0) {\n"; - stream2 << " start += modulo;\n"; - stream2 << " }\n"; - - stream2 << " uniform unsigned int end = start + chunk_size;\n"; - stream2 << " if (taskIndex == 0) {\n"; - stream2 << " end += modulo;\n"; - 
stream2 << " }\n\n"; - - stream2 << keywordString(" for") << " ("; - if (!emittingCoroutine) { - if (op->var.type() == Int32) { - stream2 << "int32 "; - } - else if (op->var.type() == Int64) { - stream2 << "int64 "; - } - - } - op->var.accept(this); - stream2 << " = "; - stream2 << "start"; - // op->start.accept(this); - stream2 << keywordString("; "); - op->var.accept(this); - stream2 << " < "; - parentPrecedence = BOTTOM; - stream2 << "end"; - // op->end.accept(this); - stream2 << keywordString("; "); - op->var.accept(this); - - auto lit = op->increment.as(); - if (lit != nullptr && ((lit->type.isInt() && lit->equalsScalar(1)) || - (lit->type.isUInt() && lit->equalsScalar(1)))) { - stream2 << "++"; - } - else { - stream2 << " += "; - op->increment.accept(this); - } - - } - - else if (op->kind == LoopKind::Foreach) { - stream2 << keywordString("foreach") << " ("; - - op->var.accept(this); - stream2 << " = "; - op->start.accept(this); - stream2 << keywordString(" ... "); - op->end.accept(this); - - } else { - stream2 << keywordString("for") << " ("; - if (!emittingCoroutine) { - if (op->var.type() == Int32) { - stream2 << "int32 "; - } - else if (op->var.type() == Int64) { - stream2 << "int64 "; - } - - } - op->var.accept(this); - stream2 << " = "; - op->start.accept(this); - stream2 << keywordString("; "); - op->var.accept(this); - stream2 << " < "; - parentPrecedence = BOTTOM; - op->end.accept(this); - stream2 << keywordString("; "); - op->var.accept(this); - - auto lit = op->increment.as(); - if (lit != nullptr && ((lit->type.isInt() && lit->equalsScalar(1)) || - (lit->type.isUInt() && lit->equalsScalar(1)))) { - stream2 << "++"; - } - else { - stream2 << " += "; - op->increment.accept(this); - } - - } - - stream2 << ") {\n"; - op->contents.accept(this); - doIndent(); - stream2 << "}"; - stream2 << endl; - -} - -void CodeGen_ISPC::visit(const While* op) { - // it's not clear from documentation that clang will vectorize - // while loops - // however, we'll output 
the pragmas anyway - if (op->kind == LoopKind::Vectorized) { - doIndent(); - out << genVectorizePragma(op->vec_width); - out << "\n"; - } - - CodeGen_C::visit(op); -} - -void CodeGen_ISPC::visit(const GetProperty* op) { - taco_iassert(varMap.count(op) > 0) << - "Property " << Expr(op) << " of " << op->tensor << " not found in varMap"; - if (is_ISPC_code_stream_enabled()) { - out2 << varMap[op]; - } - else { - out << varMap[op]; - } - -} - -void CodeGen_ISPC::visit(const Min* op) { - if (op->operands.size() == 1) { - op->operands[0].accept(this); - return; - } - for (size_t i=0; ioperands.size()-1; i++) { - stream << "TACO_MIN("; - op->operands[i].accept(this); - stream << ","; - } - op->operands.back().accept(this); - for (size_t i=0; ioperands.size()-1; i++) { - stream << ")"; - } -} - -void CodeGen_ISPC::visit(const Max* op) { - if (op->operands.size() == 1) { - op->operands[0].accept(this); - return; - } - for (size_t i=0; ioperands.size()-1; i++) { - stream << "TACO_MAX("; - op->operands[i].accept(this); - stream << ","; - } - op->operands.back().accept(this); - for (size_t i=0; ioperands.size()-1; i++) { - stream << ")"; - } -} - -void CodeGen_ISPC::visit(const Allocate* op) { - - - if (is_ISPC_code_stream_enabled()) { - string elementType = printCType(op->var.type(), false); - doIndent(); - - op->var.accept(this); - stream2 << " = "; - // stream2 << " = ("; - // stream2 << elementType << "*"; - // stream2 << ")"; - if (op->is_realloc) { - stream2 << "realloc("; - op->var.accept(this); - stream2 << ", "; - } - else { - // If the allocation was requested to clear the allocated memory, - // use calloc instead of malloc. 
- if (op->clear) { - stream2 << "calloc(1, "; - } else { - stream2 << "new "; - } - } - stream2 << elementType << "["; - parentPrecedence = MUL; - op->num_elements.accept(this); - parentPrecedence = TOP; - stream2 << "];"; - stream2 << endl; - - - } else { - CodeGen_C::visit(op); - - } - - -} - -void CodeGen_ISPC::visit(const Sqrt* op) { - taco_tassert(op->type.isFloat() && op->type.getNumBits() == 64) << - "Codegen doesn't currently support non-double sqrt"; - stream << "sqrt("; - op->a.accept(this); - stream << ")"; -} - -void CodeGen_ISPC::visit(const Assign* op) { - if (is_ISPC_code_stream_enabled()) { - doIndent(); - op->lhs.accept(this); - parentPrecedence = Precedence::TOP; - bool printed = false; - if (simplify) { - if (isa(op->rhs)) { - auto add = to(op->rhs); - if (add->a == op->lhs) { - const Literal* lit = add->b.as(); - if (lit != nullptr && ((lit->type.isInt() && lit->equalsScalar(1)) || - (lit->type.isUInt() && lit->equalsScalar(1)))) { - stream2 << "++"; - } - else { - if (op->use_atomics) { - stream2 << " += reduce_add("; - add->b.accept(this); - stream2 << ")"; - } - else { - stream2 << " += "; - add->b.accept(this); - } - } - printed = true; - } - } - else if (isa(op->rhs)) { - auto mul = to(op->rhs); - if (mul->a == op->lhs) { - stream2 << " *= "; - mul->b.accept(this); - printed = true; - } - } - else if (isa(op->rhs)) { - auto bitOr = to(op->rhs); - if (bitOr->a == op->lhs) { - stream2 << " |= "; - bitOr->b.accept(this); - printed = true; - } - } - } - if (!printed) { - stream2 << " = "; - op->rhs.accept(this); - } - - stream2 << ";"; - stream2 << endl; - - IRPrinter::visit(op); - } - else { - CodeGen_C::visit(op); - - } - - -} - -void CodeGen_ISPC::visit(const Store* op) { - if (is_ISPC_code_stream_enabled()) { - if (op->use_atomics) { - doIndent(); - stream2 << getAtomicPragma() << endl; - } - } - else { - if (op->use_atomics) { - doIndent(); - stream << getAtomicPragma() << endl; - } - } - IRPrinter::visit(op); -} - -} -} diff --git 
a/src/codegen/codegen_ispc.h b/src/codegen/codegen_ispc.h deleted file mode 100644 index 62d2897ca..000000000 --- a/src/codegen/codegen_ispc.h +++ /dev/null @@ -1,68 +0,0 @@ -#ifndef TACO_BACKEND_ISPC_H -#define TACO_BACKEND_ISPC_H -#include -#include -#include - -#include "taco/ir/ir.h" -#include "taco/ir/ir_printer.h" -#include "codegen_c.h" - -namespace taco { -namespace ir { - - -class CodeGen_ISPC : public CodeGen_C { -public: - /// Initialize a code generator that generates code to an - /// output stream. - CodeGen_ISPC(std::ostream &dest, OutputKind outputKind, bool simplify=true); - CodeGen_ISPC(std::ostream &dest, std::ostream &dest2, OutputKind outputKind, bool simplify=true); - ~CodeGen_ISPC(); - - /// Compile a lowered function - void compile(Stmt stmt, bool isFirst=false); - - /// Generate shims that unpack an array of pointers representing - /// a mix of taco_tensor_t* and scalars into a function call - static void generateShim(const Stmt& func, std::stringstream &stream); - -protected: - using CodeGen_C::visit; - - void visit(const Function*); - void visit(const VarDecl*); - void visit(const Yield*); - void visit(const Var*); - void visit(const For*); - void visit(const While*); - void visit(const GetProperty*); - void visit(const Min*); - void visit(const Max*); - void visit(const Allocate*); - void visit(const Sqrt*); - void visit(const Store*); - void visit(const Assign*); - - Stmt simplifyFunctionBodies(Stmt stmt); - std::string printCallISPCFunc(const std::string& funcName, std::map varMap, - std::vector &sortedProps); - void printISPCFunc(const Function *func, std::map varMap, - std::vector &sortedProps); - - bool taskCode = false; - - std::stringstream funcVariables; - std::vector sortedProps; - - class FindVars; - class FunctionCollector; - -private: - virtual std::string restrictKeyword() const { return "restrict"; } - void sendToStream(std::stringstream &stream); -}; - -} // namespace ir -} // namespace taco -#endif diff --git 
a/src/codegen/module.cpp b/src/codegen/module.cpp index 6f631d40e..c0192f243 100644 --- a/src/codegen/module.cpp +++ b/src/codegen/module.cpp @@ -4,7 +4,6 @@ #include #include #include -// #include #if USE_OPENMP #include #endif @@ -14,7 +13,6 @@ #include "taco/util/strings.h" #include "taco/util/env.h" #include "codegen/codegen_c.h" -#include "codegen/codegen_ispc.h" #include "codegen/codegen_cuda.h" #include "taco/cuda.h" @@ -53,13 +51,11 @@ void Module::compileToSource(string path, string prefix) { header.clear(); source.str(""); source.clear(); - additional_source.str(""); - additional_source.clear(); taco_tassert(target.arch == Target::C99) << "Only C99 codegen supported currently"; std::shared_ptr sourcegen = - CodeGen::init_default(source, additional_source, CodeGen::ImplementationGen); + CodeGen::init_default(source, CodeGen::ImplementationGen); std::shared_ptr headergen = CodeGen::init_default(header, CodeGen::HeaderGen); @@ -73,17 +69,8 @@ void Module::compileToSource(string path, string prefix) { ofstream source_file; string file_ending = should_use_CUDA_codegen() ? 
".cu" : ".c"; source_file.open(path+prefix+file_ending); - if (should_use_ISPC_codegen()) { - source_file << "#include \"" << path+prefix+"_ispc.h\"\n"; - } source_file << source.str(); source_file.close(); - - ofstream additional_source_file; - string file_ending2 = ".ispc"; - additional_source_file.open(path+prefix+file_ending2); - additional_source_file << additional_source.str(); - additional_source_file.close(); ofstream header_file; header_file.open(path+prefix+".h"); @@ -103,9 +90,6 @@ void writeShims(vector funcs, string path, string prefix) { if (should_use_CUDA_codegen()) { CodeGen_CUDA::generateShim(func, shims); } - // else if (should_use_ISPC_codegen()) { - // CodeGen_ISPC::generateShim(func, shims); - // } else { CodeGen_C::generateShim(func, shims); } @@ -115,9 +99,6 @@ void writeShims(vector funcs, string path, string prefix) { if (should_use_CUDA_codegen()) { shims_file.open(path+prefix+"_shims.cpp"); } - // else if (should_use_ISPC_codegen()) { - // shims_file.open(path+prefix+".c", ios::app); - // } else { shims_file.open(path+prefix+".c", ios::app); } @@ -144,13 +125,6 @@ string Module::compile() { file_ending = ".cu"; shims_file = prefix + "_shims.cpp"; } - // else if (should_use_ISPC_codegen()) { - // cc = util::getFromEnv("TACO_ISPC", "ispc"); - // cflags = util::getFromEnv("TACO_ISPC_FLAGS", - // " --target=sse2-i32x4,sse4-i32x8,avx1-i32x8,avx2-i32x8,avx512knl-i32x16,avx512skx-i32x16 --pic -O3 --addressing=64 --arch=x86-64" - // ) + " "; - - // } else { cc = util::getFromEnv(target.compiler_env, target.compiler); cflags = util::getFromEnv("TACO_CFLAGS", @@ -184,36 +158,10 @@ string Module::compile() { } std::cout << tmpdir << std::endl << libname << std::endl; - if (should_use_ISPC_codegen()) { - string ispc = util::getFromEnv("TACO_ISPC", "ispc"); - string ispcflags = util::getFromEnv("TACO_ISPC_FLAGS", - " --target=sse2-i32x4,sse4-i32x8,avx1-i32x8,avx2-i32x8,avx512knl-i32x16,avx512skx-i32x16 --pic -O3 --addressing=64 --arch=x86-64" - ) + " 
"; - string cmd = ispc + " " + ispcflags + " -o " + prefix + ".ispc.o " + " --emit-obj " + prefix + ".ispc " + "-h " + prefix + "_ispc.h"; - - // now compile the ispc file to generate the object file and the ispc header file - std::cout << "--------------------------------------------------------------------------------cmd: " << cmd << std::endl; - int err = system(cmd.data()); - taco_uassert(err == 0) << "Compilation command failed:\n" << cmd - << "\nreturned " << err; - - string ispc_object_file = " " + prefix + ".ispc.o "; - string ispc_object_files_for_diff_targets = " " + prefix + ".ispc_* "; - cmd = cc + " " + cflags + " " + - prefix + file_ending + " " + ispc_object_file + ispc_object_files_for_diff_targets + shims_file + " " + - "-o " + fullpath + " -lm -lrt "; - - // now compile the c file linking the ispc object file. ispc header is added to the top of the c file - std::cout << "--------------------------------------------------------------------------------cmd: " << cmd << std::endl; - err = system(cmd.data()); - taco_uassert(err == 0) << "Compilation command failed:\n" << cmd - << "\nreturned " << err; - } else { - // now compile it - int err = system(cmd.data()); - taco_uassert(err == 0) << "Compilation command failed:\n" << cmd - << "\nreturned " << err; - } + // now compile it + int err = system(cmd.data()); + taco_uassert(err == 0) << "Compilation command failed:\n" << cmd + << "\nreturned " << err; // use dlsym() to open the compiled library if (lib_handle) { @@ -318,11 +266,7 @@ int Module::callFuncPackedRaw(std::string name, void** args) { #endif std::cout << "calling the function\n"; - // CALLGRIND_START_INSTRUMENTATION; - // CALLGRIND_TOGGLE_COLLECT; int ret = func_ptr(args); - // CALLGRIND_TOGGLE_COLLECT; - // CALLGRIND_STOP_INSTRUMENTATION; std::cout << "function call completed\n"; #if USE_OPENMP diff --git a/src/cuda.cpp b/src/cuda.cpp index 68e49fe98..059c60105 100644 --- a/src/cuda.cpp +++ b/src/cuda.cpp @@ -7,25 +7,6 @@ using namespace 
std; namespace taco { - -static bool ISPC_codegen_enabled = ISPC_BUILT; -static bool ISPC_code_stream_enabled = false; -bool should_use_ISPC_codegen() { - return ISPC_codegen_enabled; -} - -bool is_ISPC_code_stream_enabled() { - return ISPC_code_stream_enabled; -} - -void set_ISPC_codegen_enabled(bool enabled) { - ISPC_codegen_enabled = enabled; -} - -void set_ISPC_code_stream_enabled(bool enabled) { - ISPC_code_stream_enabled = enabled; -} - /// Functions used by taco to interface with CUDA (especially unified memory) static bool CUDA_codegen_enabled = CUDA_BUILT; static bool CUDA_unified_memory_enabled = CUDA_BUILT; diff --git a/src/index_notation/transformations.cpp b/src/index_notation/transformations.cpp index c1d82a9fd..3846da6a8 100644 --- a/src/index_notation/transformations.cpp +++ b/src/index_notation/transformations.cpp @@ -1241,17 +1241,6 @@ IndexStmt parallelizeOuterLoop(IndexStmt stmt) { } return parallelized256; } - else if (should_use_ISPC_codegen()) { - std::cout << "outer loop parallelization for ISPC codegen\n"; - // IndexStmt parallelized = Parallelize(forall.getIndexVar(), ParallelUnit::CPUSpmd, OutputRaceStrategy::NoRaces).apply(stmt, &reason); - // if (parallelized == IndexStmt()) { - // // can't parallelize - // return stmt; - // } - // return parallelized; - - return stmt; - } else { std::cout << "outer loop parallelization for CPU codgen index statement\n"; IndexStmt parallelized = Parallelize(forall.getIndexVar(), ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces).apply(stmt, &reason); diff --git a/src/ir/ir_printer.cpp b/src/ir/ir_printer.cpp index eddca3f29..a874fb3ea 100644 --- a/src/ir/ir_printer.cpp +++ b/src/ir/ir_printer.cpp @@ -64,167 +64,85 @@ void IRPrinter::print(Stmt stmt) { } void IRPrinter::visit(const Literal* op) { - if (is_ISPC_code_stream_enabled()) { - if (color) { - stream2 << blue ; - } - - // It seems this is where all the types get printed in the final code generation. 
- // Come up with a way to generate different values if stream2 is used to generate ispc code - switch (op->type.getKind()) { - case Datatype::Bool: - stream2 << op->getValue(); - break; - case Datatype::UInt8: - stream2 << static_cast(op->getValue()); - break; - case Datatype::UInt16: - stream2 << op->getValue(); - break; - case Datatype::UInt32: - stream2 << op->getValue(); - break; - case Datatype::UInt64: - stream2 << op->getValue(); - break; - case Datatype::UInt128: - taco_not_supported_yet; - break; - case Datatype::Int8: - stream2 << static_cast(op->getValue()); - break; - case Datatype::Int16: - stream2 << op->getValue(); - break; - case Datatype::Int32: - stream2 << op->getValue(); - break; - case Datatype::Int64: - stream2 << op->getValue(); - break; - case Datatype::Int128: - taco_not_supported_yet; - break; - case Datatype::Float32: - stream2 << ((op->getValue() != 0.0) - ? util::toString(op->getValue()) : "0.0"); - break; - case Datatype::Float64: - stream2 << ((op->getValue()!=0.0) - ? util::toString(op->getValue()) : "0.0"); - break; - case Datatype::Complex64: { - std::complex val = op->getValue>(); - stream2 << val.real() << " + I*" << val.imag(); - } - break; - case Datatype::Complex128: { - std::complex val = op->getValue>(); - stream2 << val.real() << " + I*" << val.imag(); - } - break; - case Datatype::Undefined: - taco_ierror << "Undefined type in IR"; - break; - } - if (color) { - stream2 << nc; - } + if (color) { + stream << blue ; + } + + // It seems this is where all the types get printed in the final code generation. 
+ // Come up with a way to generate different values if stream2 is used to generate ispc code + switch (op->type.getKind()) { + case Datatype::Bool: + stream << op->getValue(); + break; + case Datatype::UInt8: + stream << static_cast(op->getValue()); + break; + case Datatype::UInt16: + stream << op->getValue(); + break; + case Datatype::UInt32: + stream << op->getValue(); + break; + case Datatype::UInt64: + stream << op->getValue(); + break; + case Datatype::UInt128: + taco_not_supported_yet; + break; + case Datatype::Int8: + stream << static_cast(op->getValue()); + break; + case Datatype::Int16: + stream << op->getValue(); + break; + case Datatype::Int32: + stream << op->getValue(); + break; + case Datatype::Int64: + stream << op->getValue(); + break; + case Datatype::Int128: + taco_not_supported_yet; + break; + case Datatype::Float32: + stream << ((op->getValue() != 0.0) + ? util::toString(op->getValue()) : "0.0"); + break; + case Datatype::Float64: + stream << ((op->getValue()!=0.0) + ? util::toString(op->getValue()) : "0.0"); + break; + case Datatype::Complex64: { + std::complex val = op->getValue>(); + stream << val.real() << " + I*" << val.imag(); } + break; + case Datatype::Complex128: { + std::complex val = op->getValue>(); + stream << val.real() << " + I*" << val.imag(); + } + break; + case Datatype::Undefined: + taco_ierror << "Undefined type in IR"; + break; + } - - - else { - - if (color) { - stream << blue ; - } - - // It seems this is where all the types get printed in the final code generation. 
- // Come up with a way to generate different values if stream2 is used to generate ispc code - switch (op->type.getKind()) { - case Datatype::Bool: - stream << op->getValue(); - break; - case Datatype::UInt8: - stream << static_cast(op->getValue()); - break; - case Datatype::UInt16: - stream << op->getValue(); - break; - case Datatype::UInt32: - stream << op->getValue(); - break; - case Datatype::UInt64: - stream << op->getValue(); - break; - case Datatype::UInt128: - taco_not_supported_yet; - break; - case Datatype::Int8: - stream << static_cast(op->getValue()); - break; - case Datatype::Int16: - stream << op->getValue(); - break; - case Datatype::Int32: - stream << op->getValue(); - break; - case Datatype::Int64: - stream << op->getValue(); - break; - case Datatype::Int128: - taco_not_supported_yet; - break; - case Datatype::Float32: - stream << ((op->getValue() != 0.0) - ? util::toString(op->getValue()) : "0.0"); - break; - case Datatype::Float64: - stream << ((op->getValue()!=0.0) - ? 
util::toString(op->getValue()) : "0.0"); - break; - case Datatype::Complex64: { - std::complex val = op->getValue>(); - stream << val.real() << " + I*" << val.imag(); - } - break; - case Datatype::Complex128: { - std::complex val = op->getValue>(); - stream << val.real() << " + I*" << val.imag(); - } - break; - case Datatype::Undefined: - taco_ierror << "Undefined type in IR"; - break; - } - - if (color) { - stream << nc; - } + if (color) { + stream << nc; + } - } + } void IRPrinter::visit(const Var* op) { - if (is_ISPC_code_stream_enabled()) { - if (varNames.contains(op)) { - stream2 << varNames.get(op); - } - else { - stream2 << op->name; - } + if (varNames.contains(op)) { + stream << varNames.get(op); } else { - if (varNames.contains(op)) { - stream << varNames.get(op); - } - else { - stream << op->name; - } + stream << op->name; } } @@ -333,100 +251,51 @@ void IRPrinter::visit(const Cast* op) { } void IRPrinter::visit(const Call* op) { - if (!is_ISPC_code_stream_enabled()) { - stream << op->func << "("; - parentPrecedence = Precedence::CALL; - acceptJoin(this, stream, op->args, ", "); - stream << ")"; - } else { - // statically added function to the ispc file has __ in the front - stream2 << "__" << op->func << "("; - parentPrecedence = Precedence::CALL; - acceptJoin(this, stream2, op->args, ", "); - stream2 << ")"; - } + stream << op->func << "("; + parentPrecedence = Precedence::CALL; + acceptJoin(this, stream, op->args, ", "); + stream << ")"; } void IRPrinter::visit(const IfThenElse* op) { taco_iassert(op->cond.defined()); taco_iassert(op->then.defined()); doIndent(); - if (is_ISPC_code_stream_enabled()) { - stream2 << keywordString("if "); - stream2 << "("; - parentPrecedence = Precedence::TOP; - op->cond.accept(this); - stream2 << ")"; - - Stmt scopedStmt = Stmt(to(op->then)->scopedStmt); - if (isa(scopedStmt)) { - stream2 << " {" << endl; - op->then.accept(this); - doIndent(); - stream2 << "}"; - } - else if (isa(scopedStmt)) { - int tmp = indent; - 
indent = 0; - stream2 << " "; - scopedStmt.accept(this); - indent = tmp; - } - else { - stream2 << endl; - op->then.accept(this); - } + stream << keywordString("if "); + stream << "("; + parentPrecedence = Precedence::TOP; + op->cond.accept(this); + stream << ")"; - if (op->otherwise.defined()) { - stream2 << "\n"; - doIndent(); - stream2 << keywordString("else"); - stream2 << " {\n"; - op->otherwise.accept(this); - doIndent(); - stream2 << "}"; - } - stream2 << endl; + Stmt scopedStmt = Stmt(to(op->then)->scopedStmt); + if (isa(scopedStmt)) { + stream << " {" << endl; + op->then.accept(this); + doIndent(); + stream << "}"; + } + else if (isa(scopedStmt)) { + int tmp = indent; + indent = 0; + stream << " "; + scopedStmt.accept(this); + indent = tmp; } - - else { - stream << keywordString("if "); - stream << "("; - parentPrecedence = Precedence::TOP; - op->cond.accept(this); - stream << ")"; - - Stmt scopedStmt = Stmt(to(op->then)->scopedStmt); - if (isa(scopedStmt)) { - stream << " {" << endl; - op->then.accept(this); - doIndent(); - stream << "}"; - } - else if (isa(scopedStmt)) { - int tmp = indent; - indent = 0; - stream << " "; - scopedStmt.accept(this); - indent = tmp; - } - else { - stream << endl; - op->then.accept(this); - } + stream << endl; + op->then.accept(this); + } - if (op->otherwise.defined()) { - stream << "\n"; - doIndent(); - stream << keywordString("else"); - stream << " {\n"; - op->otherwise.accept(this); - doIndent(); - stream << "}"; - } - stream << endl; + if (op->otherwise.defined()) { + stream << "\n"; + doIndent(); + stream << keywordString("else"); + stream << " {\n"; + op->otherwise.accept(this); + doIndent(); + stream << "}"; } + stream << endl; } @@ -490,22 +359,12 @@ void IRPrinter::visit(const Switch* op) { } void IRPrinter::visit(const Load* op) { - if (is_ISPC_code_stream_enabled()) { - parentPrecedence = Precedence::LOAD; - op->arr.accept(this); - stream2 << "["; - parentPrecedence = Precedence::LOAD; - op->loc.accept(this); - 
stream2 << "]"; - } - else { - parentPrecedence = Precedence::LOAD; - op->arr.accept(this); - stream << "["; - parentPrecedence = Precedence::LOAD; - op->loc.accept(this); - stream << "]"; - } + parentPrecedence = Precedence::LOAD; + op->arr.accept(this); + stream << "["; + parentPrecedence = Precedence::LOAD; + op->loc.accept(this); + stream << "]"; } void IRPrinter::visit(const Malloc* op) { @@ -522,149 +381,72 @@ void IRPrinter::visit(const Sizeof* op) { } void IRPrinter::visit(const Store* op) { - if (is_ISPC_code_stream_enabled()) { - doIndent(); - op->arr.accept(this); - stream2 << "["; - parentPrecedence = Precedence::TOP; - op->loc.accept(this); - stream2 << "] = "; - parentPrecedence = Precedence::TOP; - op->data.accept(this); - stream2 << ";"; - stream2 << endl; - } - else { - doIndent(); - op->arr.accept(this); - stream << "["; - parentPrecedence = Precedence::TOP; - op->loc.accept(this); - stream << "] = "; - parentPrecedence = Precedence::TOP; - op->data.accept(this); - stream << ";"; - stream << endl; - } + doIndent(); + op->arr.accept(this); + stream << "["; + parentPrecedence = Precedence::TOP; + op->loc.accept(this); + stream << "] = "; + parentPrecedence = Precedence::TOP; + op->data.accept(this); + stream << ";"; + stream << endl; } void IRPrinter::visit(const For* op) { - // std::cout << "This is IRPrinter::visit For op method\n"; - if (is_ISPC_code_stream_enabled()) { - doIndent(); - stream2 << keywordString("for") << " (" - << keywordString(util::toString(op->var.type())) << " "; - op->var.accept(this); - stream2 << " = "; - op->start.accept(this); - stream2 << keywordString("; "); - op->var.accept(this); - stream2 << " < "; - parentPrecedence = BOTTOM; - op->end.accept(this); - stream2 << keywordString("; "); - op->var.accept(this); - - auto lit = op->increment.as(); - if (lit != nullptr && ((lit->type.isInt() && lit->equalsScalar(1)) || - (lit->type.isUInt() && lit->equalsScalar(1)))) { - stream2 << "++"; - } - else { - stream2 << " += "; - 
op->increment.accept(this); - } - stream2 << ") {\n"; + doIndent(); + stream << keywordString("for") << " (" + << keywordString(util::toString(op->var.type())) << " "; + op->var.accept(this); + stream << " = "; + op->start.accept(this); + stream << keywordString("; "); + op->var.accept(this); + stream << " < "; + parentPrecedence = BOTTOM; + op->end.accept(this); + stream << keywordString("; "); + op->var.accept(this); - op->contents.accept(this); - doIndent(); - stream2 << "}"; - stream2 << endl; + auto lit = op->increment.as(); + if (lit != nullptr && ((lit->type.isInt() && lit->equalsScalar(1)) || + (lit->type.isUInt() && lit->equalsScalar(1)))) { + stream << "++"; } - - else { - doIndent(); - stream << keywordString("for") << " (" - << keywordString(util::toString(op->var.type())) << " "; - op->var.accept(this); - stream << " = "; - op->start.accept(this); - stream << keywordString("; "); - op->var.accept(this); - stream << " < "; - parentPrecedence = BOTTOM; - op->end.accept(this); - stream << keywordString("; "); - op->var.accept(this); - - auto lit = op->increment.as(); - if (lit != nullptr && ((lit->type.isInt() && lit->equalsScalar(1)) || - (lit->type.isUInt() && lit->equalsScalar(1)))) { - stream << "++"; - } - else { - stream << " += "; - op->increment.accept(this); - } - stream << ") {\n"; - - op->contents.accept(this); - doIndent(); - stream << "}"; - stream << endl; + stream << " += "; + op->increment.accept(this); } + stream << ") {\n"; + + op->contents.accept(this); + doIndent(); + stream << "}"; + stream << endl; } void IRPrinter::sendToStream(std::stringstream &stream) { - if (is_ISPC_code_stream_enabled()) { - this->stream2 << stream.str(); - } - else { - this->stream << stream.str(); - } + this->stream << stream.str(); } void IRPrinter::visit(const While* op) { - // std::stringstream stream; - if (is_ISPC_code_stream_enabled()) { - doIndent(); - stream2 << keywordString("while "); - stream2 << "("; - parentPrecedence = Precedence::TOP; - 
op->cond.accept(this); - stream2 << ")"; - stream2 << " {\n"; - op->contents.accept(this); - doIndent(); - stream2 << "}"; - stream2 << endl; - } - else { - doIndent(); - stream << keywordString("while "); - stream << "("; - parentPrecedence = Precedence::TOP; - op->cond.accept(this); - stream << ")"; - stream << " {\n"; - op->contents.accept(this); - doIndent(); - stream << "}"; - stream << endl; - } - // sendToStream(stream); + doIndent(); + stream << keywordString("while "); + stream << "("; + parentPrecedence = Precedence::TOP; + op->cond.accept(this); + stream << ")"; + stream << " {\n"; + op->contents.accept(this); + doIndent(); + stream << "}"; + stream << endl; } void IRPrinter::visit(const Block* op) { - if (is_ISPC_code_stream_enabled()) { - acceptJoin(this, stream2, op->contents, ""); - } - else { - acceptJoin(this, stream, op->contents, ""); - } + acceptJoin(this, stream, op->contents, ""); } void IRPrinter::visit(const Scope* op) { @@ -676,140 +458,87 @@ void IRPrinter::visit(const Scope* op) { } void IRPrinter::visit(const Function* op) { - if (is_ISPC_code_stream_enabled()) { - stream2 << keywordString("void ") << op->name; - stream2 << "("; - if (op->outputs.size() > 0) stream2 << "Tensor "; - acceptJoin(this, stream2, op->outputs, ", Tensor "); - if (op->outputs.size() > 0 && op->inputs.size()) stream2 << ", "; - if (op->inputs.size() > 0) stream2 << "Tensor "; - acceptJoin(this, stream2, op->inputs, ", Tensor "); - stream2 << ") {" << endl; - - resetNameCounters(); - op->body.accept(this); - - doIndent(); - stream2 << "}"; - } - else { - stream << keywordString("void ") << op->name; - stream << "("; - if (op->outputs.size() > 0) stream << "Tensor "; - acceptJoin(this, stream, op->outputs, ", Tensor "); - if (op->outputs.size() > 0 && op->inputs.size()) stream << ", "; - if (op->inputs.size() > 0) stream << "Tensor "; - acceptJoin(this, stream, op->inputs, ", Tensor "); - stream << ") {" << endl; + stream << keywordString("void ") << op->name; + 
stream << "("; + if (op->outputs.size() > 0) stream << "Tensor "; + acceptJoin(this, stream, op->outputs, ", Tensor "); + if (op->outputs.size() > 0 && op->inputs.size()) stream << ", "; + if (op->inputs.size() > 0) stream << "Tensor "; + acceptJoin(this, stream, op->inputs, ", Tensor "); + stream << ") {" << endl; - resetNameCounters(); - op->body.accept(this); + resetNameCounters(); + op->body.accept(this); - doIndent(); - stream << "}"; - } + doIndent(); + stream << "}"; } void IRPrinter::visit(const VarDecl* op) { - if (is_ISPC_code_stream_enabled()) { - doIndent(); - if (op->var.type() == Int32) { - stream2 << keywordString("int32"); - } - else if (op->var.type() == Int64) { - stream2 << keywordString("int64"); - } else { - stream2 << keywordString(util::toString(op->var.type())); - } - taco_iassert(isa(op->var)); - if (to(op->var)->is_ptr) { - stream2 << "* "; // removed restrict keyword from here - } - stream2 << " "; - string varName = varNameGenerator.getUniqueName(util::toString(op->var)); - varNames.insert({op->var, varName}); - op->var.accept(this); - parentPrecedence = Precedence::TOP; - stream2 << " = "; - op->rhs.accept(this); - stream2 << ";"; - stream2 << endl; - } - else { - doIndent(); - stream << keywordString(util::toString(op->var.type())); - taco_iassert(isa(op->var)); - if (to(op->var)->is_ptr) { - stream << "* restrict"; - } - stream << " "; - string varName = varNameGenerator.getUniqueName(util::toString(op->var)); - varNames.insert({op->var, varName}); - op->var.accept(this); - parentPrecedence = Precedence::TOP; - stream << " = "; - op->rhs.accept(this); - stream << ";"; - stream << endl; - } + doIndent(); + stream << keywordString(util::toString(op->var.type())); + taco_iassert(isa(op->var)); + if (to(op->var)->is_ptr) { + stream << "* restrict"; + } + stream << " "; + string varName = varNameGenerator.getUniqueName(util::toString(op->var)); + varNames.insert({op->var, varName}); + op->var.accept(this); + parentPrecedence = 
Precedence::TOP; + stream << " = "; + op->rhs.accept(this); + stream << ";"; + stream << endl; } void IRPrinter::visit(const Assign* op) { - if (is_ISPC_code_stream_enabled()) { - - } - - - - else { - doIndent(); - op->lhs.accept(this); - parentPrecedence = Precedence::TOP; - bool printed = false; - if (simplify) { - if (isa(op->rhs)) { - auto add = to(op->rhs); - if (add->a == op->lhs) { - const Literal* lit = add->b.as(); - if (lit != nullptr && ((lit->type.isInt() && lit->equalsScalar(1)) || - (lit->type.isUInt() && lit->equalsScalar(1)))) { - stream << "++"; - } - else { - stream << " += "; - add->b.accept(this); - } - printed = true; + doIndent(); + op->lhs.accept(this); + parentPrecedence = Precedence::TOP; + bool printed = false; + if (simplify) { + if (isa(op->rhs)) { + auto add = to(op->rhs); + if (add->a == op->lhs) { + const Literal* lit = add->b.as(); + if (lit != nullptr && ((lit->type.isInt() && lit->equalsScalar(1)) || + (lit->type.isUInt() && lit->equalsScalar(1)))) { + stream << "++"; } - } - else if (isa(op->rhs)) { - auto mul = to(op->rhs); - if (mul->a == op->lhs) { - stream << " *= "; - mul->b.accept(this); - printed = true; + else { + stream << " += "; + add->b.accept(this); } + printed = true; } - else if (isa(op->rhs)) { - auto bitOr = to(op->rhs); - if (bitOr->a == op->lhs) { - stream << " |= "; - bitOr->b.accept(this); - printed = true; - } + } + else if (isa(op->rhs)) { + auto mul = to(op->rhs); + if (mul->a == op->lhs) { + stream << " *= "; + mul->b.accept(this); + printed = true; } } - if (!printed) { - stream << " = "; - op->rhs.accept(this); + else if (isa(op->rhs)) { + auto bitOr = to(op->rhs); + if (bitOr->a == op->lhs) { + stream << " |= "; + bitOr->b.accept(this); + printed = true; + } } - - stream << ";"; - stream << endl; + } + if (!printed) { + stream << " = "; + op->rhs.accept(this); } + stream << ";"; + stream << endl; } void IRPrinter::visit(const Yield* op) { @@ -837,22 +566,12 @@ void IRPrinter::visit(const Allocate* op) { 
} void IRPrinter::visit(const Free* op) { - if (is_ISPC_code_stream_enabled()) { - doIndent(); - stream2 << "delete[] "; - parentPrecedence = Precedence::TOP; - op->var.accept(this); - stream2 << ";"; - stream2 << endl; - } - else { - doIndent(); - stream << "free("; - parentPrecedence = Precedence::TOP; - op->var.accept(this); - stream << ");"; - stream << endl; - } + doIndent(); + stream << "free("; + parentPrecedence = Precedence::TOP; + op->var.accept(this); + stream << ");"; + stream << endl; } void IRPrinter::visit(const Comment* op) { @@ -862,32 +581,17 @@ void IRPrinter::visit(const Comment* op) { } void IRPrinter::visit(const BlankLine*) { - if (is_ISPC_code_stream_enabled()) { - stream2 << endl; - } - else { - stream << endl; - } + stream << endl; } void IRPrinter::visit(const Continue*) { doIndent(); - if (!is_ISPC_code_stream_enabled()) { - stream << "continue;" << endl; - } - else { - stream2 << "continue;" << endl; - } + stream << "continue;" << endl; } void IRPrinter::visit(const Break*) { doIndent(); - if (!is_ISPC_code_stream_enabled()) { - stream << "break;" << endl; - } - else { - stream2 << "break;" << endl; - } + stream << "break;" << endl; } void IRPrinter::visit(const Print* op) { @@ -903,12 +607,7 @@ void IRPrinter::visit(const Print* op) { } void IRPrinter::visit(const GetProperty* op) { - if (is_ISPC_code_stream_enabled()) { - stream2 << op->name; - } - else { - stream << op->name; - } + stream << op->name; } void IRPrinter::visit(const Sort* op) { @@ -966,47 +665,23 @@ void IRPrinter::resetNameCounters() { } void IRPrinter::doIndent() { - if (is_ISPC_code_stream_enabled()) { - for (int i=0; i 0) { const Format bufferFormat = COO(format.getOrder(), false, true, false, format.getModeOrdering()); From 078522f38d9e9d079374a0e4d9078d06124ca6a1 Mon Sep 17 00:00:00 2001 From: Adhhitha Dias Date: Wed, 11 May 2022 17:29:24 -0400 Subject: [PATCH 15/16] remove more ispc related content --- include/taco/ir/ir.h | 2 +- include/taco/ir/ir_printer.h | 3 
--- include/taco/ir_tags.h | 2 +- src/codegen/codegen.cpp | 9 --------- src/codegen/codegen.h | 3 --- src/codegen/codegen_c.cpp | 3 --- src/ir/ir_printer.cpp | 10 +--------- 7 files changed, 3 insertions(+), 29 deletions(-) diff --git a/include/taco/ir/ir.h b/include/taco/ir/ir.h index 96dc7d034..f852f26b1 100644 --- a/include/taco/ir/ir.h +++ b/include/taco/ir/ir.h @@ -591,7 +591,7 @@ struct Switch : public StmtNode { static const IRNodeType _type_info = IRNodeType::Switch; }; -enum class LoopKind {Serial, Static, Dynamic, Runtime, Vectorized, Static_Chunked, Foreach, Mul_Thread, Init}; +enum class LoopKind {Serial, Static, Dynamic, Runtime, Vectorized, Static_Chunked}; /** A for loop from start to end by increment. * A vectorized loop will require the increment to be 1 and the diff --git a/include/taco/ir/ir_printer.h b/include/taco/ir/ir_printer.h index c2c505bf5..4e50764e9 100644 --- a/include/taco/ir/ir_printer.h +++ b/include/taco/ir/ir_printer.h @@ -16,7 +16,6 @@ class IRPrinter : public IRVisitorStrict { public: IRPrinter(std::ostream& stream); IRPrinter(std::ostream& stream, bool color, bool simplify); - IRPrinter(std::ostream& stream, std::ostream& stream2, bool color, bool simplify); virtual ~IRPrinter(); void setColor(bool color); @@ -73,7 +72,6 @@ class IRPrinter : public IRVisitorStrict { virtual void visit(const Break*); std::ostream &stream; - std::ostream &stream2; int indent; bool color; bool simplify; @@ -111,7 +109,6 @@ class IRPrinter : public IRVisitorStrict { void doIndent(); void printBinOp(Expr a, Expr b, std::string op, Precedence precedence); bool needsParentheses(Precedence precedence); - void sendToStream(std::stringstream &stream); std::string keywordString(std::string); std::string commentString(std::string); diff --git a/include/taco/ir_tags.h b/include/taco/ir_tags.h index 6a74be173..5858a13e3 100644 --- a/include/taco/ir_tags.h +++ b/include/taco/ir_tags.h @@ -9,7 +9,7 @@ namespace taco { /// ParallelUnit::GPUWarp can be optionally 
used to allow for GPU warp-level primitives /// ParallelUnit::GPUThread causes for every iteration to be executed on a separate GPU thread enum class ParallelUnit { - NotParallel, DefaultUnit, GPUBlock, GPUWarp, GPUThread, CPUThread, CPUVector, CPUThreadGroupReduction, GPUBlockReduction, GPUWarpReduction, CPUSimd, CPUSpmd + NotParallel, DefaultUnit, GPUBlock, GPUWarp, GPUThread, CPUThread, CPUVector, CPUThreadGroupReduction, GPUBlockReduction, GPUWarpReduction }; extern const char *ParallelUnit_NAMES[]; diff --git a/src/codegen/codegen.cpp b/src/codegen/codegen.cpp index 696d4971a..64c8b3f02 100644 --- a/src/codegen/codegen.cpp +++ b/src/codegen/codegen.cpp @@ -31,15 +31,6 @@ shared_ptr CodeGen::init_default(std::ostream &dest, OutputKind outputK } } -shared_ptr CodeGen::init_default(std::ostream &dest, std::ostream &dest2, OutputKind outputKind) { - if (should_use_CUDA_codegen()) { - return make_shared(dest, outputKind); - } - else { - return make_shared(dest, outputKind); - } -} - int CodeGen::countYields(const Function *func) { struct CountYields : public IRVisitor { int yields = 0; diff --git a/src/codegen/codegen.h b/src/codegen/codegen.h index db891f995..48540904e 100644 --- a/src/codegen/codegen.h +++ b/src/codegen/codegen.h @@ -18,11 +18,8 @@ class CodeGen : public IRPrinter { CodeGen(std::ostream& stream, CodeGenType type) : IRPrinter(stream), codeGenType(type) {}; CodeGen(std::ostream& stream, bool color, bool simplify, CodeGenType type) : IRPrinter(stream, color, simplify), codeGenType(type) {}; - CodeGen(std::ostream& stream, std::ostream& stream2, bool color, bool simplify, CodeGenType type) - : IRPrinter(stream, stream2, color, simplify), codeGenType(type) {}; /// Initialize the default code generator static std::shared_ptr init_default(std::ostream &dest, OutputKind outputKind); - static std::shared_ptr init_default(std::ostream &dest, std::ostream &dest2, OutputKind outputKind); /// Compile a lowered function virtual void compile(Stmt stmt, bool 
isFirst=false) =0; diff --git a/src/codegen/codegen_c.cpp b/src/codegen/codegen_c.cpp index 83da7aaab..6dd39107d 100644 --- a/src/codegen/codegen_c.cpp +++ b/src/codegen/codegen_c.cpp @@ -242,9 +242,6 @@ class CodeGen_C::FindVars : public IRVisitor { CodeGen_C::CodeGen_C(std::ostream &dest, OutputKind outputKind, bool simplify) : CodeGen(dest, false, simplify, C), out(dest), out2(dest), outputKind(outputKind) {} - -CodeGen_C::CodeGen_C(std::ostream &dest, std::ostream &dest2, OutputKind outputKind, bool simplify) - : CodeGen(dest, dest2, false, simplify, C), out(dest), out2(dest2), outputKind(outputKind) {} CodeGen_C::~CodeGen_C() {} diff --git a/src/ir/ir_printer.cpp b/src/ir/ir_printer.cpp index a874fb3ea..0bc848148 100644 --- a/src/ir/ir_printer.cpp +++ b/src/ir/ir_printer.cpp @@ -35,11 +35,7 @@ IRPrinter::IRPrinter(ostream &s) : IRPrinter(s, false, false) { } IRPrinter::IRPrinter(ostream &s, bool color, bool simplify) - : stream(s), stream2(s), indent(0), color(color), simplify(simplify) { -} - -IRPrinter::IRPrinter(ostream &s, ostream &s2, bool color, bool simplify) - : stream(s), stream2(s2), indent(0), color(color), simplify(simplify) { + : stream(s), indent(0), color(color), simplify(simplify) { } IRPrinter::~IRPrinter() { @@ -427,10 +423,6 @@ void IRPrinter::visit(const For* op) { } -void IRPrinter::sendToStream(std::stringstream &stream) { - this->stream << stream.str(); -} - void IRPrinter::visit(const While* op) { doIndent(); stream << keywordString("while "); From d2723efaef2c5ea1ced26e96750f9706680f0673 Mon Sep 17 00:00:00 2001 From: Adhhitha Dias Date: Wed, 11 May 2022 17:51:57 -0400 Subject: [PATCH 16/16] remove more ispc related content --- CMakeLists.txt | 7 ------- include/taco/taco_tensor_t.h | 1 - include/taco/util/strings.h | 1 - include/taco/version.h.in | 1 - src/codegen/codegen_c.cpp | 16 +--------------- src/codegen/codegen_c.h | 2 -- src/codegen/codegen_cuda.cpp | 1 - src/ir_tags.cpp | 2 +- src/lower/iteration_graph.cpp | 11 +---------- 
src/lower/iterator.cpp | 3 --- src/lower/lowerer_impl_imperative.cpp | 13 +++---------- 11 files changed, 6 insertions(+), 52 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index bbc678c72..4f8b54eee 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -10,12 +10,10 @@ project(taco LANGUAGES C CXX ) option(CUDA "Build for NVIDIA GPU (CUDA must be preinstalled)" OFF) -option(ISPC "Build for Intel ISPC Compiler (ISPC Compiler must be preinstalled)" OFF) option(PYTHON "Build TACO for python environment" OFF) option(OPENMP "Build with OpenMP execution support" ON) option(COVERAGE "Build with code coverage analysis" OFF) set(TACO_FEATURE_CUDA 0) -set(TACO_FEATURE_ISPC 0) set(TACO_FEATURE_OPENMP 1) set(TACO_FEATURE_PYTHON 0) if(CUDA) @@ -24,11 +22,6 @@ if(CUDA) add_definitions(-DCUDA_BUILT) set(TACO_FEATURE_CUDA 1) endif(CUDA) -if(ISPC) - message("-- Searching for ISPC Installation") - add_definitions(-DISPC_BUILT) - set(TACO_FEATURE_ISPC 1) -endif(ISPC) if(OPENMP) message("-- Will use OpenMP for parallel execution") add_definitions(-DUSE_OPENMP) diff --git a/include/taco/taco_tensor_t.h b/include/taco/taco_tensor_t.h index f27acd9c7..20d78bb51 100644 --- a/include/taco/taco_tensor_t.h +++ b/include/taco/taco_tensor_t.h @@ -6,7 +6,6 @@ #ifndef TACO_TENSOR_T_DEFINED #define TACO_TENSOR_T_DEFINED -#include #include typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t; diff --git a/include/taco/util/strings.h b/include/taco/util/strings.h index 35d2c3949..5dfb2f174 100644 --- a/include/taco/util/strings.h +++ b/include/taco/util/strings.h @@ -1,7 +1,6 @@ #ifndef TACO_UTIL_STRINGS_H #define TACO_UTIL_STRINGS_H -#include "taco/cuda.h" #include #include #include diff --git a/include/taco/version.h.in b/include/taco/version.h.in index 8ef507598..bc5559d7d 100644 --- a/include/taco/version.h.in +++ b/include/taco/version.h.in @@ -20,6 +20,5 @@ #define TACO_FEATURE_OPENMP @TACO_FEATURE_OPENMP@ #define TACO_FEATURE_PYTHON @TACO_FEATURE_PYTHON@ #define 
TACO_FEATURE_CUDA @TACO_FEATURE_CUDA@ -#define TACO_FEATURE_ISPC @TACO_FEATURE_ISPC@ #endif /* TACO_VERSION_H */ diff --git a/src/codegen/codegen_c.cpp b/src/codegen/codegen_c.cpp index 6dd39107d..d55adbe58 100644 --- a/src/codegen/codegen_c.cpp +++ b/src/codegen/codegen_c.cpp @@ -241,7 +241,7 @@ class CodeGen_C::FindVars : public IRVisitor { }; CodeGen_C::CodeGen_C(std::ostream &dest, OutputKind outputKind, bool simplify) - : CodeGen(dest, false, simplify, C), out(dest), out2(dest), outputKind(outputKind) {} + : CodeGen(dest, false, simplify, C), out(dest), outputKind(outputKind) {} CodeGen_C::~CodeGen_C() {} @@ -300,17 +300,14 @@ void CodeGen_C::visit(const Function* func) { // Print variable declarations out << printDecls(varFinder.varDecls, func->inputs, func->outputs) << endl; - // out << "printf(\"declarations added\\n\");" << std::endl; if (emittingCoroutine) { out << printContextDeclAndInit(varMap, localVars, numYields, func->name) << endl; } - // out << "printf(\"declarations added2\\n\");" << std::endl; // output body print(func->body); - // out << "printf(\"function body added " << count++ << "\\n\"); // " << std::endl; // output repack only if we allocated memory @@ -409,8 +406,6 @@ static string getAtomicPragma() { // http://clang.llvm.org/docs/LanguageExtensions.html#extensions-for-loop-hint-optimizations void CodeGen_C::visit(const For* op) { - // out << " printf(\"adding for loop " << count++ << "\\n\"); //" << std::endl; - switch (op->kind) { case LoopKind::Vectorized: doIndent(); @@ -460,14 +455,6 @@ void CodeGen_C::visit(const For* op) { } stream << ") {\n"; - // out << " printf(\"loop " << count++ << " : %d , dim: %d, %d\\n\","; - // op->var.accept(this); - // out << ", "; - // op->start.accept(this); - // out << ", "; - // op->end.accept(this); - // out << "); // " << count++ << std::endl; - op->contents.accept(this); doIndent(); stream << "}"; @@ -488,7 +475,6 @@ void CodeGen_C::visit(const While* op) { } void CodeGen_C::visit(const 
GetProperty* op) { - // std::cout << "GetProperty* " << op << std::endl; taco_iassert(varMap.count(op) > 0) << "Property " << Expr(op) << " of " << op->tensor << " not found in varMap"; out << varMap[op]; diff --git a/src/codegen/codegen_c.h b/src/codegen/codegen_c.h index c8505a3bb..37bda6046 100644 --- a/src/codegen/codegen_c.h +++ b/src/codegen/codegen_c.h @@ -16,7 +16,6 @@ class CodeGen_C : public CodeGen { /// Initialize a code generator that generates code to an /// output stream. CodeGen_C(std::ostream &dest, OutputKind outputKind, bool simplify=true); - CodeGen_C(std::ostream &dest, std::ostream &dest2, OutputKind outputKind, bool simplify=true); ~CodeGen_C(); /// Compile a lowered function @@ -46,7 +45,6 @@ class CodeGen_C : public CodeGen { std::map varMap; std::vector localVars; std::ostream &out; - std::ostream &out2; int count = 0; OutputKind outputKind; diff --git a/src/codegen/codegen_cuda.cpp b/src/codegen/codegen_cuda.cpp index 14505f740..77cf0cd88 100644 --- a/src/codegen/codegen_cuda.cpp +++ b/src/codegen/codegen_cuda.cpp @@ -646,7 +646,6 @@ void CodeGen_CUDA::printDeviceFunctions(const Function* func) { // Collect device functions resetUniqueNameCounters(); deviceFunctionLoopDepth = 0; - // here they calculate the device FunctionCollecor DeviceFunctionCollector deviceFunctionCollector(func->inputs, func->outputs, this); func->body.accept(&deviceFunctionCollector); deviceFunctions = deviceFunctionCollector.blockFors; diff --git a/src/ir_tags.cpp b/src/ir_tags.cpp index e7365d6c2..af3dbd775 100644 --- a/src/ir_tags.cpp +++ b/src/ir_tags.cpp @@ -2,7 +2,7 @@ namespace taco { -const char *ParallelUnit_NAMES[] = {"NotParallel", "DefaultUnit", "GPUBlock", "GPUWarp", "GPUThread", "CPUThread", "CPUVector", "CPUThreadGroupReduction", "GPUBlockReduction", "GPUWarpReduction", "CPUSimd", "CPUSpmd"}; +const char *ParallelUnit_NAMES[] = {"NotParallel", "DefaultUnit", "GPUBlock", "GPUWarp", "GPUThread", "CPUThread", "CPUVector", "CPUThreadGroupReduction", 
"GPUBlockReduction", "GPUWarpReduction"}; const char *OutputRaceStrategy_NAMES[] = {"IgnoreRaces", "NoRaces", "Atomics", "Temporary", "ParallelReduction"}; const char *BoundType_NAMES[] = {"MinExact", "MinConstraint", "MaxExact", "MaxConstraint"}; const char *AssembleStrategy_NAMES[] = {"Append", "Insert"}; diff --git a/src/lower/iteration_graph.cpp b/src/lower/iteration_graph.cpp index 482d84aae..b25f820c1 100644 --- a/src/lower/iteration_graph.cpp +++ b/src/lower/iteration_graph.cpp @@ -48,8 +48,6 @@ struct IterationGraph::Content { IterationGraph::IterationGraph() { } -// remember that iteration graph does not have an ordering -// I got the ordering from topologically reorder index Ryan wrote IterationGraph IterationGraph::make(Assignment assignment) { TensorVar tensor = assignment.getLhs().getTensorVar(); IndexExpr expr = assignment.getRhs(); @@ -68,14 +66,7 @@ IterationGraph IterationGraph::make(Assignment assignment) { // access nodes of right hand side match(expr, - function([&](const AccessNode* op) { - std::cout << "access node: " << op->tensorVar << " <- " << IndexExpr(op) << std::endl; - std::cout << "index var: "; - for (auto indexVar : op->indexVars) { - std::cout << indexVar << " "; - } - std::cout << std::endl; - + function([&](const AccessNode* op) { auto type = op->tensorVar.getType(); taco_iassert((size_t)type.getShape().getOrder() == op->indexVars.size()) << "Tensor access " << IndexExpr(op) << " but tensor format only has " diff --git a/src/lower/iterator.cpp b/src/lower/iterator.cpp index eb3d8ac3b..0f0c024c5 100644 --- a/src/lower/iterator.cpp +++ b/src/lower/iterator.cpp @@ -569,9 +569,6 @@ void Iterators::createAccessIterators(Access access, Format format, Expr tensorI ProvenanceGraph provGraph, const map &tensorVars) { TensorVar tensorConcrete = access.getTensorVar(); - cout << "tensor: " << tensorConcrete << " " ; - cout << "tensorConcrete order: " << tensorConcrete.getOrder(); - cout << ", format order: " << format.getOrder() << endl; 
taco_iassert(tensorConcrete.getOrder() == format.getOrder()) << tensorConcrete << ", Format" << format; Shape shape = tensorConcrete.getType().getShape(); diff --git a/src/lower/lowerer_impl_imperative.cpp b/src/lower/lowerer_impl_imperative.cpp index eed0c4174..e8947337d 100644 --- a/src/lower/lowerer_impl_imperative.cpp +++ b/src/lower/lowerer_impl_imperative.cpp @@ -1,6 +1,4 @@ #include -#include "taco/cuda.h" -#include "taco/ir_tags.h" #include "taco/lower/lowerer_impl_imperative.h" #include "taco/lower/lowerer_impl.h" @@ -591,7 +589,6 @@ Stmt LowererImplImperative::lowerForall(Forall forall) { bool hasExactBound = provGraph.hasExactBound(forall.getIndexVar()); bool forallNeedsUnderivedGuards = !hasExactBound && emitUnderivedGuards; - if (!ignoreVectorize && forallNeedsUnderivedGuards && (forall.getParallelUnit() == ParallelUnit::CPUVector || forall.getUnrollFactor() > 0)) { @@ -833,7 +830,6 @@ Stmt LowererImplImperative::lowerForall(Forall forall) parallelUnitIndexVars.erase(forall.getParallelUnit()); parallelUnitSizes.erase(forall.getParallelUnit()); } - return Block::blanks(preInitValues, temporaryValuesInitFree[0], loops, @@ -1505,7 +1501,6 @@ Stmt LowererImplImperative::lowerForallFusedPosition(Forall forall, Iterator ite && forall.getOutputRaceStrategy() != OutputRaceStrategy::ParallelReduction && !ignoreVectorize) { kind = LoopKind::Runtime; } - // Loop with preamble and postamble return Block::blanks(boundsCompute, Block::make(Block::make(searchForUnderivedStart), @@ -1768,7 +1763,6 @@ Stmt LowererImplImperative::lowerForallBody(Expr coordinate, IndexStmt stmt, vector inserters, vector appenders, const set& reducedAccesses) { - Stmt initVals = resizeAndInitValues(appenders, reducedAccesses); // Inserter positions @@ -1893,7 +1887,6 @@ vector LowererImplImperative::codeToInitializeDenseAcceleratorArrays(Where Expr p = Var::make("p" + temporary.getName(), Int()); Stmt guardZeroInit = Store::make(alreadySetArr, p, ir::Literal::zero(bitGuardType)); - // 
std::cout << "vector LowererImplImperative::codeToInitializeDenseAcceleratorArrays\n" << std::endl; Stmt zeroInitLoop = For::make(p, 0, bitGuardSize, 1, guardZeroInit, LoopKind::Serial); Stmt inits = Block::make(alreadySetDecl, indexListDecl, allocateAlreadySet, allocateIndexList, zeroInitLoop); return {inits, freeTemps}; @@ -2124,6 +2117,8 @@ vector LowererImplImperative::codeToInitializeTemporary(Where where) { needComputeValues(where, temporary)) { values = ir::Var::make(temporary.getName(), temporary.getType().getDataType(), true, false); + taco_iassert(temporary.getType().getOrder() == 1) + << " Temporary order was " << temporary.getType().getOrder(); // TODO Expr size = getTemporarySize(where); // no decl needed for shared memory @@ -2747,7 +2742,7 @@ Stmt LowererImplImperative::initResultArrays(vector writes, // iteration of all the iterators is not full. We can check this by seeing if we can recover a // full iterator from our set of iterators. Expr size = generateAssembleCode() ? getCapacityVar(tensor) : parentSize; - result.push_back(zeroInitValues(tensor, 0, size)); // init values + result.push_back(zeroInitValues(tensor, 0, size)); } } return result.empty() ? Stmt() : Block::blanks(result); @@ -3238,7 +3233,6 @@ Stmt LowererImplImperative::codeToIncIteratorVars(Expr coordinate, IndexVar coor for (auto& iterator : levelIterators) { Expr ivar = iterator.getIteratorVar(); if (iterator.isUnique()) { - std::cout << "casting \n"; Expr increment = iterator.isFull() ? 1 : ir::Cast::make(Eq::make(iterator.getCoordVar(), @@ -3509,7 +3503,6 @@ Expr LowererImplImperative::generateAssembleGuard(IndexExpr expr) { } void visit(const CastNode* node) { - std::cout << "lowering to cast node\n"; expr = lower(node->a); }