From 2edd480ba9eed293e0d9c9eab8ee1bb3792286ea Mon Sep 17 00:00:00 2001 From: Adhhitha Dias Date: Mon, 28 Jun 2021 16:30:46 -0400 Subject: [PATCH 1/8] initial commit of ispc codegen files --- src/codegen/codegen_ispc.cpp | 606 +++++++++++++++++++++++++++++++++++ src/codegen/codegen_ispc.h | 63 ++++ 2 files changed, 669 insertions(+) create mode 100644 src/codegen/codegen_ispc.cpp create mode 100644 src/codegen/codegen_ispc.h diff --git a/src/codegen/codegen_ispc.cpp b/src/codegen/codegen_ispc.cpp new file mode 100644 index 000000000..4b0e82903 --- /dev/null +++ b/src/codegen/codegen_ispc.cpp @@ -0,0 +1,606 @@ +#include +#include +#include +#include +#include +#include + +#include "taco/ir/ir_visitor.h" +#include "codegen_ispc.h" +#include "taco/error.h" +#include "taco/util/strings.h" +#include "taco/util/collections.h" + +using namespace std; + +namespace taco { +namespace ir { + +// Some helper functions +namespace { + +// Include stdio.h for printf +// stdlib.h for malloc/realloc +// math.h for sqrt +// MIN preprocessor macro +// This *must* be kept in sync with taco_tensor_t.h +const string cHeaders = + "#ifndef TACO_C_HEADERS\n" + "#define TACO_C_HEADERS\n" + "#include \n" + "#include \n" + "#include \n" + "#include \n" + "#include \n" + "#include \n" + "#include \n" + "#if _OPENMP\n" + "#include \n" + "#endif\n" + "#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b))\n" + "#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b))\n" + "#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a)\n" + "#ifndef TACO_TENSOR_T_DEFINED\n" + "#define TACO_TENSOR_T_DEFINED\n" + "typedef enum { taco_mode_dense, taco_mode_sparse } taco_mode_t;\n" + "typedef struct {\n" + " int32_t order; // tensor order (number of modes)\n" + " int32_t* dimensions; // tensor dimensions\n" + " int32_t csize; // component size\n" + " int32_t* mode_ordering; // mode storage ordering\n" + " taco_mode_t* mode_types; // mode storage types\n" + " uint8_t*** indices; // tensor index data (per mode)\n" + " uint8_t* vals; // tensor values\n" + " int32_t vals_size; // values array size\n" + "} taco_tensor_t;\n" + "#endif\n" + "#if !_OPENMP\n" + "int omp_get_thread_num() { return 0; }\n" + "int omp_get_max_threads() { return 1; }\n" + "#endif\n" + "int cmp(const void *a, const void *b) {\n" + " return *((const int*)a) - *((const int*)b);\n" + "}\n" + "int taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) {\n" + " if (array[arrayStart] >= target) {\n" + " return arrayStart;\n" + " }\n" + " int lowerBound = arrayStart; // always < target\n" + " int upperBound = arrayEnd; // always >= target\n" + " while (upperBound - lowerBound > 1) {\n" + " int mid = (upperBound + lowerBound) / 2;\n" + " int midValue = array[mid];\n" + " if (midValue < target) {\n" + " lowerBound = mid;\n" + " }\n" + " else if (midValue > target) {\n" + " upperBound = mid;\n" + " }\n" + " else {\n" + " return mid;\n" + " }\n" + " }\n" + " return upperBound;\n" + "}\n" + "int taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) {\n" + " if (array[arrayEnd] <= target) {\n" + " return arrayEnd;\n" + " }\n" + " int lowerBound = arrayStart; // always <= target\n" + " int upperBound = arrayEnd; // always > target\n" + " while (upperBound - lowerBound > 1) {\n" + " int mid = (upperBound + lowerBound) / 2;\n" + " int midValue = array[mid];\n" + " if (midValue < target) {\n" + " lowerBound = mid;\n" + " }\n" + " else if (midValue > target) {\n" + " upperBound = mid;\n" + " }\n" + " else {\n" + " return mid;\n" + " }\n" + " }\n" + " return lowerBound;\n" + "}\n" + "taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize,\n" + " int32_t* dimensions, int32_t* mode_ordering,\n" + " taco_mode_t* mode_types) {\n" + " taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t));\n" + " t->order = order;\n" + " t->dimensions = (int32_t *) malloc(order * sizeof(int32_t));\n" + " t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t));\n" + " t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t));\n" + " t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***));\n" + " t->csize = csize;\n" + " for (int32_t i = 0; i < order; i++) {\n" + " t->dimensions[i] = dimensions[i];\n" + " t->mode_ordering[i] = mode_ordering[i];\n" + " t->mode_types[i] = mode_types[i];\n" + " switch (t->mode_types[i]) {\n" + " case taco_mode_dense:\n" + " t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **));\n" + " break;\n" + " case taco_mode_sparse:\n" + " t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **));\n" + " break;\n" + " }\n" + " }\n" + " return t;\n" + "}\n" + "void deinit_taco_tensor_t(taco_tensor_t* t) {\n" + " for (int i = 0; i < t->order; i++) {\n" + " free(t->indices[i]);\n" + " }\n" + " free(t->indices);\n" + " free(t->dimensions);\n" + " free(t->mode_ordering);\n" + " free(t->mode_types);\n" + " free(t);\n" + "}\n" + "#endif\n"; +} // anonymous namespace + +// find variables for generating declarations +// generates a single var for each GetProperty +class CodeGen_ISPC::FindVars : public IRVisitor { +public: + map varMap; + + // the variables for which we need to add declarations + map varDecls; + + vector localVars; + + // this maps from tensor, property, mode, index to the unique var + map, string> canonicalPropertyVar; + + // this is for convenience, recording just the properties unpacked + // from the output tensor so we can re-save them at the end + map, string> outputProperties; + + // TODO: should replace this with an unordered set + vector outputTensors; + vector inputTensors; + + CodeGen_ISPC *codeGen; + + // copy inputs and outputs into the map + FindVars(vector inputs, vector outputs, CodeGen_ISPC *codeGen) + : codeGen(codeGen) { + for (auto v: inputs) { + auto var = v.as(); + taco_iassert(var) << "Inputs must be vars in codegen"; + taco_iassert(varMap.count(var)==0) << "Duplicate input found in codegen"; + inputTensors.push_back(v); + varMap[var] = var->name; + } + for (auto v: outputs) { + auto var = v.as(); + taco_iassert(var) << "Outputs must be vars in codegen"; + taco_iassert(varMap.count(var)==0) << "Duplicate output found in codegen"; + outputTensors.push_back(v); + varMap[var] = var->name; + } + } + +protected: + using IRVisitor::visit; + + virtual void visit(const Var *op) { + if (varMap.count(op) == 0) { + varMap[op] = op->is_ptr? op->name : codeGen->genUniqueName(op->name); + } + } + + virtual void visit(const VarDecl *op) { + if (!util::contains(localVars, op->var)) { + localVars.push_back(op->var); + } + op->var.accept(this); + op->rhs.accept(this); + } + + virtual void visit(const For *op) { + if (!util::contains(localVars, op->var)) { + localVars.push_back(op->var); + } + op->var.accept(this); + op->start.accept(this); + op->end.accept(this); + op->increment.accept(this); + op->contents.accept(this); + } + + virtual void visit(const GetProperty *op) { + if (!util::contains(inputTensors, op->tensor) && + !util::contains(outputTensors, op->tensor)) { + // Don't create header unpacking code for temporaries + return; + } + + if (varMap.count(op) == 0) { + auto key = + tuple(op->tensor,op->property, + (size_t)op->mode, + (size_t)op->index); + if (canonicalPropertyVar.count(key) > 0) { + varMap[op] = canonicalPropertyVar[key]; + } else { + auto unique_name = codeGen->genUniqueName(op->name); + canonicalPropertyVar[key] = unique_name; + varMap[op] = unique_name; + varDecls[op] = unique_name; + if (util::contains(outputTensors, op->tensor)) { + outputProperties[key] = unique_name; + } + } + } + } +}; + +CodeGen_ISPC::CodeGen_ISPC(std::ostream &dest, OutputKind outputKind, bool simplify) + : CodeGen(dest, false, simplify, C), out(dest), outputKind(outputKind) {} + +CodeGen_ISPC::~CodeGen_ISPC() {} + +void CodeGen_ISPC::compile(Stmt stmt, bool isFirst) { + varMap = {}; + localVars = {}; + + if (isFirst) { + // output the headers + out << cHeaders; + } + out << endl; + // generate code for the Stmt + stmt.accept(this); +} + +void CodeGen_ISPC::visit(const Function* func) { + // if generating a header, protect the function declaration with a guard + if (outputKind == HeaderGen) { + out << "#ifndef TACO_GENERATED_" << func->name << "\n"; + out << "#define TACO_GENERATED_" << func->name << "\n"; + } + + int numYields = countYields(func); + emittingCoroutine = (numYields > 0); + funcName = func->name; + labelCount = 0; + + resetUniqueNameCounters(); + FindVars inputVarFinder(func->inputs, {}, this); + func->body.accept(&inputVarFinder); + FindVars outputVarFinder({}, func->outputs, this); + func->body.accept(&outputVarFinder); + + // output function declaration + doIndent(); + out << printFuncName(func, inputVarFinder.varDecls, outputVarFinder.varDecls); + + // if we're just generating a header, this is all we need to do + if (outputKind == HeaderGen) { + out << ";\n"; + out << "#endif\n"; + return; + } + + out << " {\n"; + + indent++; + + // find all the vars that are not inputs or outputs and declare them + resetUniqueNameCounters(); + FindVars varFinder(func->inputs, func->outputs, this); + func->body.accept(&varFinder); + varMap = varFinder.varMap; + localVars = varFinder.localVars; + + // Print variable declarations + out << printDecls(varFinder.varDecls, func->inputs, func->outputs) << endl; + + if (emittingCoroutine) { + out << printContextDeclAndInit(varMap, localVars, numYields, func->name) + << endl; + } + + // output body + print(func->body); + + // output repack only if we allocated memory + if (checkForAlloc(func)) + out << endl << printPack(varFinder.outputProperties, func->outputs); + + if (emittingCoroutine) { + out << printCoroutineFinish(numYields, funcName); + } + + doIndent(); + out << "return 0;\n"; + indent--; + + doIndent(); + out << "}\n"; +} + +void CodeGen_ISPC::visit(const VarDecl* op) { + if (emittingCoroutine) { + doIndent(); + op->var.accept(this); + parentPrecedence = Precedence::TOP; + stream << " = "; + op->rhs.accept(this); + stream << ";"; + stream << endl; + } else { + IRPrinter::visit(op); + } +} + +void CodeGen_ISPC::visit(const Yield* op) { + printYield(op, localVars, varMap, labelCount, funcName); +} + +// For Vars, we replace their names with the generated name, +// since we match by reference (not name) +void CodeGen_ISPC::visit(const Var* op) { + taco_iassert(varMap.count(op) > 0) << + "Var " << op->name << " not found in varMap"; + if (emittingCoroutine) { +// out << "TACO_DEREF("; + } + out << varMap[op]; + if (emittingCoroutine) { +// out << ")"; + } +} + +static string genVectorizePragma(int width) { + stringstream ret; + ret << "#pragma clang loop interleave(enable) "; + if (!width) + ret << "vectorize(enable)"; + else + ret << "vectorize_width(" << width << ")"; + + return ret.str(); +} + +static string getParallelizePragma(LoopKind kind) { + stringstream ret; + ret << "#pragma omp parallel for schedule"; + switch (kind) { + case LoopKind::Static: + ret << "(static, 1)"; + break; + case LoopKind::Dynamic: + ret << "(dynamic, 1)"; + break; + case LoopKind::Runtime: + ret << "(runtime)"; + break; + case LoopKind::Static_Chunked: + ret << "(static)"; + break; + default: + break; + } + return ret.str(); +} + +static string getUnrollPragma(size_t unrollFactor) { + return "#pragma unroll " + std::to_string(unrollFactor); +} + +static string getAtomicPragma() { + return "#pragma omp atomic"; +} + +// The next two need to output the correct pragmas depending +// on the loop kind (Serial, Static, Dynamic, Vectorized) +// +// Docs for vectorization pragmas: +// http://clang.llvm.org/docs/LanguageExtensions.html#extensions-for-loop-hint-optimizations +void CodeGen_ISPC::visit(const For* op) { + switch (op->kind) { + case LoopKind::Vectorized: + doIndent(); + out << genVectorizePragma(op->vec_width); + out << "\n"; + break; + case LoopKind::Static: + case LoopKind::Dynamic: + case LoopKind::Runtime: + case LoopKind::Static_Chunked: + doIndent(); + out << getParallelizePragma(op->kind); + out << "\n"; + break; + default: + if (op->unrollFactor > 0) { + doIndent(); + out << getUnrollPragma(op->unrollFactor) << endl; + } + break; + } + + doIndent(); + stream << keywordString("for") << " ("; + if (!emittingCoroutine) { + stream << keywordString(util::toString(op->var.type())) << " "; + } + op->var.accept(this); + stream << " = "; + op->start.accept(this); + stream << keywordString("; "); + op->var.accept(this); + stream << " < "; + parentPrecedence = BOTTOM; + op->end.accept(this); + stream << keywordString("; "); + op->var.accept(this); + + auto lit = op->increment.as(); + if (lit != nullptr && ((lit->type.isInt() && lit->equalsScalar(1)) || + (lit->type.isUInt() && lit->equalsScalar(1)))) { + stream << "++"; + } + else { + stream << " += "; + op->increment.accept(this); + } + stream << ") {\n"; + + op->contents.accept(this); + doIndent(); + stream << "}"; + stream << endl; +} + +void CodeGen_ISPC::visit(const While* op) { + // it's not clear from documentation that clang will vectorize + // while loops + // however, we'll output the pragmas anyway + if (op->kind == LoopKind::Vectorized) { + doIndent(); + out << genVectorizePragma(op->vec_width); + out << "\n"; + } + + IRPrinter::visit(op); +} + +void CodeGen_ISPC::visit(const GetProperty* op) { + taco_iassert(varMap.count(op) > 0) << + "Property " << Expr(op) << " of " << op->tensor << " not found in varMap"; + out << varMap[op]; +} + +void CodeGen_ISPC::visit(const Min* op) { + if (op->operands.size() == 1) { + op->operands[0].accept(this); + return; + } + for (size_t i=0; ioperands.size()-1; i++) { + stream << "TACO_MIN("; + op->operands[i].accept(this); + stream << ","; + } + op->operands.back().accept(this); + for (size_t i=0; ioperands.size()-1; i++) { + stream << ")"; + } +} + +void CodeGen_ISPC::visit(const Max* op) { + if (op->operands.size() == 1) { + op->operands[0].accept(this); + return; + } + for (size_t i=0; ioperands.size()-1; i++) { + stream << "TACO_MAX("; + op->operands[i].accept(this); + stream << ","; + } + op->operands.back().accept(this); + for (size_t i=0; ioperands.size()-1; i++) { + stream << ")"; + } +} + +void CodeGen_ISPC::visit(const Allocate* op) { + string elementType = printCType(op->var.type(), false); + + doIndent(); + op->var.accept(this); + stream << " = ("; + stream << elementType << "*"; + stream << ")"; + if (op->is_realloc) { + stream << "realloc("; + op->var.accept(this); + stream << ", "; + } + else { + // If the allocation was requested to clear the allocated memory, + // use calloc instead of malloc. + if (op->clear) { + stream << "calloc(1, "; + } else { + stream << "malloc("; + } + } + stream << "sizeof(" << elementType << ")"; + stream << " * "; + parentPrecedence = MUL; + op->num_elements.accept(this); + parentPrecedence = TOP; + stream << ");"; + stream << endl; +} + +void CodeGen_ISPC::visit(const Sqrt* op) { + taco_tassert(op->type.isFloat() && op->type.getNumBits() == 64) << + "Codegen doesn't currently support non-double sqrt"; + stream << "sqrt("; + op->a.accept(this); + stream << ")"; +} + +void CodeGen_ISPC::visit(const Assign* op) { + if (op->use_atomics) { + doIndent(); + stream << getAtomicPragma() << endl; + } + IRPrinter::visit(op); +} + +void CodeGen_ISPC::visit(const Store* op) { + if (op->use_atomics) { + doIndent(); + stream << getAtomicPragma() << endl; + } + IRPrinter::visit(op); +} + +void CodeGen_ISPC::generateShim(const Stmt& func, stringstream &ret) { + const Function *funcPtr = func.as(); + + ret << "int _shim_" << funcPtr->name << "(void** parameterPack) {\n"; + ret << " return " << funcPtr->name << "("; + + size_t i=0; + string delimiter = ""; + + const auto returnType = funcPtr->getReturnType(); + if (returnType.second != Datatype()) { + ret << "(void**)(parameterPack[0]), "; + ret << "(char*)(parameterPack[1]), "; + ret << "(" << returnType.second << "*)(parameterPack[2]), "; + ret << "(int32_t*)(parameterPack[3])"; + + i = 4; + delimiter = ", "; + } + + for (auto output : funcPtr->outputs) { + auto var = output.as(); + auto cast_type = var->is_tensor ? "taco_tensor_t*" + : printCType(var->type, var->is_ptr); + + ret << delimiter << "(" << cast_type << ")(parameterPack[" << i++ << "])"; + delimiter = ", "; + } + for (auto input : funcPtr->inputs) { + auto var = input.as(); + auto cast_type = var->is_tensor ? "taco_tensor_t*" + : printCType(var->type, var->is_ptr); + ret << delimiter << "(" << cast_type << ")(parameterPack[" << i++ << "])"; + delimiter = ", "; + } + ret << ");\n"; + ret << "}\n"; +} +} +} diff --git a/src/codegen/codegen_ispc.h b/src/codegen/codegen_ispc.h new file mode 100644 index 000000000..e3c87ece5 --- /dev/null +++ b/src/codegen/codegen_ispc.h @@ -0,0 +1,63 @@ +#ifndef TACO_BACKEND_C_H +#define TACO_BACKEND_C_H +#include +#include + +#include "taco/ir/ir.h" +#include "taco/ir/ir_printer.h" +#include "codegen.h" + +namespace taco { +namespace ir { + + +class CodeGen_ISPC : public CodeGen { +public: + /// Initialize a code generator that generates code to an + /// output stream. + CodeGen_ISPC(std::ostream &dest, OutputKind outputKind, bool simplify=true); + ~CodeGen_ISPC(); + + /// Compile a lowered function + void compile(Stmt stmt, bool isFirst=false); + + /// Generate shims that unpack an array of pointers representing + /// a mix of taco_tensor_t* and scalars into a function call + static void generateShim(const Stmt& func, std::stringstream &stream); + +protected: + using IRPrinter::visit; + + void visit(const Function*); + void visit(const VarDecl*); + void visit(const Yield*); + void visit(const Var*); + void visit(const For*); + void visit(const While*); + void visit(const GetProperty*); + void visit(const Min*); + void visit(const Max*); + void visit(const Allocate*); + void visit(const Sqrt*); + void visit(const Store*); + void visit(const Assign*); + + std::map varMap; + std::vector localVars; + std::ostream &out; + + OutputKind outputKind; + + std::string funcName; + int labelCount; + bool emittingCoroutine; + + class FindVars; + +private: + virtual std::string restrictKeyword() const { return "restrict"; } +}; + +} // namespace ir +} // namespace taco +#endif From 7d4b8b66415709d996061a6311ea2d6fdba78cf5 Mon Sep 17 00:00:00 2001 From: Adhhitha Dias Date: Mon, 28 Jun 2021 17:36:53 -0400 Subject: [PATCH 2/8] minimal changes to support ispc exec --- .gitignore | 3 ++ CMakeLists.txt | 7 ++++ include/taco/cuda.h | 10 +++++ include/taco/version.h.in | 1 + src/codegen/codegen.cpp | 4 ++ src/codegen/codegen_ispc.h | 4 +- src/codegen/module.cpp | 7 ++++ src/cuda.cpp | 11 ++++++ test/tests-scheduling-eval.cpp | 70 +++++++++++++++++++++++++++++++++- tools/taco.cpp | 19 +++++++++ 10 files changed, 132 insertions(+), 4 deletions(-) diff --git a/.gitignore b/.gitignore index 16389f34e..9abc3adc7 100644 --- a/.gitignore +++ b/.gitignore @@ -12,3 +12,6 @@ CMakeCache.txt doc apps/tensor_times_vector/tensor_times_vector + +.cache +compile_commands.json diff --git a/CMakeLists.txt b/CMakeLists.txt index a6a80d9d1..7e9359e01 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -10,10 +10,12 @@ project(taco LANGUAGES C CXX ) option(CUDA "Build for NVIDIA GPU (CUDA must be preinstalled)" OFF) +option(ISPC "Build for Intel ISPC Compiler (ISPC Compiler must be preinstalled)" OFF) option(PYTHON "Build TACO for python environment" OFF) option(OPENMP "Build with OpenMP execution support" OFF) option(COVERAGE "Build with code coverage analysis" OFF) set(TACO_FEATURE_CUDA 0) +set(TACO_FEATURE_ISPC 0) set(TACO_FEATURE_OPENMP 0) set(TACO_FEATURE_PYTHON 0) if(CUDA) @@ -22,6 +24,11 @@ if(CUDA) add_definitions(-DCUDA_BUILT) set(TACO_FEATURE_CUDA 1) endif(CUDA) +if(ISPC) + message("-- Searching for ISPC Installation") + add_definitions(-DISPC_BUILT) + set(TACO_FEATURE_ISPC 1) +endif(ISPC) if(OPENMP) message("-- Will use OpenMP for parallel execution") add_definitions(-DUSE_OPENMP) diff --git a/include/taco/cuda.h b/include/taco/cuda.h index aad6b5229..7ed545c6d 100644 --- a/include/taco/cuda.h +++ b/include/taco/cuda.h @@ -9,7 +9,17 @@ #define CUDA_BUILT false #endif +#ifndef ISPC_BUILT + #define ISPC_BUILT false +#endif + namespace taco { + +/// Functions used by taco to interface with ISPC +bool should_use_ISPC_codegen(); +void set_ISPC_codegen_enabled(bool enabled); + + /// Functions used by taco to interface with CUDA (especially unified memory) /// Check if should use CUDA codegen bool should_use_CUDA_codegen(); diff --git a/include/taco/version.h.in b/include/taco/version.h.in index bc5559d7d..8ef507598 100644 --- a/include/taco/version.h.in +++ b/include/taco/version.h.in @@ -20,5 +20,6 @@ #define TACO_FEATURE_OPENMP @TACO_FEATURE_OPENMP@ #define TACO_FEATURE_PYTHON @TACO_FEATURE_PYTHON@ #define TACO_FEATURE_CUDA @TACO_FEATURE_CUDA@ +#define TACO_FEATURE_ISPC @TACO_FEATURE_ISPC@ #endif /* TACO_VERSION_H */ diff --git a/src/codegen/codegen.cpp b/src/codegen/codegen.cpp index f0c09d98a..f57f9950f 100644 --- a/src/codegen/codegen.cpp +++ b/src/codegen/codegen.cpp @@ -2,6 +2,7 @@ #include "taco/cuda.h" #include "codegen_cuda.h" #include "codegen_c.h" +#include "codegen_ispc.h" #include #include @@ -26,6 +27,9 @@ shared_ptr CodeGen::init_default(std::ostream &dest, OutputKind outputK if (should_use_CUDA_codegen()) { return make_shared(dest, outputKind); } + else if (should_use_ISPC_codegen()) { + return make_shared(dest, outputKind); + } else { return make_shared(dest, outputKind); } diff --git a/src/codegen/codegen_ispc.h b/src/codegen/codegen_ispc.h index e3c87ece5..35da5a01b 100644 --- a/src/codegen/codegen_ispc.h +++ b/src/codegen/codegen_ispc.h @@ -1,5 +1,5 @@ -#ifndef TACO_BACKEND_C_H -#define TACO_BACKEND_C_H +#ifndef TACO_BACKEND_ISPC_H +#define TACO_BACKEND_ISPC_H #include #include diff --git a/src/codegen/module.cpp b/src/codegen/module.cpp index bd0f487b1..409ed4a83 100644 --- a/src/codegen/module.cpp +++ b/src/codegen/module.cpp @@ -13,6 +13,7 @@ #include "taco/util/strings.h" #include "taco/util/env.h" #include "codegen/codegen_c.h" +#include "codegen/codegen_ispc.h" #include "codegen/codegen_cuda.h" #include "taco/cuda.h" @@ -89,6 +90,9 @@ void writeShims(vector funcs, string path, string prefix) { if (should_use_CUDA_codegen()) { CodeGen_CUDA::generateShim(func, shims); } + else if (should_use_ISPC_codegen()) { + CodeGen_ISPC::generateShim(func, shims); + } else { CodeGen_C::generateShim(func, shims); } @@ -98,6 +102,9 @@ void writeShims(vector funcs, string path, string prefix) { if (should_use_CUDA_codegen()) { shims_file.open(path+prefix+"_shims.cpp"); } + else if (should_use_ISPC_codegen()) { + shims_file.open(path+prefix+".ispc", ios::app); + } else { shims_file.open(path+prefix+".c", ios::app); } diff --git a/src/cuda.cpp b/src/cuda.cpp index 059c60105..85139f874 100644 --- a/src/cuda.cpp +++ b/src/cuda.cpp @@ -7,6 +7,17 @@ using namespace std; namespace taco { + +static bool ISPC_codegen_enabled = ISPC_BUILT; +bool should_use_ISPC_codegen() { + return ISPC_codegen_enabled; +} + +void set_ISPC_codegen_enabled(bool enabled) { + ISPC_codegen_enabled = enabled; +} + + /// Functions used by taco to interface with CUDA (especially unified memory) static bool CUDA_codegen_enabled = CUDA_BUILT; static bool CUDA_unified_memory_enabled = CUDA_BUILT; diff --git a/test/tests-scheduling-eval.cpp b/test/tests-scheduling-eval.cpp index 52bd74ab4..f59359081 100644 --- a/test/tests-scheduling-eval.cpp +++ b/test/tests-scheduling-eval.cpp @@ -1,5 +1,7 @@ +#include #include #include +#include #include #include #include "test.h" @@ -44,6 +46,14 @@ IndexStmt scheduleSpMVCPU(IndexStmt stmt, int CHUNK_SIZE=16) { .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces); } +IndexStmt scheduleSpMVISPC(IndexStmt stmt, int CHUNK_SIZE=16) { + IndexVar i0("i0"), i1("i1"), kpos("kpos"), kpos0("kpos0"), kpos1("kpos1"); + return stmt; + // return stmt.split(i, i0, i1, CHUNK_SIZE) + // .reorder({i0, i1, j}) + // .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces); +} + IndexStmt scheduleSpMMCPU(IndexStmt stmt, Tensor A, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { IndexVar i0("i0"), i1("i1"), kbounded("kbounded"), k0("k0"), k1("k1"), jpos("jpos"), jpos0("jpos0"), jpos1("jpos1"); return stmt.split(i, i0, i1, CHUNK_SIZE) @@ -1463,7 +1473,63 @@ TEST(scheduling_eval, mttkrpGPU) { ASSERT_TENSOR_EQ(expected, A); } -TEST(generate_evaluation_files, DISABLED_cpu) { + + +TEST(generate_ispc_evaluation_files, ispc) { + std::cout << "Hi Adhitha!\n" << std::endl ; + set_CUDA_codegen_enabled(false); + set_ISPC_codegen_enabled(true); + + vector> spmv_parameters = {{32}}; + vector> spmspv_parameters = {{8}}; + + // 4 to 512 and 4, 8, 16 + vector> spmm_dcsr_parameters = {{16, 8}}; + vector> spmm_parameters = {{16,4}}; + + vector> mttkrp_parameters = {}; + mttkrp_parameters.push_back({64,0}); + + vector> sddmm_parameters = {{8, 8}}; + vector> ttv_parameters = {{32}}; + + int NUM_I = 100; + int NUM_J = 100; + + string file_ending = ".ispc"; + string file_path = "eval_prepared_ispc/"; + mkdir(file_path.c_str(), 0777); + + // spmv + { + stringstream source; + std::shared_ptr codegen = ir::CodeGen::init_default(source, ir::CodeGen::ImplementationGen); + Tensor A("A", {NUM_I, NUM_J}, CSR); + Tensor x("x", {NUM_J}, {Dense}); + Tensor y("y", {NUM_I}, {Dense}); + y(i) = A(i, j) * x(j); + std::cout << "concretizing the assignment statement\n"; + IndexStmt stmt = y.getAssignment().concretize(); + std::cout << "Printing the original IndexStmt: " << stmt << std::endl; + for (auto paramSet : spmv_parameters) { + std::cout << "param set: " << paramSet[0] << std::endl; + IndexStmt scheduled = scheduleSpMVISPC(stmt, paramSet[0]); + std::cout << "scheduled IndexStmt: " << scheduled << std::endl; + ir::Stmt compute = lower(scheduled, "spmv_csr_ispc_taco", false, true); + std::cout << "computed statement: \n" << compute << std::endl; + codegen->compile(compute, false); + } + ofstream source_file; + source_file.open(file_path + "spmv_csr_ispc_taco.h"); + source_file << source.str(); + source_file.close(); + } + + + return; +} + +TEST(generate_evaluation_files, cpu) { if (should_use_CUDA_codegen()) { return; } @@ -1779,7 +1845,7 @@ TEST(generate_evaluation_files, DISABLED_cpu) { } } -TEST(generate_evaluation_files, DISABLED_gpu) { +TEST(generate_evaluation_files, gpu) { if (!should_use_CUDA_codegen()) { return; } diff --git a/tools/taco.cpp b/tools/taco.cpp index cd351a203..ce03b61e1 100644 --- a/tools/taco.cpp +++ b/tools/taco.cpp @@ -20,6 +20,7 @@ #include "taco/lower/lower.h" #include "taco/codegen/module.h" #include "codegen/codegen_c.h" +#include "codegen/codegen_ispc.h" #include "codegen/codegen_cuda.h" #include "codegen/codegen.h" #include "taco/util/strings.h" @@ -188,6 +189,8 @@ static void printUsageInfo() { cout << endl; printFlag("print-nocolor", "Print without colors."); cout << endl; + printFlag("ispc", "Generate ISPC code for Intel CPUs"); + cout << endl; printFlag("cuda", "Generate CUDA code for NVIDIA GPUs"); cout << endl; printFlag("schedule", "Specify parallel execution schedule"); @@ -279,6 +282,8 @@ static void printVersionInfo() { cout << "Built with Python support." << endl; if(TACO_FEATURE_CUDA) cout << "Built with CUDA support." << endl; + if(TACO_FEATURE_ISPC) + cout << "Built with ISPC support." << endl; cout << endl; cout << "Built on: " << TACO_BUILD_DATE << endl; cout << "CMake build type: " << TACO_BUILD_TYPE << endl; @@ -641,6 +646,7 @@ int main(int argc, char* argv[]) { bool color = true; bool readKernels = false; bool cuda = false; + bool ispc = false; bool setSchedule = false; @@ -949,6 +955,10 @@ int main(int argc, char* argv[]) { else if ("-cuda" == argName) { cuda = true; } + else if ("-ispc" == argName) { + std::cout << "ispc true\n"; + ispc = true; + } else if ("-schedule" == argName) { vector descriptor = util::split(argValue, ","); if (descriptor.size() > 2 || descriptor.empty()) { @@ -1129,9 +1139,18 @@ int main(int argc, char* argv[]) { return reportError("TACO must be built for CUDA (cmake -DCUDA=ON ..) to benchmark", 2); } set_CUDA_codegen_enabled(true); + set_ISPC_codegen_enabled(false); + } + else if (ispc) { + if (!ISPC_BUILT && benchmark) { + return reportError("TACO must be built for ISPC (cmake -DISPC=ON .. to benchmark", 2); + } + set_CUDA_codegen_enabled(false); + set_ISPC_codegen_enabled(true); } else { set_CUDA_codegen_enabled(false); + set_ISPC_codegen_enabled(false); } stmt = scalarPromote(stmt); From dd693feb9a56c0ab528fb602e0f30c3d014e3648 Mon Sep 17 00:00:00 2001 From: Adhhitha Dias Date: Mon, 12 Jul 2021 14:10:46 -0400 Subject: [PATCH 3/8] separate ispc code to another stream and smaller conversions to match ispc code --- include/taco/cuda.h | 2 + include/taco/ir/ir.h | 2 +- include/taco/ir/ir_printer.h | 3 + include/taco/util/strings.h | 22 + src/codegen/codegen.cpp | 141 ++++- src/codegen/codegen.h | 15 +- src/codegen/codegen_ispc.cpp | 278 ++++++--- src/codegen/codegen_ispc.h | 3 + src/codegen/module.cpp | 13 + src/cuda.cpp | 8 + src/ir/ir_printer.cpp | 824 +++++++++++++++++++------- src/ir/ir_rewriter.cpp | 1 + src/lower/lowerer_impl_imperative.cpp | 69 ++- src/tensor.cpp | 7 + test/tests-scheduling-eval.cpp | 79 ++- tools/taco.cpp | 2 + 16 files changed, 1127 insertions(+), 342 deletions(-) diff --git a/include/taco/cuda.h b/include/taco/cuda.h index 7ed545c6d..9c4a7aae9 100644 --- a/include/taco/cuda.h +++ b/include/taco/cuda.h @@ -18,6 +18,8 @@ namespace taco { /// Functions used by taco to interface with ISPC bool should_use_ISPC_codegen(); void set_ISPC_codegen_enabled(bool enabled); +bool is_ISPC_code_stream_enabled(); +void set_ISPC_code_stream_enabled(bool enabled); /// Functions used by taco to interface with CUDA (especially unified memory) diff --git a/include/taco/ir/ir.h b/include/taco/ir/ir.h index f852f26b1..cb46b5142 100644 --- a/include/taco/ir/ir.h +++ b/include/taco/ir/ir.h @@ -591,7 +591,7 @@ struct Switch : public StmtNode { static const IRNodeType _type_info = IRNodeType::Switch; }; -enum class LoopKind {Serial, Static, Dynamic, Runtime, Vectorized, Static_Chunked}; +enum class LoopKind {Serial, Static, Dynamic, Runtime, Vectorized, Static_Chunked, Foreach}; /** A for loop from start to end by increment. * A vectorized loop will require the increment to be 1 and the diff --git a/include/taco/ir/ir_printer.h b/include/taco/ir/ir_printer.h index 4e50764e9..c2c505bf5 100644 --- a/include/taco/ir/ir_printer.h +++ b/include/taco/ir/ir_printer.h @@ -16,6 +16,7 @@ class IRPrinter : public IRVisitorStrict { public: IRPrinter(std::ostream& stream); IRPrinter(std::ostream& stream, bool color, bool simplify); + IRPrinter(std::ostream& stream, std::ostream& stream2, bool color, bool simplify); virtual ~IRPrinter(); void setColor(bool color); @@ -72,6 +73,7 @@ class IRPrinter : public IRVisitorStrict { virtual void visit(const Break*); std::ostream &stream; + std::ostream &stream2; int indent; bool color; bool simplify; @@ -109,6 +111,7 @@ class IRPrinter : public IRVisitorStrict { void doIndent(); void printBinOp(Expr a, Expr b, std::string op, Precedence precedence); bool needsParentheses(Precedence precedence); + void sendToStream(std::stringstream &stream); std::string keywordString(std::string); std::string commentString(std::string); diff --git a/include/taco/util/strings.h b/include/taco/util/strings.h index 5dfb2f174..a3c3d863f 100644 --- a/include/taco/util/strings.h +++ b/include/taco/util/strings.h @@ -1,6 +1,7 @@ #ifndef TACO_UTIL_STRINGS_H #define TACO_UTIL_STRINGS_H +#include "taco/cuda.h" #include #include #include @@ -8,6 +9,8 @@ #include #include +#include "taco/type.h" + // To get the value of a compiler macro variable #define STRINGIFY(x) #x #define TO_STRING(x) STRINGIFY(x) @@ -15,6 +18,25 @@ namespace taco { namespace util { +// /// Turn anything except floating points that can be written to a stream +// /// into a string. +// template +// typename std::enable_if::value, std::string>::type +// toStringISPC(const T &val) { + +// std::stringstream sstream; +// if (val == Int32) { +// sstream << "int32"; +// } +// else if (val == Int64) { +// sstream << "int64"; +// } +// else { +// sstream << val; +// } +// return sstream.str(); +// } + /// Turn anything except floating points that can be written to a stream /// into a string. template diff --git a/src/codegen/codegen.cpp b/src/codegen/codegen.cpp index f57f9950f..750f33516 100644 --- a/src/codegen/codegen.cpp +++ b/src/codegen/codegen.cpp @@ -35,6 +35,18 @@ shared_ptr CodeGen::init_default(std::ostream &dest, OutputKind outputK } } +shared_ptr CodeGen::init_default(std::ostream &dest, std::ostream &dest2, OutputKind outputKind) { + if (should_use_CUDA_codegen()) { + return make_shared(dest, outputKind); + } + else if (should_use_ISPC_codegen()) { + return make_shared(dest, dest2, outputKind); + } + else { + return make_shared(dest, outputKind); + } +} + int CodeGen::countYields(const Function *func) { struct CountYields : public IRVisitor { int yields = 0; @@ -233,6 +245,49 @@ string CodeGen::printTensorProperty(string varname, const GetProperty* op, bool return ret.str(); } +string CodeGen::getUnpackedTensorArgument(string varname, const GetProperty* op, + bool is_output_prop) { + stringstream ret; + ret << ""; + + auto tensor = op->tensor.as(); + if (op->property == TensorProperty::Values) { + // for the values, it's in the last slot + ret << "uniform " << printType(tensor->type, false) << " " << varname << "[]"; + return ret.str(); + } else if (op->property == TensorProperty::ValuesSize) { + ret << "int32 " << varname; + return ret.str(); + } + + // for a Dense level, nnz is an int + // for a Fixed level, ptr is an int + // all others are int* + if (op->property == TensorProperty::Dimension) { + if (op->type == Int32) { + ret << "int32 "; + } else if (op->type == Int64) { + ret << "int64 "; + } else { + ret << "int "; + } + ret << varname; + + } else { + taco_iassert(op->property == TensorProperty::Indices); + if (op->type == Int32) { + ret << "uniform int32 "; + } else if (op->type == Int64) { + ret << "uniform int64 "; + } else { + ret << "uniform int "; + } + ret << varname << "[]"; + } + + return ret.str(); +} + string CodeGen::unpackTensorProperty(string varname, const GetProperty* op, bool is_output_prop) { stringstream ret; @@ -314,13 +369,9 @@ string CodeGen::pointTensorProperty(std::string varname) { return ret.str(); } -// helper to print declarations -string CodeGen::printDecls(map varMap, - vector inputs, vector outputs) { - stringstream ret; - unordered_set propsAlreadyGenerated; - - vector sortedProps; +void CodeGen::getSortedProps(map &varMap, + vector &sortedProps, vector &inputs, + vector &outputs) { for (auto const& p: varMap) { if (p.first.as()) @@ -359,6 +410,17 @@ string CodeGen::printDecls(map varMap, return a->index < b->index; }); +} + +// helper to print declarations +string CodeGen::printDecls(map varMap, + vector inputs, vector outputs) { + stringstream ret; + unordered_set propsAlreadyGenerated; + + vector sortedProps; + getSortedProps(varMap, sortedProps, inputs, outputs); + for (auto prop: sortedProps) { bool isOutputProp = (find(outputs.begin(), outputs.end(), prop->tensor) != outputs.end()); @@ -379,6 +441,71 @@ string CodeGen::printDecls(map varMap, return ret.str(); } +string CodeGen::printCallISPCFunc(const Function *func, map varMap, + vector &sortedProps) { + std::stringstream ret; + ret << " "; + unordered_set propsAlreadyGenerated; + + ret << "__" << func->name << "("; + + vector inputs = func->inputs; + vector outputs = func->outputs; + getSortedProps(varMap, sortedProps, inputs, outputs); + + for (unsigned long i=0; i < sortedProps.size(); i++) { + ret << varMap[sortedProps[i]]; + if (i != sortedProps.size()-1) { + ret << ", "; + } + propsAlreadyGenerated.insert(varMap[sortedProps[i]]); + } + + ret << ");\n"; + return ret.str(); +} + +string CodeGen::printISPCFunc(const Function *func, map varMap, + vector &sortedProps) { + std::stringstream ret; + ret << "export void "; + unordered_set propsAlreadyGenerated; + + ret << "__" << func->name << "("; + + vector inputs = func->inputs; + vector outputs = func->outputs; + // getSortedProps(varMap, sortedProps, inputs, outputs); + + for (unsigned long i=0; i < sortedProps.size(); i++) { + auto prop = sortedProps[i]; + bool isOutputProp = (find(outputs.begin(), outputs.end(), + prop->tensor) != outputs.end()); + + auto var = prop->tensor.as(); + if (var->is_parameter) { + if (isOutputProp) { + ret << " " << printTensorProperty(varMap[prop], prop, false) << ";" << endl; + } else { + break; + } + } else { + ret << getUnpackedTensorArgument(varMap[prop], prop, isOutputProp); + } + propsAlreadyGenerated.insert(varMap[prop]); + + if (i!=sortedProps.size()-1) { + ret << ", "; + } + if (i%2==0) { + ret << "\n\t"; + } + } + ret << ") {\n"; + + return ret.str(); +} + string CodeGen::printPack(map, string> outputProperties, vector outputs) { diff --git a/src/codegen/codegen.h b/src/codegen/codegen.h index cc25c80d6..641239834 100644 --- a/src/codegen/codegen.h +++ b/src/codegen/codegen.h @@ -16,9 +16,13 @@ class CodeGen : public IRPrinter { enum CodeGenType { C, CUDA }; CodeGen(std::ostream& stream, CodeGenType type) : IRPrinter(stream), codeGenType(type) {}; - CodeGen(std::ostream& stream, bool color, bool simplify, CodeGenType type) : IRPrinter(stream, color, simplify), codeGenType(type) {}; + CodeGen(std::ostream& stream, bool color, bool simplify, CodeGenType type) + : IRPrinter(stream, color, simplify), codeGenType(type) {}; + CodeGen(std::ostream& stream, std::ostream& stream2, bool color, bool simplify, CodeGenType type) + : IRPrinter(stream, stream2, color, simplify), codeGenType(type) {}; /// Initialize the default code generator static std::shared_ptr init_default(std::ostream &dest, OutputKind outputKind); + static std::shared_ptr init_default(std::ostream &dest, std::ostream &dest2, OutputKind outputKind); /// Compile a lowered function virtual void compile(Stmt stmt, bool isFirst=false) =0; @@ -26,6 +30,9 @@ class CodeGen : public IRPrinter { protected: static bool checkForAlloc(const Function *func); static int countYields(const Function *func); + void getSortedProps(std::map &varMap, + std::vector &sortedProps, std::vector &inputs, + std::vector &outputs); static std::string printCType(Datatype type, bool is_ptr); static std::string printCUDAType(Datatype type, bool is_ptr); @@ -42,6 +49,10 @@ class CodeGen : public IRPrinter { std::string printContextDeclAndInit(std::map varMap, std::vector localVars, int labels, std::string funcName); + std::string printCallISPCFunc(const Function *func, std::map varMap, + std::vector &sortedProps); + std::string printISPCFunc(const Function *func, std::map varMap, + std::vector &sortedProps); std::string printDecls(std::map varMap, std::vector inputs, std::vector outputs); std::string printPack(std::map, @@ -64,6 +75,8 @@ class CodeGen : public IRPrinter { std::string printTensorProperty(std::string varname, const GetProperty* op, bool is_ptr); std::string unpackTensorProperty(std::string varname, const GetProperty* op, bool is_output_prop); + std::string getUnpackedTensorArgument(std::string varname, const GetProperty* op, + bool is_output_prop); std::string packTensorProperty(std::string varname, Expr tnsr, TensorProperty property, int mode, int index); std::string pointTensorProperty(std::string varname); diff --git a/src/codegen/codegen_ispc.cpp b/src/codegen/codegen_ispc.cpp index 4b0e82903..f107728cc 100644 --- a/src/codegen/codegen_ispc.cpp +++ b/src/codegen/codegen_ispc.cpp @@ -5,6 +5,7 @@ #include #include +#include "taco/cuda.h" #include "taco/ir/ir_visitor.h" #include "codegen_ispc.h" #include "taco/error.h" @@ -240,7 +241,10 @@ class CodeGen_ISPC::FindVars : public IRVisitor { }; CodeGen_ISPC::CodeGen_ISPC(std::ostream &dest, OutputKind outputKind, bool simplify) - : CodeGen(dest, false, simplify, C), out(dest), outputKind(outputKind) {} + : CodeGen(dest, false, simplify, C), out(dest), out2(dest), outputKind(outputKind) {} + +CodeGen_ISPC::CodeGen_ISPC(std::ostream &dest, std::ostream &dest2, OutputKind outputKind, bool simplify) + : CodeGen(dest, dest2, false, simplify, C), out(dest), out2(dest2), outputKind(outputKind) {} CodeGen_ISPC::~CodeGen_ISPC() {} @@ -254,9 +258,19 @@ void CodeGen_ISPC::compile(Stmt stmt, bool isFirst) { } out << endl; // generate code for the Stmt + std::cout << "Compiling the code\n"; stmt.accept(this); } +void CodeGen_ISPC::sendToStream(std::stringstream &stream) { + if (is_ISPC_code_stream_enabled()) { + this->out2 << stream.str(); + } + else { + this->out << stream.str(); + } +} + void CodeGen_ISPC::visit(const Function* func) { // if generating a header, protect the function declaration with a guard if (outputKind == HeaderGen) { @@ -300,14 +314,14 @@ void CodeGen_ISPC::visit(const Function* func) { // Print variable declarations out << printDecls(varFinder.varDecls, func->inputs, func->outputs) << endl; + vector sortedProps; + out << printCallISPCFunc(func, varFinder.varDecls, sortedProps); + if (emittingCoroutine) { out << printContextDeclAndInit(varMap, localVars, numYields, func->name) << endl; } - // output body - print(func->body); - // output repack only if we allocated memory if (checkForAlloc(func)) out << endl << printPack(varFinder.outputProperties, func->outputs); @@ -321,21 +335,50 @@ void CodeGen_ISPC::visit(const Function* func) { indent--; doIndent(); - out << "}\n"; + out << "}\n\n"; + + set_ISPC_code_stream_enabled(true); + out2 << printISPCFunc(func, varFinder.varDecls, sortedProps); + indent++; + doIndent(); + // output body + print(func->body); + indent--; + out2 << "}\n"; + set_ISPC_code_stream_enabled(false); + } void CodeGen_ISPC::visit(const VarDecl* op) { - if (emittingCoroutine) { - doIndent(); - op->var.accept(this); - parentPrecedence = Precedence::TOP; - stream << " = "; - op->rhs.accept(this); - stream << ";"; - stream << endl; - } else { - IRPrinter::visit(op); + // std::stringstream stream; + if (is_ISPC_code_stream_enabled()) { + if (emittingCoroutine) { + doIndent(); + op->var.accept(this); + parentPrecedence = Precedence::TOP; + stream2 << " = "; + op->rhs.accept(this); + stream2 << ";"; + stream2 << endl; + } else { + IRPrinter::visit(op); + } } + else { + if (emittingCoroutine) { + doIndent(); + op->var.accept(this); + parentPrecedence = Precedence::TOP; + stream << " = "; + op->rhs.accept(this); + stream << ";"; + stream << endl; + } else { + IRPrinter::visit(op); + } + } + + // sendToStream(stream); } void CodeGen_ISPC::visit(const Yield* op) { @@ -345,14 +388,27 @@ void CodeGen_ISPC::visit(const Yield* op) { // For Vars, we replace their names with the generated name, // since we match by reference (not name) void CodeGen_ISPC::visit(const Var* op) { - taco_iassert(varMap.count(op) > 0) << - "Var " << op->name << " not found in varMap"; - if (emittingCoroutine) { -// out << "TACO_DEREF("; + if (is_ISPC_code_stream_enabled()) { + taco_iassert(varMap.count(op) > 0) << + "Var " << op->name << " not found in varMap"; + if (emittingCoroutine) { + // out << "TACO_DEREF("; + } + out2 << varMap[op]; + if (emittingCoroutine) { + // out << ")"; + } } - out << varMap[op]; - if (emittingCoroutine) { -// out << ")"; + else { + taco_iassert(varMap.count(op) > 0) << + "Var " << op->name << " not found in varMap"; + if (emittingCoroutine) { + // out << "TACO_DEREF("; + } + out << varMap[op]; + if (emittingCoroutine) { + // out << ")"; + } } } @@ -367,31 +423,31 @@ static string genVectorizePragma(int width) { return ret.str(); } -static string getParallelizePragma(LoopKind kind) { - stringstream ret; - ret << "#pragma omp parallel for schedule"; - switch (kind) { - case LoopKind::Static: - ret << "(static, 1)"; - break; - case LoopKind::Dynamic: - ret << "(dynamic, 1)"; - break; - case LoopKind::Runtime: - ret << "(runtime)"; - break; - case LoopKind::Static_Chunked: - ret << "(static)"; - break; - default: - break; - } - return ret.str(); -} - -static string getUnrollPragma(size_t unrollFactor) { - return "#pragma unroll " + std::to_string(unrollFactor); -} +// static string getParallelizePragma(LoopKind kind) { +// stringstream ret; +// ret << "#pragma omp parallel for schedule"; +// switch (kind) { +// case LoopKind::Static: +// ret << "(static, 1)"; +// break; +// case LoopKind::Dynamic: +// ret << "(dynamic, 1)"; +// break; +// case LoopKind::Runtime: +// ret << "(runtime)"; +// break; +// case LoopKind::Static_Chunked: +// ret << "(static)"; +// break; +// default: +// break; +// } +// return ret.str(); +// } + +// static string getUnrollPragma(size_t unrollFactor) { +// return "#pragma unroll " + std::to_string(unrollFactor); +// } static string getAtomicPragma() { return "#pragma omp atomic"; @@ -404,58 +460,75 @@ static string getAtomicPragma() { // http://clang.llvm.org/docs/LanguageExtensions.html#extensions-for-loop-hint-optimizations void CodeGen_ISPC::visit(const For* op) { switch (op->kind) { + // TODO - add ISPC based multi threaded execution handling case LoopKind::Vectorized: - doIndent(); - out << genVectorizePragma(op->vec_width); - out << "\n"; - break; case LoopKind::Static: case LoopKind::Dynamic: case LoopKind::Runtime: case LoopKind::Static_Chunked: - doIndent(); - out << getParallelizePragma(op->kind); - out << "\n"; - break; default: - if (op->unrollFactor > 0) { - doIndent(); - out << getUnrollPragma(op->unrollFactor) << endl; - } break; } doIndent(); - stream << keywordString("for") << " ("; - if (!emittingCoroutine) { - stream << keywordString(util::toString(op->var.type())) << " "; - } - op->var.accept(this); - stream << " = "; - op->start.accept(this); - stream << keywordString("; "); - op->var.accept(this); - stream << " < "; - parentPrecedence = BOTTOM; - op->end.accept(this); - stream << keywordString("; "); - op->var.accept(this); - auto lit = op->increment.as(); - if (lit != nullptr && ((lit->type.isInt() && lit->equalsScalar(1)) || - (lit->type.isUInt() && lit->equalsScalar(1)))) { - stream << "++"; - } - else { - stream << " += "; - op->increment.accept(this); + if (op->kind == LoopKind::Foreach) { + stream2 << keywordString("foreach") << " ("; + // if (!emittingCoroutine) { + // if (op->var.type() == Int32) { + // stream << "int32 "; + // } + // else if (op->var.type() == Int64) { + // stream << "int64 "; + // } + + // } + op->var.accept(this); + stream2 << " = "; + op->start.accept(this); + stream2 << keywordString(" ... "); + op->end.accept(this); + stream2 << ") {\n"; + + } else { + stream2 << keywordString("for") << " ("; + if (!emittingCoroutine) { + if (op->var.type() == Int32) { + stream2 << "int32 "; + } + else if (op->var.type() == Int64) { + stream2 << "int64 "; + } + + } + op->var.accept(this); + stream2 << " = "; + op->start.accept(this); + stream2 << keywordString("; "); + op->var.accept(this); + stream2 << " < "; + parentPrecedence = BOTTOM; + op->end.accept(this); + stream2 << keywordString("; "); + op->var.accept(this); + + auto lit = op->increment.as(); + if (lit != nullptr && ((lit->type.isInt() && lit->equalsScalar(1)) || + (lit->type.isUInt() && lit->equalsScalar(1)))) { + stream2 << "++"; + } + else { + stream2 << " += "; + op->increment.accept(this); + } + stream2 << ") {\n"; } - stream << ") {\n"; op->contents.accept(this); doIndent(); - stream << "}"; - stream << endl; + stream2 << "}"; + stream2 << endl; + } void CodeGen_ISPC::visit(const While* op) { @@ -474,7 +547,13 @@ void CodeGen_ISPC::visit(const While* op) { void CodeGen_ISPC::visit(const GetProperty* op) { taco_iassert(varMap.count(op) > 0) << "Property " << Expr(op) << " of " << op->tensor << " not found in varMap"; - out << varMap[op]; + if (is_ISPC_code_stream_enabled()) { + out2 << varMap[op]; + } + else { + out << varMap[op]; + } + } void CodeGen_ISPC::visit(const Min* op) { @@ -549,17 +628,34 @@ void CodeGen_ISPC::visit(const Sqrt* op) { } void CodeGen_ISPC::visit(const Assign* op) { - if (op->use_atomics) { - doIndent(); - stream << getAtomicPragma() << endl; + if (is_ISPC_code_stream_enabled()) { + if (op->use_atomics) { + doIndent(); + stream2 << getAtomicPragma() << endl; + } + } + else { + if (op->use_atomics) { + doIndent(); + stream << getAtomicPragma() << endl; + } } + IRPrinter::visit(op); } void CodeGen_ISPC::visit(const Store* op) { - if (op->use_atomics) { - doIndent(); - stream << getAtomicPragma() << endl; + if (is_ISPC_code_stream_enabled()) { + if (op->use_atomics) { + doIndent(); + stream2 << getAtomicPragma() << endl; + } + } + else { + if (op->use_atomics) { + doIndent(); + stream << getAtomicPragma() << endl; + } } IRPrinter::visit(op); } diff --git a/src/codegen/codegen_ispc.h b/src/codegen/codegen_ispc.h index 35da5a01b..8abd1cc09 100644 --- a/src/codegen/codegen_ispc.h +++ b/src/codegen/codegen_ispc.h @@ -16,6 +16,7 @@ class CodeGen_ISPC : public CodeGen { /// Initialize a code generator that generates code to an /// output stream. CodeGen_ISPC(std::ostream &dest, OutputKind outputKind, bool simplify=true); + CodeGen_ISPC(std::ostream &dest, std::ostream &dest2, OutputKind outputKind, bool simplify=true); ~CodeGen_ISPC(); /// Compile a lowered function @@ -45,6 +46,7 @@ class CodeGen_ISPC : public CodeGen { std::map varMap; std::vector localVars; std::ostream &out; + std::ostream &out2; OutputKind outputKind; @@ -56,6 +58,7 @@ class CodeGen_ISPC : public CodeGen { private: virtual std::string restrictKeyword() const { return "restrict"; } + void sendToStream(std::stringstream &stream); }; } // namespace ir diff --git a/src/codegen/module.cpp b/src/codegen/module.cpp index 409ed4a83..d9cbe2edc 100644 --- a/src/codegen/module.cpp +++ b/src/codegen/module.cpp @@ -116,6 +116,7 @@ void writeShims(vector funcs, string path, string prefix) { } // anonymous namespace string Module::compile() { + std::cout << "Module::compile\n"; string prefix = tmpdir+libname; string fullpath = prefix + ".so"; @@ -130,6 +131,12 @@ string Module::compile() { file_ending = ".cu"; shims_file = prefix + "_shims.cpp"; } + else if (should_use_ISPC_codegen()) { + cc = util::getFromEnv(target.compiler_env, target.compiler); + cflags = util::getFromEnv("TACO_CFLAGS", + "-O3 -ffast-math -std=c99") + " -shared -fPIC"; + + } else { cc = util::getFromEnv(target.compiler_env, target.compiler); cflags = util::getFromEnv("TACO_CFLAGS", @@ -150,6 +157,12 @@ string Module::compile() { // write out the shims writeShims(funcs, tmpdir, libname); + for (auto &statement : funcs) { + std::cout << "----- statement --------" << std::endl; + std::cout << statement; + std::cout << std::endl; + } + std::cout << tmpdir << std::endl << libname << std::endl; // now compile it int err = system(cmd.data()); diff --git a/src/cuda.cpp b/src/cuda.cpp index 85139f874..68e49fe98 100644 --- a/src/cuda.cpp +++ b/src/cuda.cpp @@ -9,14 +9,22 @@ using namespace std; namespace taco { static bool ISPC_codegen_enabled = ISPC_BUILT; +static bool ISPC_code_stream_enabled = false; bool should_use_ISPC_codegen() { return ISPC_codegen_enabled; } +bool is_ISPC_code_stream_enabled() { + return ISPC_code_stream_enabled; +} + void set_ISPC_codegen_enabled(bool enabled) { ISPC_codegen_enabled = enabled; } +void set_ISPC_code_stream_enabled(bool enabled) { + ISPC_code_stream_enabled = enabled; +} /// Functions used by taco to interface with CUDA (especially unified memory) static bool CUDA_codegen_enabled = CUDA_BUILT; diff --git a/src/ir/ir_printer.cpp b/src/ir/ir_printer.cpp index a1997a9b7..f96251c5a 100644 --- a/src/ir/ir_printer.cpp +++ b/src/ir/ir_printer.cpp @@ -1,6 +1,7 @@ #include #include +#include "taco/cuda.h" #include "taco/ir/ir.h" #include "taco/ir/ir_printer.h" #include "taco/ir/simplify.h" @@ -34,7 +35,11 @@ IRPrinter::IRPrinter(ostream &s) : IRPrinter(s, false, false) { } IRPrinter::IRPrinter(ostream &s, bool color, bool simplify) - : stream(s), indent(0), color(color), simplify(simplify) { + : stream(s), stream2(s), indent(0), color(color), simplify(simplify) { +} + +IRPrinter::IRPrinter(ostream &s, ostream &s2, bool color, bool simplify) + : stream(s), stream2(s2), indent(0), color(color), simplify(simplify) { } IRPrinter::~IRPrinter() { @@ -59,79 +64,169 @@ void IRPrinter::print(Stmt stmt) { } void IRPrinter::visit(const Literal* op) { - if (color) { - stream << blue ; - } - - switch (op->type.getKind()) { - case Datatype::Bool: - stream << op->getValue(); - break; - case Datatype::UInt8: - stream << static_cast(op->getValue()); - break; - case Datatype::UInt16: - stream << op->getValue(); - break; - case Datatype::UInt32: - stream << op->getValue(); - break; - case Datatype::UInt64: - stream << op->getValue(); - break; - case Datatype::UInt128: - taco_not_supported_yet; - break; - case Datatype::Int8: - stream << static_cast(op->getValue()); - break; - case Datatype::Int16: - stream << op->getValue(); - break; - case Datatype::Int32: - stream << op->getValue(); - break; - case Datatype::Int64: - stream << op->getValue(); - break; - case Datatype::Int128: - taco_not_supported_yet; - break; - case Datatype::Float32: - stream << ((op->getValue() != 0.0) - ? util::toString(op->getValue()) : "0.0"); - break; - case Datatype::Float64: - stream << ((op->getValue()!=0.0) - ? util::toString(op->getValue()) : "0.0"); - break; - case Datatype::Complex64: { - std::complex val = op->getValue>(); - stream << val.real() << " + I*" << val.imag(); - } - break; - case Datatype::Complex128: { - std::complex val = op->getValue>(); - stream << val.real() << " + I*" << val.imag(); - } - break; - case Datatype::Undefined: - taco_ierror << "Undefined type in IR"; - break; - } + if (is_ISPC_code_stream_enabled()) { + if (color) { + stream2 << blue ; + } - if (color) { - stream << nc; + // It seems this is where all the types get printed in the final code generation. + // Come up with a way to generate different values if stream2 is used to generate ispc code + switch (op->type.getKind()) { + case Datatype::Bool: + stream2 << op->getValue(); + break; + case Datatype::UInt8: + stream2 << static_cast(op->getValue()); + break; + case Datatype::UInt16: + stream2 << op->getValue(); + break; + case Datatype::UInt32: + stream2 << op->getValue(); + break; + case Datatype::UInt64: + stream2 << op->getValue(); + break; + case Datatype::UInt128: + taco_not_supported_yet; + break; + case Datatype::Int8: + stream2 << static_cast(op->getValue()); + break; + case Datatype::Int16: + stream2 << op->getValue(); + break; + case Datatype::Int32: + stream2 << op->getValue(); + break; + case Datatype::Int64: + stream2 << op->getValue(); + break; + case Datatype::Int128: + taco_not_supported_yet; + break; + case Datatype::Float32: + stream2 << ((op->getValue() != 0.0) + ? util::toString(op->getValue()) : "0.0"); + break; + case Datatype::Float64: + stream2 << ((op->getValue()!=0.0) + ? util::toString(op->getValue()) : "0.0"); + break; + case Datatype::Complex64: { + std::complex val = op->getValue>(); + stream2 << val.real() << " + I*" << val.imag(); + } + break; + case Datatype::Complex128: { + std::complex val = op->getValue>(); + stream2 << val.real() << " + I*" << val.imag(); + } + break; + case Datatype::Undefined: + taco_ierror << "Undefined type in IR"; + break; + } + + if (color) { + stream2 << nc; + } + } + + + + else { + + if (color) { + stream << blue ; + } + + // It seems this is where all the types get printed in the final code generation. + // Come up with a way to generate different values if stream2 is used to generate ispc code + switch (op->type.getKind()) { + case Datatype::Bool: + stream << op->getValue(); + break; + case Datatype::UInt8: + stream << static_cast(op->getValue()); + break; + case Datatype::UInt16: + stream << op->getValue(); + break; + case Datatype::UInt32: + stream << op->getValue(); + break; + case Datatype::UInt64: + stream << op->getValue(); + break; + case Datatype::UInt128: + taco_not_supported_yet; + break; + case Datatype::Int8: + stream << static_cast(op->getValue()); + break; + case Datatype::Int16: + stream << op->getValue(); + break; + case Datatype::Int32: + stream << op->getValue(); + break; + case Datatype::Int64: + stream << op->getValue(); + break; + case Datatype::Int128: + taco_not_supported_yet; + break; + case Datatype::Float32: + stream << ((op->getValue() != 0.0) + ? util::toString(op->getValue()) : "0.0"); + break; + case Datatype::Float64: + stream << ((op->getValue()!=0.0) + ? util::toString(op->getValue()) : "0.0"); + break; + case Datatype::Complex64: { + std::complex val = op->getValue>(); + stream << val.real() << " + I*" << val.imag(); + } + break; + case Datatype::Complex128: { + std::complex val = op->getValue>(); + stream << val.real() << " + I*" << val.imag(); + } + break; + case Datatype::Undefined: + taco_ierror << "Undefined type in IR"; + break; + } + + if (color) { + stream << nc; + } + + } + } void IRPrinter::visit(const Var* op) { - if (varNames.contains(op)) { - stream << varNames.get(op); + if (is_ISPC_code_stream_enabled()) { + if (varNames.contains(op)) { + stream2 << varNames.get(op); + } + else { + stream2 << op->name; + } } else { - stream << op->name; + if (varNames.contains(op)) { + stream << varNames.get(op); + } + else { + stream << op->name; + } } + } void IRPrinter::visit(const Neg* op) { @@ -248,41 +343,83 @@ void IRPrinter::visit(const IfThenElse* op) { taco_iassert(op->cond.defined()); taco_iassert(op->then.defined()); doIndent(); - stream << keywordString("if "); - stream << "("; - parentPrecedence = Precedence::TOP; - op->cond.accept(this); - stream << ")"; + if (is_ISPC_code_stream_enabled()) { + stream2 << keywordString("if "); + stream2 << "("; + parentPrecedence = Precedence::TOP; + op->cond.accept(this); + stream2 << ")"; + + Stmt scopedStmt = Stmt(to(op->then)->scopedStmt); + if (isa(scopedStmt)) { + stream2 << " {" << endl; + op->then.accept(this); + doIndent(); + stream2 << "}"; + } + else if (isa(scopedStmt)) { + int tmp = indent; + indent = 0; + stream2 << " "; + scopedStmt.accept(this); + indent = tmp; + } + else { + stream2 << endl; + op->then.accept(this); + } - Stmt scopedStmt = Stmt(to(op->then)->scopedStmt); - if (isa(scopedStmt)) { - stream << " {" << endl; - op->then.accept(this); - doIndent(); - stream << "}"; - } - else if (isa(scopedStmt)) { - int tmp = indent; - indent = 0; - stream << " "; - scopedStmt.accept(this); - indent = tmp; + if (op->otherwise.defined()) { + stream2 << "\n"; + doIndent(); + stream2 << keywordString("else"); + stream2 << " {\n"; + op->otherwise.accept(this); + doIndent(); + stream2 << "}"; + } + stream2 << endl; } + + else { - stream << endl; - op->then.accept(this); - } + stream << keywordString("if "); + stream << "("; + parentPrecedence = Precedence::TOP; + op->cond.accept(this); + stream << ")"; - if (op->otherwise.defined()) { - stream << "\n"; - doIndent(); - stream << keywordString("else"); - stream << " {\n"; - op->otherwise.accept(this); - doIndent(); - stream << "}"; + Stmt scopedStmt = Stmt(to(op->then)->scopedStmt); + if (isa(scopedStmt)) { + stream << " {" << endl; + op->then.accept(this); + doIndent(); + stream << "}"; + } + else if (isa(scopedStmt)) { + int tmp = indent; + indent = 0; + stream << " "; + scopedStmt.accept(this); + indent = tmp; + } + else { + stream << endl; + op->then.accept(this); + } + + if (op->otherwise.defined()) { + stream << "\n"; + doIndent(); + stream << keywordString("else"); + stream << " {\n"; + op->otherwise.accept(this); + doIndent(); + stream << "}"; + } + stream << endl; } - stream << endl; + } void IRPrinter::visit(const Case* op) { @@ -345,12 +482,22 @@ void IRPrinter::visit(const Switch* op) { } void IRPrinter::visit(const Load* op) { - parentPrecedence = Precedence::LOAD; - op->arr.accept(this); - stream << "["; - parentPrecedence = Precedence::LOAD; - op->loc.accept(this); - stream << "]"; + if (is_ISPC_code_stream_enabled()) { + parentPrecedence = Precedence::LOAD; + op->arr.accept(this); + stream2 << "["; + parentPrecedence = Precedence::LOAD; + op->loc.accept(this); + stream2 << "]"; + } + else { + parentPrecedence = Precedence::LOAD; + op->arr.accept(this); + stream << "["; + parentPrecedence = Precedence::LOAD; + op->loc.accept(this); + stream << "]"; + } } void IRPrinter::visit(const Malloc* op) { @@ -367,66 +514,149 @@ void IRPrinter::visit(const Sizeof* op) { } void IRPrinter::visit(const Store* op) { - doIndent(); - op->arr.accept(this); - stream << "["; - parentPrecedence = Precedence::TOP; - op->loc.accept(this); - stream << "] = "; - parentPrecedence = Precedence::TOP; - op->data.accept(this); - stream << ";"; - stream << endl; + if (is_ISPC_code_stream_enabled()) { + doIndent(); + op->arr.accept(this); + stream2 << "["; + parentPrecedence = Precedence::TOP; + op->loc.accept(this); + stream2 << "] = "; + parentPrecedence = Precedence::TOP; + op->data.accept(this); + stream2 << ";"; + stream2 << endl; + } + else { + doIndent(); + op->arr.accept(this); + stream << "["; + parentPrecedence = Precedence::TOP; + op->loc.accept(this); + stream << "] = "; + parentPrecedence = Precedence::TOP; + op->data.accept(this); + stream << ";"; + stream << endl; + } + } void IRPrinter::visit(const For* op) { - doIndent(); - stream << keywordString("for") << " (" - << keywordString(util::toString(op->var.type())) << " "; - op->var.accept(this); - stream << " = "; - op->start.accept(this); - stream << keywordString("; "); - op->var.accept(this); - stream << " < "; - parentPrecedence = BOTTOM; - op->end.accept(this); - stream << keywordString("; "); - op->var.accept(this); + std::cout << "This is IRPrinter::visit For op method\n"; + if (is_ISPC_code_stream_enabled()) { + doIndent(); + stream2 << keywordString("for") << " (" + << keywordString(util::toString(op->var.type())) << " "; + op->var.accept(this); + stream2 << " = "; + op->start.accept(this); + stream2 << keywordString("; "); + op->var.accept(this); + stream2 << " < "; + parentPrecedence = BOTTOM; + op->end.accept(this); + stream2 << keywordString("; "); + op->var.accept(this); + + auto lit = op->increment.as(); + if (lit != nullptr && ((lit->type.isInt() && lit->equalsScalar(1)) || + (lit->type.isUInt() && lit->equalsScalar(1)))) { + stream2 << "++"; + } + else { + stream2 << " += "; + op->increment.accept(this); + } + stream2 << ") {\n"; - auto lit = op->increment.as(); - if (lit != nullptr && ((lit->type.isInt() && lit->equalsScalar(1)) || - (lit->type.isUInt() && lit->equalsScalar(1)))) { - stream << "++"; + op->contents.accept(this); + doIndent(); + stream2 << "}"; + stream2 << endl; } + + else { - stream << " += "; - op->increment.accept(this); + doIndent(); + stream << keywordString("for") << " (" + << keywordString(util::toString(op->var.type())) << " "; + op->var.accept(this); + stream << " = "; + op->start.accept(this); + stream << keywordString("; "); + op->var.accept(this); + stream << " < "; + parentPrecedence = BOTTOM; + op->end.accept(this); + stream << keywordString("; "); + op->var.accept(this); + + auto lit = op->increment.as(); + if (lit != nullptr && ((lit->type.isInt() && lit->equalsScalar(1)) || + (lit->type.isUInt() && lit->equalsScalar(1)))) { + stream << "++"; + } + else { + stream << " += "; + op->increment.accept(this); + } + stream << ") {\n"; + + op->contents.accept(this); + doIndent(); + stream << "}"; + stream << endl; } - stream << ") {\n"; - op->contents.accept(this); - doIndent(); - stream << "}"; - stream << endl; +} + +void IRPrinter::sendToStream(std::stringstream &stream) { + if (is_ISPC_code_stream_enabled()) { + this->stream2 << stream.str(); + } + else { + this->stream << stream.str(); + } } void IRPrinter::visit(const While* op) { - doIndent(); - stream << keywordString("while "); - stream << "("; - parentPrecedence = Precedence::TOP; - op->cond.accept(this); - stream << ")"; - stream << " {\n"; - op->contents.accept(this); - doIndent(); - stream << "}"; - stream << endl; + // std::stringstream stream; + if (is_ISPC_code_stream_enabled()) { + doIndent(); + stream2 << keywordString("while "); + stream2 << "("; + parentPrecedence = Precedence::TOP; + op->cond.accept(this); + stream2 << ")"; + stream2 << " {\n"; + op->contents.accept(this); + doIndent(); + stream2 << "}"; + stream2 << endl; + } + else { + doIndent(); + stream << keywordString("while "); + stream << "("; + parentPrecedence = Precedence::TOP; + op->cond.accept(this); + stream << ")"; + stream << " {\n"; + op->contents.accept(this); + doIndent(); + stream << "}"; + stream << endl; + } + // sendToStream(stream); } void IRPrinter::visit(const Block* op) { - acceptJoin(this, stream, op->contents, ""); + if (is_ISPC_code_stream_enabled()) { + acceptJoin(this, stream2, op->contents, ""); + } + else { + acceptJoin(this, stream, op->contents, ""); + } } void IRPrinter::visit(const Scope* op) { @@ -438,85 +668,183 @@ void IRPrinter::visit(const Scope* op) { } void IRPrinter::visit(const Function* op) { - stream << keywordString("void ") << op->name; - stream << "("; - if (op->outputs.size() > 0) stream << "Tensor "; - acceptJoin(this, stream, op->outputs, ", Tensor "); - if (op->outputs.size() > 0 && op->inputs.size()) stream << ", "; - if (op->inputs.size() > 0) stream << "Tensor "; - acceptJoin(this, stream, op->inputs, ", Tensor "); - stream << ") {" << endl; + if (is_ISPC_code_stream_enabled()) { + stream2 << keywordString("void ") << op->name; + stream2 << "("; + if (op->outputs.size() > 0) stream2 << "Tensor "; + acceptJoin(this, stream2, op->outputs, ", Tensor "); + if (op->outputs.size() > 0 && op->inputs.size()) stream2 << ", "; + if (op->inputs.size() > 0) stream2 << "Tensor "; + acceptJoin(this, stream2, op->inputs, ", Tensor "); + stream2 << ") {" << endl; + + resetNameCounters(); + op->body.accept(this); + + doIndent(); + stream2 << "}"; + } + else { + stream << keywordString("void ") << op->name; + stream << "("; + if (op->outputs.size() > 0) stream << "Tensor "; + acceptJoin(this, stream, op->outputs, ", Tensor "); + if (op->outputs.size() > 0 && op->inputs.size()) stream << ", "; + if (op->inputs.size() > 0) stream << "Tensor "; + acceptJoin(this, stream, op->inputs, ", Tensor "); + stream << ") {" << endl; - resetNameCounters(); - op->body.accept(this); + resetNameCounters(); + op->body.accept(this); + + doIndent(); + stream << "}"; + } - doIndent(); - stream << "}"; } void IRPrinter::visit(const VarDecl* op) { - doIndent(); - stream << keywordString(util::toString(op->var.type())); - taco_iassert(isa(op->var)); - if (to(op->var)->is_ptr) { - stream << "* restrict"; - } - stream << " "; - string varName = varNameGenerator.getUniqueName(util::toString(op->var)); - varNames.insert({op->var, varName}); - op->var.accept(this); - parentPrecedence = Precedence::TOP; - stream << " = "; - op->rhs.accept(this); - stream << ";"; - stream << endl; + if (is_ISPC_code_stream_enabled()) { + doIndent(); + if (op->var.type() == Int32) { + stream2 << keywordString("int32"); + } + else if (op->var.type() == Int64) { + stream2 << keywordString("int64"); + } else { + stream2 << keywordString(util::toString(op->var.type())); + } + taco_iassert(isa(op->var)); + if (to(op->var)->is_ptr) { + stream2 << "* restrict"; + } + stream2 << " "; + string varName = varNameGenerator.getUniqueName(util::toString(op->var)); + varNames.insert({op->var, varName}); + op->var.accept(this); + parentPrecedence = Precedence::TOP; + stream2 << " = "; + op->rhs.accept(this); + stream2 << ";"; + stream2 << endl; + } + else { + doIndent(); + stream << keywordString(util::toString(op->var.type())); + taco_iassert(isa(op->var)); + if (to(op->var)->is_ptr) { + stream << "* restrict"; + } + stream << " "; + string varName = varNameGenerator.getUniqueName(util::toString(op->var)); + varNames.insert({op->var, varName}); + op->var.accept(this); + parentPrecedence = Precedence::TOP; + stream << " = "; + op->rhs.accept(this); + stream << ";"; + stream << endl; + } + } void IRPrinter::visit(const Assign* op) { - doIndent(); - op->lhs.accept(this); - parentPrecedence = Precedence::TOP; - bool printed = false; - if (simplify) { - if (isa(op->rhs)) { - auto add = to(op->rhs); - if (add->a == op->lhs) { - const Literal* lit = add->b.as(); - if (lit != nullptr && ((lit->type.isInt() && lit->equalsScalar(1)) || - (lit->type.isUInt() && lit->equalsScalar(1)))) { - stream << "++"; + if (is_ISPC_code_stream_enabled()) { + doIndent(); + op->lhs.accept(this); + parentPrecedence = Precedence::TOP; + bool printed = false; + if (simplify) { + if (isa(op->rhs)) { + auto add = to(op->rhs); + if (add->a == op->lhs) { + const Literal* lit = add->b.as(); + if (lit != nullptr && ((lit->type.isInt() && lit->equalsScalar(1)) || + (lit->type.isUInt() && lit->equalsScalar(1)))) { + stream2 << "++"; + } + else { + stream2 << " += "; + add->b.accept(this); + } + printed = true; } - else { - stream << " += "; - add->b.accept(this); + } + else if (isa(op->rhs)) { + auto mul = to(op->rhs); + if (mul->a == op->lhs) { + stream2 << " *= "; + mul->b.accept(this); + printed = true; } - printed = true; } - } - else if (isa(op->rhs)) { - auto mul = to(op->rhs); - if (mul->a == op->lhs) { - stream << " *= "; - mul->b.accept(this); - printed = true; + else if (isa(op->rhs)) { + auto bitOr = to(op->rhs); + if (bitOr->a == op->lhs) { + stream2 << " |= "; + bitOr->b.accept(this); + printed = true; + } } } - else if (isa(op->rhs)) { - auto bitOr = to(op->rhs); - if (bitOr->a == op->lhs) { - stream << " |= "; - bitOr->b.accept(this); - printed = true; - } + if (!printed) { + stream2 << " = "; + op->rhs.accept(this); } + + stream2 << ";"; + stream2 << endl; } - if (!printed) { - stream << " = "; - op->rhs.accept(this); + + + + else { + doIndent(); + op->lhs.accept(this); + parentPrecedence = Precedence::TOP; + bool printed = false; + if (simplify) { + if (isa(op->rhs)) { + auto add = to(op->rhs); + if (add->a == op->lhs) { + const Literal* lit = add->b.as(); + if (lit != nullptr && ((lit->type.isInt() && lit->equalsScalar(1)) || + (lit->type.isUInt() && lit->equalsScalar(1)))) { + stream << "++"; + } + else { + stream << " += "; + add->b.accept(this); + } + printed = true; + } + } + else if (isa(op->rhs)) { + auto mul = to(op->rhs); + if (mul->a == op->lhs) { + stream << " *= "; + mul->b.accept(this); + printed = true; + } + } + else if (isa(op->rhs)) { + auto bitOr = to(op->rhs); + if (bitOr->a == op->lhs) { + stream << " |= "; + bitOr->b.accept(this); + printed = true; + } + } + } + if (!printed) { + stream << " = "; + op->rhs.accept(this); + } + + stream << ";"; + stream << endl; } - stream << ";"; - stream << endl; } void IRPrinter::visit(const Yield* op) { @@ -559,17 +887,32 @@ void IRPrinter::visit(const Comment* op) { } void IRPrinter::visit(const BlankLine*) { - stream << endl; + if (is_ISPC_code_stream_enabled()) { + stream2 << endl; + } + else { + stream << endl; + } } void IRPrinter::visit(const Continue*) { doIndent(); - stream << "continue;" << endl; + if (!is_ISPC_code_stream_enabled()) { + stream << "continue;" << endl; + } + else { + stream2 << "continue;" << endl; + } } void IRPrinter::visit(const Break*) { doIndent(); - stream << "break;" << endl; + if (!is_ISPC_code_stream_enabled()) { + stream << "break;" << endl; + } + else { + stream2 << "break;" << endl; + } } void IRPrinter::visit(const Print* op) { @@ -585,7 +928,12 @@ void IRPrinter::visit(const Print* op) { } void IRPrinter::visit(const GetProperty* op) { - stream << op->name; + if (is_ISPC_code_stream_enabled()) { + stream2 << op->name; + } + else { + stream << op->name; + } } void IRPrinter::visit(const Sort* op) { @@ -643,23 +991,47 @@ void IRPrinter::resetNameCounters() { } void IRPrinter::doIndent() { - for (int i=0; ivar); Expr start = rewrite(op->start); Expr end = rewrite(op->end); diff --git a/src/lower/lowerer_impl_imperative.cpp b/src/lower/lowerer_impl_imperative.cpp index b4c9ea710..53ffd936f 100644 --- a/src/lower/lowerer_impl_imperative.cpp +++ b/src/lower/lowerer_impl_imperative.cpp @@ -1,4 +1,5 @@ #include +#include "taco/cuda.h" #include "taco/lower/lowerer_impl_imperative.h" #include "taco/lower/lowerer_impl.h" @@ -26,6 +27,7 @@ class LowererImplImperative::Visitor : public IndexNotationVisitorStrict { public: Visitor(LowererImplImperative* impl) : impl(impl) {} Stmt lower(IndexStmt stmt) { + std::cout << "lowering IndexStmt to ir:Stmt - IndexStmt: " << stmt << std::endl; this->stmt = Stmt(); impl->accessibleIterators.scope(); IndexStmtVisitorStrict::visit(stmt); @@ -200,6 +202,7 @@ static std::set hasSparseInserts(IndexStmt stmt, Iterators iterators, return ret; } + Stmt LowererImplImperative::lower(IndexStmt stmt, string name, bool assemble, bool compute, bool pack, bool unpack) @@ -586,19 +589,27 @@ LowererImplImperative::splitAppenderAndInserters(const vector& results } +// important function +/* +* This is the for loop lowering part +*/ Stmt LowererImplImperative::lowerForall(Forall forall) { + std::cout << "doing lowerForall: " << forall << std::endl; bool hasExactBound = provGraph.hasExactBound(forall.getIndexVar()); bool forallNeedsUnderivedGuards = !hasExactBound && emitUnderivedGuards; if (!ignoreVectorize && forallNeedsUnderivedGuards && (forall.getParallelUnit() == ParallelUnit::CPUVector || forall.getUnrollFactor() > 0)) { + std::cout << "calling lowerForallCloned(forall)\n"; return lowerForallCloned(forall); } + std::cout << "inParallelLoopDepth: " << inParallelLoopDepth << "========================\n"; if (forall.getParallelUnit() != ParallelUnit::NotParallel) { inParallelLoopDepth++; } + std::cout << "inParallelLoopDepth: " << inParallelLoopDepth << "========================\n"; // Recover any available parents that were not recoverable previously vector recoverySteps; @@ -786,19 +797,23 @@ Stmt LowererImplImperative::lowerForall(Forall forall) } if (!isWhereProducer && hasPosDescendant && underivedAncestors.size() > 1 && provGraph.isPosVariable(iterator.getIndexVar()) && posDescendant == forall.getIndexVar()) { + std::cout << "calling lowerForallFusedPosition(forall\n"; loops = lowerForallFusedPosition(forall, iterator, locators, inserters, appenders, reducedAccesses, recoveryStmt); } else if (canAccelWithSparseIteration) { + std::cout << "calling lowerForallDenseAcceleration(forall\n"; loops = lowerForallDenseAcceleration(forall, locators, inserters, appenders, reducedAccesses, recoveryStmt); } // Emit dimension coordinate iteration loop else if (iterator.isDimensionIterator()) { + std::cout << "calling lowerForallDimension(forall\n"; loops = lowerForallDimension(forall, point.locators(), inserters, appenders, reducedAccesses, recoveryStmt); } // Emit position iteration loop else if (iterator.hasPosIter()) { + std::cout << "calling lowerForallPosition(forall\n"; loops = lowerForallPosition(forall, iterator, locators, inserters, appenders, reducedAccesses, recoveryStmt); } @@ -816,6 +831,10 @@ Stmt LowererImplImperative::lowerForall(Forall forall) loops = lowerMergeLattice(lattice, underivedAncestors[0], forall.getStmt(), reducedAccesses); } + + std::cout << "printing loops ----------------------------------------------------------------------------------------------\n"; + std::cout << loops << std::endl; + std::cout << "loops printed -----------------------------------------------------------------------------------------------\n"; // taco_iassert(loops.defined()); if (!generateComputeCode() && !hasStores(loops)) { @@ -832,6 +851,7 @@ Stmt LowererImplImperative::lowerForall(Forall forall) parallelUnitIndexVars.erase(forall.getParallelUnit()); parallelUnitSizes.erase(forall.getParallelUnit()); } + return Block::blanks(preInitValues, temporaryValuesInitFree[0], loops, @@ -1136,6 +1156,7 @@ Stmt LowererImplImperative::lowerForallDimension(Forall forall, set reducedAccesses, ir::Stmt recoveryStmt) { + std::cout << "1 Stmt LowererImplImperative::lowerForallDimension\n"; Expr coordinate = getCoordinateVar(forall.getIndexVar()); if (forall.getParallelUnit() != ParallelUnit::NotParallel && forall.getOutputRaceStrategy() == OutputRaceStrategy::Atomics) { @@ -1143,6 +1164,8 @@ Stmt LowererImplImperative::lowerForallDimension(Forall forall, atomicParallelUnit = forall.getParallelUnit(); } + std::cout << "original forall : " << forall << std::endl; + std::cout << "inside IndexStmt: " << forall.getStmt() << std::endl; Stmt body = lowerForallBody(coordinate, forall.getStmt(), locators, inserters, appenders, reducedAccesses); @@ -1158,7 +1181,13 @@ Stmt LowererImplImperative::lowerForallDimension(Forall forall, std::vector bounds = provGraph.deriveIterBounds(forall.getIndexVar(), definedIndexVarsOrdered, underivedBounds, indexVarToExprMap, iterators); LoopKind kind = LoopKind::Serial; - if (forall.getParallelUnit() == ParallelUnit::CPUVector && !ignoreVectorize) { + if (should_use_ISPC_codegen()) { + std::cout << "Foreach compatible loop\n"; + if (forall.getParallelUnit() == ParallelUnit::CPUVector && !ignoreVectorize) { + kind = LoopKind::Foreach; + } + } + else if (forall.getParallelUnit() == ParallelUnit::CPUVector && !ignoreVectorize) { kind = LoopKind::Vectorized; } else if (forall.getParallelUnit() != ParallelUnit::NotParallel @@ -1166,6 +1195,7 @@ Stmt LowererImplImperative::lowerForallDimension(Forall forall, kind = LoopKind::Runtime; } + std::cout << "2 Stmt LowererImplImperative::lowerForallDimension\n"; return Block::blanks(For::make(coordinate, bounds[0], bounds[1], 1, body, kind, ignoreVectorize ? ParallelUnit::NotParallel : forall.getParallelUnit(), ignoreVectorize ? 0 : forall.getUnrollFactor()), @@ -1179,6 +1209,7 @@ Stmt LowererImplImperative::lowerForallDimension(Forall forall, set reducedAccesses, ir::Stmt recoveryStmt) { + std::cout << "1 Stmt LowererImplImperative::lowerForallDenseAcceleration\n"; taco_iassert(locators.size() == 1) << "Optimizing a dense workspace is only supported when the consumer is the only RHS tensor"; taco_iassert(provGraph.isFullyDerived(forall.getIndexVar())) << "Sparsely accelerating a dense workspace only works with fully derived index vars"; taco_iassert(forall.getParallelUnit() == ParallelUnit::NotParallel) << "Sparsely accelerating a dense workspace only works within serial loops"; @@ -1204,6 +1235,8 @@ Stmt LowererImplImperative::lowerForallDimension(Forall forall, } Stmt declareVar = VarDecl::make(coordinate, Load::make(indexList, loopVar)); + std::cout << "original forall : " << forall << std::endl; + std::cout << "inside IndexStmt: " << forall.getStmt() << std::endl; Stmt body = lowerForallBody(coordinate, forall.getStmt(), locators, inserters, appenders, reducedAccesses); Stmt resetGuard = ir::Store::make(bitGuard, coordinate, ir::Literal::make(false), markAssignsAtomicDepth > 0, atomicParallelUnit); @@ -1216,7 +1249,12 @@ Stmt LowererImplImperative::lowerForallDimension(Forall forall, Stmt posAppend = generateAppendPositions(appenders); LoopKind kind = LoopKind::Serial; - if (forall.getParallelUnit() == ParallelUnit::CPUVector && !ignoreVectorize) { + if (should_use_ISPC_codegen()) { + if (forall.getParallelUnit() == ParallelUnit::CPUVector && !ignoreVectorize) { + kind = LoopKind::Foreach; + } + } + else if (forall.getParallelUnit() == ParallelUnit::CPUVector && !ignoreVectorize) { kind = LoopKind::Vectorized; } else if (forall.getParallelUnit() != ParallelUnit::NotParallel @@ -1224,6 +1262,7 @@ Stmt LowererImplImperative::lowerForallDimension(Forall forall, kind = LoopKind::Runtime; } + std::cout << "2 Stmt LowererImplImperative::lowerForallDenseAcceleration\n"; return Block::blanks(For::make(loopVar, 0, indexListSize, 1, body, kind, ignoreVectorize ? ParallelUnit::NotParallel : forall.getParallelUnit(), ignoreVectorize ? 0 : forall.getUnrollFactor()), @@ -1247,6 +1286,8 @@ Stmt LowererImplImperative::lowerForallPosition(Forall forall, Iterator iterator set reducedAccesses, ir::Stmt recoveryStmt) { + std::cout << "1 Stmt LowererImplImperative::lowerForallPosition\n" << std::endl; + Expr coordinate = getCoordinateVar(forall.getIndexVar()); Stmt declareCoordinate = Stmt(); Stmt strideGuard = Stmt(); @@ -1278,6 +1319,11 @@ Stmt LowererImplImperative::lowerForallPosition(Forall forall, Iterator iterator markAssignsAtomicDepth++; } + // see we are inside a forall. ex: forall(i, forall(j, y(i) += A(i,j) * x(j))) + // when you call forall.getStmt it returns forall(j, y(i) += A(i,j) * x(j)) which is the + // IndexStmt inside the forall IndexStmt + std::cout << "original forall : " << forall << std::endl; + std::cout << "inside IndexStmt: " << forall.getStmt() << std::endl; Stmt body = lowerForallBody(coordinate, forall.getStmt(), locators, inserters, appenders, reducedAccesses); @@ -1339,6 +1385,7 @@ Stmt LowererImplImperative::lowerForallPosition(Forall forall, Iterator iterator kind = LoopKind::Runtime; } + std::cout << "2 Stmt LowererImplImperative::lowerForallPosition\n" << std::endl; // Loop with preamble and postamble return Block::blanks( boundsCompute, @@ -1357,6 +1404,7 @@ Stmt LowererImplImperative::lowerForallFusedPosition(Forall forall, Iterator ite set reducedAccesses, ir::Stmt recoveryStmt) { + std::cout << "1 Stmt LowererImplImperative::lowerForallFusedPosition\n" << std::endl; Expr coordinate = getCoordinateVar(forall.getIndexVar()); Stmt declareCoordinate = Stmt(); if (provGraph.isCoordVariable(forall.getIndexVar())) { @@ -1447,6 +1495,8 @@ Stmt LowererImplImperative::lowerForallFusedPosition(Forall forall, Iterator ite markAssignsAtomicDepth++; } + std::cout << "original forall : " << forall << std::endl; + std::cout << "inside IndexStmt: " << forall.getStmt() << std::endl; Stmt body = lowerForallBody(coordinate, forall.getStmt(), locators, inserters, appenders, reducedAccesses); @@ -1503,6 +1553,8 @@ Stmt LowererImplImperative::lowerForallFusedPosition(Forall forall, Iterator ite && forall.getOutputRaceStrategy() != OutputRaceStrategy::ParallelReduction && !ignoreVectorize) { kind = LoopKind::Runtime; } + + std::cout << "2 Stmt LowererImplImperative::lowerForallFusedPosition\n" << std::endl; // Loop with preamble and postamble return Block::blanks(boundsCompute, Block::make(Block::make(searchForUnderivedStart), @@ -1765,6 +1817,9 @@ Stmt LowererImplImperative::lowerForallBody(Expr coordinate, IndexStmt stmt, vector inserters, vector appenders, const set& reducedAccesses) { + + std::cout << "lowering a forall body----------------------------------------------------\n"; + Stmt initVals = resizeAndInitValues(appenders, reducedAccesses); // Inserter positions @@ -1780,6 +1835,7 @@ Stmt LowererImplImperative::lowerForallBody(Expr coordinate, IndexStmt stmt, // Code of loop body statement Stmt body = lower(stmt); + std::cout << "\nBefore: [" << stmt << "]\nAfter : [" << body << "]\n"; // Code to append coordinates Stmt appendCoords = appendCoordinate(appenders, coordinate); @@ -1889,6 +1945,7 @@ vector LowererImplImperative::codeToInitializeDenseAcceleratorArrays(Where Expr p = Var::make("p" + temporary.getName(), Int()); Stmt guardZeroInit = Store::make(alreadySetArr, p, ir::Literal::zero(bitGuardType)); + std::cout << "vector LowererImplImperative::codeToInitializeDenseAcceleratorArrays\n" << std::endl; Stmt zeroInitLoop = For::make(p, 0, bitGuardSize, 1, guardZeroInit, LoopKind::Serial); Stmt inits = Block::make(alreadySetDecl, indexListDecl, allocateAlreadySet, allocateIndexList, zeroInitLoop); return {inits, freeTemps}; @@ -2203,6 +2260,7 @@ Stmt LowererImplImperative::lowerWhere(Where where) { true, false); Expr size = getTemporarySize(where); Stmt zeroInit = Store::make(values, p, ir::Literal::zero(temporary.getType().getDataType())); + std::cout << "Stmt LowererImplImperative::lowerWhere\n"; Stmt loopInit = For::make(p, 0, size, 1, zeroInit, LoopKind::Serial); initializeTemporary = Block::make(initializeTemporary, loopInit); } @@ -2334,6 +2392,7 @@ Stmt LowererImplImperative::lowerAssemble(Assemble assemble) { resultModeOrdering[iter.getMode().getLevel() - 1]); Expr pos = iter.getPosVar(); Stmt initPos = VarDecl::make(pos, iter.locate(locateCoords)[0]); + std::cout << "Stmt LowererImplImperative::lowerAssemble\n"; insertEdgeLoop = For::make(coords.back(), 0, dim, 1, Block::make(initPos, insertEdgeLoop)); } else { @@ -2415,6 +2474,7 @@ Stmt LowererImplImperative::lowerMulti(Multi multi) { } Stmt LowererImplImperative::lowerSuchThat(SuchThat suchThat) { + std::cout << "lowering such that statement\n"; Stmt stmt = lower(suchThat.getStmt()); return Block::make(stmt); } @@ -2942,6 +3002,7 @@ Stmt LowererImplImperative::resizeAndInitValues(const std::vector& app Stmt LowererImplImperative::zeroInitValues(Expr tensor, Expr begin, Expr size) { + std::cout << "1 Stmt LowererImplImperative::zeroInitValues\n"; Expr lower = simplify(ir::Mul::make(begin, size)); Expr upper = simplify(ir::Mul::make(ir::Add::make(begin, 1), size)); Expr p = Var::make("p" + util::toString(tensor), Int()); @@ -2954,6 +3015,10 @@ Stmt LowererImplImperative::zeroInitValues(Expr tensor, Expr begin, Expr size) { return ir::VarDecl::make(ir::Var::make("status", Int()), ir::Call::make("cudaMemset", {values, ir::Literal::make(0, Int()), ir::Mul::make(ir::Sub::make(upper, lower), ir::Literal::make(values.type().getNumBytes()))}, Int())); } + std::cout << "2 Stmt LowererImplImperative::zeroInitValues\n"; + if (should_use_ISPC_codegen()) { + return For::make(p, lower, upper, 1, zeroInit, LoopKind::Foreach); + } return For::make(p, lower, upper, 1, zeroInit, parallel); } diff --git a/src/tensor.cpp b/src/tensor.cpp index fab437ff1..3519456c9 100644 --- a/src/tensor.cpp +++ b/src/tensor.cpp @@ -278,6 +278,7 @@ static size_t unpackTensorData(const taco_tensor_t& tensorData, /// Pack coordinates into a data structure given by the tensor format. void TensorBase::pack() { + std::cout << "TensorBase::Pack() method\n"; if (!needsPack()) { return; } @@ -346,6 +347,7 @@ void TensorBase::pack() { taco_iassert((content->coordinateBufferUsed % content->coordinateSize) == 0); const size_t numCoordinates = content->coordinateBufferUsed / content->coordinateSize; + std::cout << "call helperFuncs\n"; const auto helperFuncs = getHelperFunctions(getFormat(), getComponentType(), dimensions); @@ -623,6 +625,7 @@ void TensorBase::compile() { compile(stmt, content->assembleWhileCompute); } void TensorBase::compile(taco::IndexStmt stmt, bool assembleWhileCompute) { + std::cout << "TensorBase::compile\n"; if (!needsCompile()) { return; } @@ -934,6 +937,7 @@ TensorBase::getHelperFunctions(const Format& format, Datatype ctype, }; const auto dims = util::map(dimensions, getDim); + set_ISPC_code_stream_enabled(false); if (format.getOrder() > 0) { const Format bufferFormat = COO(format.getOrder(), false, true, false, format.getModeOrdering()); @@ -951,6 +955,7 @@ TensorBase::getHelperFunctions(const Format& format, Datatype ctype, } // Lower packing and iterator code. + std::cout << "1 Lower packing and iterator code\n"; helperModule->addFunction(lower(packStmt, "pack", true, true)); helperModule->addFunction(lower(iterateStmt, "iterate", false, true)); } else { @@ -964,12 +969,14 @@ TensorBase::getHelperFunctions(const Format& format, Datatype ctype, IndexVar indexVar; IndexStmt assignment = (packedScalar() = bufferVector(indexVar)); IndexStmt packStmt= makeConcreteNotation(makeReductionNotation(assignment)); + std::cout << "2 Lower packing and iterator code\n"; helperModule->addFunction(lower(packStmt, "pack", true, true)); // Define and lower iterator code. IndexStmt iterateStmt = Yield({}, packedScalar()); helperModule->addFunction(lower(iterateStmt, "iterate", false, true)); } + std::cout << "Compiling the helperModule\n"; helperModule->compile(); helperFunctionsMutex.lock(); diff --git a/test/tests-scheduling-eval.cpp b/test/tests-scheduling-eval.cpp index f59359081..6a228f38b 100644 --- a/test/tests-scheduling-eval.cpp +++ b/test/tests-scheduling-eval.cpp @@ -4,6 +4,7 @@ #include #include #include +#include "taco/cuda.h" #include "test.h" #include "test_tensors.h" #include "taco/tensor.h" @@ -48,10 +49,10 @@ IndexStmt scheduleSpMVCPU(IndexStmt stmt, int CHUNK_SIZE=16) { IndexStmt scheduleSpMVISPC(IndexStmt stmt, int CHUNK_SIZE=16) { IndexVar i0("i0"), i1("i1"), kpos("kpos"), kpos0("kpos0"), kpos1("kpos1"); - return stmt; - // return stmt.split(i, i0, i1, CHUNK_SIZE) - // .reorder({i0, i1, j}) - // .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces); + // return stmt; + return stmt.split(i, i0, i1, CHUNK_SIZE) + .reorder({i0, i1, j}) + .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces); } IndexStmt scheduleSpMMCPU(IndexStmt stmt, Tensor A, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { @@ -64,6 +65,16 @@ IndexStmt scheduleSpMMCPU(IndexStmt stmt, Tensor A, int CHUNK_SIZE=16, i .parallelize(k, ParallelUnit::CPUVector, OutputRaceStrategy::IgnoreRaces); } +IndexStmt scheduleSpMMISPC(IndexStmt stmt, Tensor A, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { + IndexVar i0("i0"), i1("i1"), kbounded("kbounded"), k0("k0"), k1("k1"), jpos("jpos"), jpos0("jpos0"), jpos1("jpos1"); + return stmt.split(i, i0, i1, CHUNK_SIZE) + .pos(j, jpos, A(i,j)) + .split(jpos, jpos0, jpos1, UNROLL_FACTOR) + .reorder({i0, i1, jpos0, k, jpos1}) + .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces) + .parallelize(k, ParallelUnit::CPUVector, OutputRaceStrategy::IgnoreRaces); +} + IndexStmt scheduleSpGEMMCPU(IndexStmt stmt, bool doPrecompute) { Assignment assign = stmt.as().getStmt().as().getStmt() .as().getStmt().as(); @@ -1473,8 +1484,6 @@ TEST(scheduling_eval, mttkrpGPU) { ASSERT_TENSOR_EQ(expected, A); } - - TEST(generate_ispc_evaluation_files, ispc) { std::cout << "Hi Adhitha!\n" << std::endl ; set_CUDA_codegen_enabled(false); @@ -1495,15 +1504,18 @@ TEST(generate_ispc_evaluation_files, ispc) { int NUM_I = 100; int NUM_J = 100; + int NUM_K = 100; + string c_file_ending = ".h"; string file_ending = ".ispc"; string file_path = "eval_prepared_ispc/"; mkdir(file_path.c_str(), 0777); // spmv { - stringstream source; - std::shared_ptr codegen = ir::CodeGen::init_default(source, ir::CodeGen::ImplementationGen); + stringstream source1; + stringstream source2; + std::shared_ptr codegen = ir::CodeGen::init_default(source1, source2, ir::CodeGen::ImplementationGen); Tensor A("A", {NUM_I, NUM_J}, CSR); Tensor x("x", {NUM_J}, {Dense}); Tensor y("y", {NUM_I}, {Dense}); @@ -1511,18 +1523,53 @@ TEST(generate_ispc_evaluation_files, ispc) { std::cout << "concretizing the assignment statement\n"; IndexStmt stmt = y.getAssignment().concretize(); std::cout << "Printing the original IndexStmt: " << stmt << std::endl; + for (auto paramSet : spmv_parameters) { std::cout << "param set: " << paramSet[0] << std::endl; IndexStmt scheduled = scheduleSpMVISPC(stmt, paramSet[0]); std::cout << "scheduled IndexStmt: " << scheduled << std::endl; - ir::Stmt compute = lower(scheduled, "spmv_csr_ispc_taco", false, true); + ir::Stmt compute = lower(scheduled, string("compute_") + util::join(paramSet, "_"), false, true); std::cout << "computed statement: \n" << compute << std::endl; codegen->compile(compute, false); } ofstream source_file; - source_file.open(file_path + "spmv_csr_ispc_taco.h"); - source_file << source.str(); + source_file.open(file_path + "spmv_csr_ispc_taco" + c_file_ending); + source_file << source1.str(); + source_file.close(); + + ofstream ispc_source_file; + ispc_source_file.open(file_path + "__spmv_csr_ispc_taco" + file_ending); + ispc_source_file << source2.str(); + ispc_source_file.close(); + + } + + // spmm + { + stringstream source1; + stringstream source2; + std::shared_ptr codegen = ir::CodeGen::init_default(source1, source2, ir::CodeGen::ImplementationGen); + Tensor A("A", {NUM_I, NUM_J}, CSR); + Tensor B("B", {NUM_J, NUM_K}, {Dense, Dense}); + Tensor C("C", {NUM_I, NUM_K}, {Dense, Dense}); + C(i, k) = A(i, j) * B(j, k); + IndexStmt stmt = C.getAssignment().concretize(); + bool isFirst = true; + for (auto paramSet : spmm_parameters) { + IndexStmt scheduled = scheduleSpMMISPC(stmt, A, paramSet[0], paramSet[1]); + ir::Stmt compute = lower(scheduled, string("compute_") + util::join(paramSet, "_"), false, true); + codegen->compile(compute, isFirst); + isFirst = false; + } + ofstream source_file; + source_file.open(file_path + "spmm_csr_ispc_taco" + c_file_ending); + source_file << source1.str(); source_file.close(); + + ofstream ispc_source_file; + ispc_source_file.open(file_path + "__spmm_csr_ispc_taco" + file_ending); + ispc_source_file << source2.str(); + ispc_source_file.close(); } @@ -1846,9 +1893,13 @@ TEST(generate_evaluation_files, cpu) { } TEST(generate_evaluation_files, gpu) { - if (!should_use_CUDA_codegen()) { - return; - } + // if (!should_use_CUDA_codegen()) { + // return; + // } + set_CUDA_codegen_enabled(true); + set_ISPC_codegen_enabled(false); + + std::cout << "executing generate_evaluation_file.gpu\n"; vector> spmv_parameters = {}; // {NNZ_PER_THREAD, BLOCK_SIZE} for (int i = 3; i <= 20; i++) { diff --git a/tools/taco.cpp b/tools/taco.cpp index ce03b61e1..9a864a699 100644 --- a/tools/taco.cpp +++ b/tools/taco.cpp @@ -1297,6 +1297,7 @@ int main(int argc, char* argv[]) { } bool hasPrinted = false; + std::shared_ptr codegen = ir::CodeGen::init_default(cout, ir::CodeGen::ImplementationGen); codegen->setColor(color); if (printAssemble) { @@ -1317,6 +1318,7 @@ int main(int argc, char* argv[]) { } if (compute.defined()) { + std::cout << "Code generation\n"; codegen->compile(compute, false); } else { From 4e7bd6879c5f7ca1f43397dff5cc92259a7e1eda Mon Sep 17 00:00:00 2001 From: Adhhitha Dias Date: Mon, 19 Jul 2021 15:13:47 -0400 Subject: [PATCH 4/8] add CPUSpmd directive partially --- include/taco/index_notation/transformations.h | 2 + include/taco/ir/ir.h | 2 +- include/taco/ir_tags.h | 2 +- include/taco/lower/lowerer_impl_imperative.h | 3 + src/codegen/codegen.cpp | 66 ----- src/codegen/codegen.h | 13 +- src/codegen/codegen_cuda.cpp | 1 + src/codegen/codegen_ispc.cpp | 257 +++++++++++++++++- src/codegen/codegen_ispc.h | 7 + src/index_notation/index_notation_printer.cpp | 4 +- src/index_notation/transformations.cpp | 62 ++++- src/ir/ir_printer.cpp | 43 --- src/ir_tags.cpp | 2 +- src/lower/lowerer_impl_imperative.cpp | 76 +++++- src/tensor.cpp | 1 + test/tests-scheduling-eval.cpp | 207 +++++++++++++- tools/taco.cpp | 47 +++- 17 files changed, 647 insertions(+), 148 deletions(-) diff --git a/include/taco/index_notation/transformations.h b/include/taco/index_notation/transformations.h index 7aa2579ad..6bf277d5c 100644 --- a/include/taco/index_notation/transformations.h +++ b/include/taco/index_notation/transformations.h @@ -223,6 +223,8 @@ IndexStmt parallelizeOuterLoop(IndexStmt stmt); */ IndexStmt reorderLoopsTopologically(IndexStmt stmt); +IndexStmt justTraverseThroughTheIndexStmt(IndexStmt stmt); + /** * Performs scalar promotion so that reductions are done by accumulating into * scalar temporaries whenever possible. diff --git a/include/taco/ir/ir.h b/include/taco/ir/ir.h index cb46b5142..651faff4e 100644 --- a/include/taco/ir/ir.h +++ b/include/taco/ir/ir.h @@ -591,7 +591,7 @@ struct Switch : public StmtNode { static const IRNodeType _type_info = IRNodeType::Switch; }; -enum class LoopKind {Serial, Static, Dynamic, Runtime, Vectorized, Static_Chunked, Foreach}; +enum class LoopKind {Serial, Static, Dynamic, Runtime, Vectorized, Static_Chunked, Foreach, Mul_Thread}; /** A for loop from start to end by increment. * A vectorized loop will require the increment to be 1 and the diff --git a/include/taco/ir_tags.h b/include/taco/ir_tags.h index 5858a13e3..6a74be173 100644 --- a/include/taco/ir_tags.h +++ b/include/taco/ir_tags.h @@ -9,7 +9,7 @@ namespace taco { /// ParallelUnit::GPUWarp can be optionally used to allow for GPU warp-level primitives /// ParallelUnit::GPUThread causes for every iteration to be executed on a separate GPU thread enum class ParallelUnit { - NotParallel, DefaultUnit, GPUBlock, GPUWarp, GPUThread, CPUThread, CPUVector, CPUThreadGroupReduction, GPUBlockReduction, GPUWarpReduction + NotParallel, DefaultUnit, GPUBlock, GPUWarp, GPUThread, CPUThread, CPUVector, CPUThreadGroupReduction, GPUBlockReduction, GPUWarpReduction, CPUSimd, CPUSpmd }; extern const char *ParallelUnit_NAMES[]; diff --git a/include/taco/lower/lowerer_impl_imperative.h b/include/taco/lower/lowerer_impl_imperative.h index 65f069fda..d743f5875 100644 --- a/include/taco/lower/lowerer_impl_imperative.h +++ b/include/taco/lower/lowerer_impl_imperative.h @@ -499,10 +499,13 @@ class LowererImplImperative : public LowererImpl { bool emitUnderivedGuards = true; + int loopDepth = 0; int inParallelLoopDepth = 0; std::map parallelUnitSizes; std::map parallelUnitIndexVars; + std::map forUnits; // + std::map whereTempsWithLoopDepth; /// Keep track of what IndexVars have already been defined std::set definedIndexVars; diff --git a/src/codegen/codegen.cpp b/src/codegen/codegen.cpp index 750f33516..7081bc195 100644 --- a/src/codegen/codegen.cpp +++ b/src/codegen/codegen.cpp @@ -441,72 +441,6 @@ string CodeGen::printDecls(map varMap, return ret.str(); } -string CodeGen::printCallISPCFunc(const Function *func, map varMap, - vector &sortedProps) { - std::stringstream ret; - ret << " "; - unordered_set propsAlreadyGenerated; - - ret << "__" << func->name << "("; - - vector inputs = func->inputs; - vector outputs = func->outputs; - getSortedProps(varMap, sortedProps, inputs, outputs); - - for (unsigned long i=0; i < sortedProps.size(); i++) { - ret << varMap[sortedProps[i]]; - if (i != sortedProps.size()-1) { - ret << ", "; - } - propsAlreadyGenerated.insert(varMap[sortedProps[i]]); - } - - ret << ");\n"; - return ret.str(); -} - -string CodeGen::printISPCFunc(const Function *func, map varMap, - vector &sortedProps) { - std::stringstream ret; - ret << "export void "; - unordered_set propsAlreadyGenerated; - - ret << "__" << func->name << "("; - - vector inputs = func->inputs; - vector outputs = func->outputs; - // getSortedProps(varMap, sortedProps, inputs, outputs); - - for (unsigned long i=0; i < sortedProps.size(); i++) { - auto prop = sortedProps[i]; - bool isOutputProp = (find(outputs.begin(), outputs.end(), - prop->tensor) != outputs.end()); - - auto var = prop->tensor.as(); - if (var->is_parameter) { - if (isOutputProp) { - ret << " " << printTensorProperty(varMap[prop], prop, false) << ";" << endl; - } else { - break; - } - } else { - ret << getUnpackedTensorArgument(varMap[prop], prop, isOutputProp); - } - propsAlreadyGenerated.insert(varMap[prop]); - - if (i!=sortedProps.size()-1) { - ret << ", "; - } - if (i%2==0) { - ret << "\n\t"; - } - } - ret << ") {\n"; - - return ret.str(); -} - - string CodeGen::printPack(map, string> outputProperties, vector outputs) { stringstream ret; diff --git a/src/codegen/codegen.h b/src/codegen/codegen.h index 641239834..db891f995 100644 --- a/src/codegen/codegen.h +++ b/src/codegen/codegen.h @@ -49,10 +49,6 @@ class CodeGen : public IRPrinter { std::string printContextDeclAndInit(std::map varMap, std::vector localVars, int labels, std::string funcName); - std::string printCallISPCFunc(const Function *func, std::map varMap, - std::vector &sortedProps); - std::string printISPCFunc(const Function *func, std::map varMap, - std::vector &sortedProps); std::string printDecls(std::map varMap, std::vector inputs, std::vector outputs); std::string printPack(std::map, @@ -63,6 +59,10 @@ class CodeGen : public IRPrinter { std::string printFuncName(const Function *func, std::map inputMap={}, std::map outputMap={}); + + std::string printTensorProperty(std::string varname, const GetProperty* op, bool is_ptr); + std::string getUnpackedTensorArgument(std::string varname, const GetProperty* op, + bool is_output_prop); void resetUniqueNameCounters(); std::string genUniqueName(std::string name); @@ -72,11 +72,8 @@ class CodeGen : public IRPrinter { private: virtual std::string restrictKeyword() const { return ""; } - std::string printTensorProperty(std::string varname, const GetProperty* op, bool is_ptr); std::string unpackTensorProperty(std::string varname, const GetProperty* op, - bool is_output_prop); - std::string getUnpackedTensorArgument(std::string varname, const GetProperty* op, - bool is_output_prop); + bool is_output_prop); std::string packTensorProperty(std::string varname, Expr tnsr, TensorProperty property, int mode, int index); std::string pointTensorProperty(std::string varname); diff --git a/src/codegen/codegen_cuda.cpp b/src/codegen/codegen_cuda.cpp index 77cf0cd88..14505f740 100644 --- a/src/codegen/codegen_cuda.cpp +++ b/src/codegen/codegen_cuda.cpp @@ -646,6 +646,7 @@ void CodeGen_CUDA::printDeviceFunctions(const Function* func) { // Collect device functions resetUniqueNameCounters(); deviceFunctionLoopDepth = 0; + // here they calculate the device FunctionCollecor DeviceFunctionCollector deviceFunctionCollector(func->inputs, func->outputs, this); func->body.accept(&deviceFunctionCollector); deviceFunctions = deviceFunctionCollector.blockFors; diff --git a/src/codegen/codegen_ispc.cpp b/src/codegen/codegen_ispc.cpp index f107728cc..c8480cd25 100644 --- a/src/codegen/codegen_ispc.cpp +++ b/src/codegen/codegen_ispc.cpp @@ -7,6 +7,9 @@ #include "taco/cuda.h" #include "taco/ir/ir_visitor.h" +#include "taco/ir/ir_rewriter.h" +#include "taco/ir/simplify.h" + #include "codegen_ispc.h" #include "taco/error.h" #include "taco/util/strings.h" @@ -240,6 +243,121 @@ class CodeGen_ISPC::FindVars : public IRVisitor { } }; + +// Finds all for loops tagged with accelerator and adds statements to deviceFunctions +// Also tracks scope of when device function is called and +// tracks which variables must be passed to function. +class CodeGen_ISPC::DeviceFunctionCollector : public IRVisitor { +public: + vector blockFors; + vector threadFors; // contents is device function + vector warpFors; + map scopeMap; + + // the variables to pass to each device function + vector>> functionParameters; + vector> currentParameters; // keep as vector so code generation is deterministic + set currentParameterSet; + + set variablesDeclaredInKernel; + + vector> threadIDVars; + vector> blockIDVars; + vector> warpIDVars; + vector numThreads; + vector numWarps; + + CodeGen_ISPC *codeGen; + // copy inputs and outputs into the map + DeviceFunctionCollector(vector inputs, vector outputs, CodeGen_ISPC *codeGen) : codeGen(codeGen) { + inDeviceFunction = false; + for (auto v: inputs) { + auto var = v.as(); + taco_iassert(var) << "Inputs must be vars in codegen"; + taco_iassert(scopeMap.count(var) == 0) << + "Duplicate input found in codegen"; + scopeMap[var] = var->name; + } + for (auto v: outputs) { + auto var = v.as(); + taco_iassert(var) << "Outputs must be vars in codegen"; + taco_iassert(scopeMap.count(var) == 0) << + "Duplicate output found in codegen"; + + scopeMap[var] = var->name; + } + } + +protected: + bool inDeviceFunction; + using IRVisitor::visit; + + virtual void visit(const For *op) { + if (op->parallel_unit == ParallelUnit::CPUSpmd) { + std::cout << "ParallelUnit::CPUSpmd directive found\n"; + inDeviceFunction = false; + op->var.accept(this); + inDeviceFunction = true; + + threadFors.push_back(op); + std::cout << "scopeMap: [" << scopeMap[op->var] << "], varExpr: [" << op->var << "]\n"; + threadIDVars.push_back(pair(scopeMap[op->var], op->var)); + Expr blockSize = ir::simplify(ir::Div::make(ir::Sub::make(op->end, op->start), op->increment)); + numThreads.push_back(blockSize); + + } + else if (op->parallel_unit == ParallelUnit::CPUSimd) { + + } + else{ + op->var.accept(this); + } + op->start.accept(this); + op->end.accept(this); + op->increment.accept(this); + op->contents.accept(this); + } + + virtual void visit(const Var *op) { + if (scopeMap.count(op) == 0) { + string name = codeGen->genUniqueName(op->name); + if (!inDeviceFunction) { + scopeMap[op] = name; + } + } + else if (scopeMap.count(op) == 1 && inDeviceFunction && currentParameterSet.count(op) == 0 + && (threadIDVars.empty() || op != threadIDVars.back().second) + && !variablesDeclaredInKernel.count(op)) { + currentParameters.push_back(pair(scopeMap[op], op)); + currentParameterSet.insert(op); + } + } + + virtual void visit(const VarDecl *op) { + if (inDeviceFunction) { + variablesDeclaredInKernel.insert(op->var); + } + op->var.accept(this); + op->rhs.accept(this); + } + + virtual void visit(const GetProperty *op) { + if (scopeMap.count(op->tensor) == 0 && !inDeviceFunction) { + auto key = + tuple(op->tensor,op->property, + (size_t)op->mode, + (size_t)op->index); + auto unique_name = codeGen->genUniqueName(op->name); + scopeMap[op->tensor] = unique_name; + } + else if (scopeMap.count(op->tensor) == 1 && inDeviceFunction && currentParameterSet.count(op->tensor) == 0) { + currentParameters.push_back(pair(op->tensor.as()->name, op->tensor)); + currentParameterSet.insert(op->tensor); + } + } +}; + + CodeGen_ISPC::CodeGen_ISPC(std::ostream &dest, OutputKind outputKind, bool simplify) : CodeGen(dest, false, simplify, C), out(dest), out2(dest), outputKind(outputKind) {} @@ -262,6 +380,76 @@ void CodeGen_ISPC::compile(Stmt stmt, bool isFirst) { stmt.accept(this); } +string CodeGen_ISPC::printCallISPCFunc(const Function *func, map varMap, + vector &sortedProps) { + std::stringstream ret; + ret << " "; + unordered_set propsAlreadyGenerated; + + ret << "__" << func->name << "("; + + vector inputs = func->inputs; + vector outputs = func->outputs; + getSortedProps(varMap, sortedProps, inputs, outputs); + + for (unsigned long i=0; i < sortedProps.size(); i++) { + ret << varMap[sortedProps[i]]; + if (i != sortedProps.size()-1) { + ret << ", "; + } + propsAlreadyGenerated.insert(varMap[sortedProps[i]]); + } + + ret << ");\n"; + return ret.str(); +} + +string CodeGen_ISPC::printISPCFunc(const Function *func, map varMap, + vector &sortedProps) { + + DeviceFunctionCollector deviceFunctionCollector(func->inputs, func->outputs, this); + func->body.accept(&deviceFunctionCollector); + + + std::stringstream ret; + ret << "export void "; + unordered_set propsAlreadyGenerated; + + ret << "__" << func->name << "("; + + vector inputs = func->inputs; + vector outputs = func->outputs; + // getSortedProps(varMap, sortedProps, inputs, outputs); + + for (unsigned long i=0; i < sortedProps.size(); i++) { + auto prop = sortedProps[i]; + bool isOutputProp = (find(outputs.begin(), outputs.end(), + prop->tensor) != outputs.end()); + + auto var = prop->tensor.as(); + if (var->is_parameter) { + if (isOutputProp) { + ret << " " << printTensorProperty(varMap[prop], prop, false) << ";" << endl; + } else { + break; + } + } else { + ret << getUnpackedTensorArgument(varMap[prop], prop, isOutputProp); + } + propsAlreadyGenerated.insert(varMap[prop]); + + if (i!=sortedProps.size()-1) { + ret << ", "; + } + if (i%2==0) { + ret << "\n\t"; + } + } + ret << "\n) {\n\n"; + + return ret.str(); +} + void CodeGen_ISPC::sendToStream(std::stringstream &stream) { if (is_ISPC_code_stream_enabled()) { this->out2 << stream.str(); @@ -466,6 +654,21 @@ void CodeGen_ISPC::visit(const For* op) { case LoopKind::Dynamic: case LoopKind::Runtime: case LoopKind::Static_Chunked: + case LoopKind::Mul_Thread: + op->start.accept(this); + stream2 << std::endl; + op->start.accept(this); + stream2 << std::endl; + op->start.accept(this); + stream2 << std::endl; + op->start.accept(this); + stream2 << std::endl; + op->end.accept(this); + stream2 << std::endl; + op->end.accept(this); + stream2 << std::endl; + op->end.accept(this); + stream2 << std::endl; default: break; } @@ -629,10 +832,58 @@ void CodeGen_ISPC::visit(const Sqrt* op) { void CodeGen_ISPC::visit(const Assign* op) { if (is_ISPC_code_stream_enabled()) { - if (op->use_atomics) { - doIndent(); - stream2 << getAtomicPragma() << endl; + doIndent(); + op->lhs.accept(this); + parentPrecedence = Precedence::TOP; + bool printed = false; + if (simplify) { + if (isa(op->rhs)) { + auto add = to(op->rhs); + if (add->a == op->lhs) { + const Literal* lit = add->b.as(); + if (lit != nullptr && ((lit->type.isInt() && lit->equalsScalar(1)) || + (lit->type.isUInt() && lit->equalsScalar(1)))) { + stream2 << "++"; + } + else { + if (op->use_atomics) { + stream2 << " += reduce_add("; + add->b.accept(this); + stream2 << ")"; + } + else { + stream2 << " += "; + add->b.accept(this); + } + } + printed = true; + } + } + else if (isa(op->rhs)) { + auto mul = to(op->rhs); + if (mul->a == op->lhs) { + stream2 << " *= "; + mul->b.accept(this); + printed = true; + } + } + else if (isa(op->rhs)) { + auto bitOr = to(op->rhs); + if (bitOr->a == op->lhs) { + stream2 << " |= "; + bitOr->b.accept(this); + printed = true; + } + } + } + if (!printed) { + stream2 << " = "; + op->rhs.accept(this); } + + stream2 << ";"; + stream2 << endl; + } else { if (op->use_atomics) { diff --git a/src/codegen/codegen_ispc.h b/src/codegen/codegen_ispc.h index 8abd1cc09..279d0db7a 100644 --- a/src/codegen/codegen_ispc.h +++ b/src/codegen/codegen_ispc.h @@ -43,6 +43,12 @@ class CodeGen_ISPC : public CodeGen { void visit(const Store*); void visit(const Assign*); + Stmt simplifyFunctionBodies(Stmt stmt); + std::string printCallISPCFunc(const Function *func, std::map varMap, + std::vector &sortedProps); + std::string printISPCFunc(const Function *func, std::map varMap, + std::vector &sortedProps); + std::map varMap; std::vector localVars; std::ostream &out; @@ -55,6 +61,7 @@ class CodeGen_ISPC : public CodeGen { bool emittingCoroutine; class FindVars; + class DeviceFunctionCollector; private: virtual std::string restrictKeyword() const { return "restrict"; } diff --git a/src/index_notation/index_notation_printer.cpp b/src/index_notation/index_notation_printer.cpp index 0b41615ad..d7ee998ae 100644 --- a/src/index_notation/index_notation_printer.cpp +++ b/src/index_notation/index_notation_printer.cpp @@ -224,9 +224,9 @@ void IndexNotationPrinter::visit(const YieldNode* op) { void IndexNotationPrinter::visit(const ForallNode* op) { os << "forall(" << op->indexVar << ", "; op->stmt.accept(this); - if (op->parallel_unit != ParallelUnit::NotParallel) { + // if (op->parallel_unit != ParallelUnit::NotParallel) { os << ", " << ParallelUnit_NAMES[(int) op->parallel_unit] << ", " << OutputRaceStrategy_NAMES[(int) op->output_race_strategy]; - } + // } os << ")"; } diff --git a/src/index_notation/transformations.cpp b/src/index_notation/transformations.cpp index 47fc1dd55..011779caf 100644 --- a/src/index_notation/transformations.cpp +++ b/src/index_notation/transformations.cpp @@ -1,8 +1,10 @@ #include "taco/index_notation/transformations.h" +#include "taco/cuda.h" #include "taco/index_notation/index_notation.h" #include "taco/index_notation/index_notation_rewriter.h" #include "taco/index_notation/index_notation_nodes.h" +#include "taco/index_notation/index_notation_printer.h" #include "taco/error/error_messages.h" #include "taco/util/collections.h" #include "taco/lower/iterator.h" @@ -592,7 +594,10 @@ IndexStmt Parallelize::apply(IndexStmt stmt, std::string* reason) const { std::string reason = ""; IndexStmt rewriteParallel(IndexStmt stmt) { + std::cout << "1 rewriting IndexStmt to support parallelize schedule directive\n--------------------------------------------\n"; + std::cout << stmt << std::endl; provGraph = ProvenanceGraph(stmt); + std::cout << "2 rewriting IndexStmt to support parallelize schedule directive\n--------------------------------------------\n"; const auto reductionVars = getReductionVars(stmt); reductionIndexVars.clear(); @@ -607,15 +612,22 @@ IndexStmt Parallelize::apply(IndexStmt stmt, std::string* reason) const { tensorVars = createIRTensorVars(stmt); assembledByUngroupedInsert.clear(); + std::cout << "3 rewriting IndexStmt to support parallelize schedule directive\n--------------------------------------------\n"; for (const auto& result : getAssembledByUngroupedInsertion(stmt)) { assembledByUngroupedInsert.push_back(tensorVars[result]); } + std::cout << "4 rewriting IndexStmt to support parallelize schedule directive\n--------------------------------------------\n"; + std::cout << stmt << std::endl; return rewrite(stmt); } void visit(const ForallNode* node) { + std::cout << "transformations.cpp void visit(const ForallNode* node)\n"; + std::cout << "node: \n" << node << std::endl; Forall foralli(node); + std::cout << "foralli: \n" << foralli << std::endl; + std::cout << "before stmt update stmt: \n" << stmt << std::endl; IndexVar i = parallelize.geti(); definedIndexVars.insert(foralli.getIndexVar()); @@ -632,6 +644,7 @@ IndexStmt Parallelize::apply(IndexStmt stmt, std::string* reason) const { Iterators iterators(foralli, tensorVars); MergeLattice lattice = MergeLattice::make(foralli, iterators, provGraph, definedIndexVars); + std::cout << "iter: " << i << ", lattice: \n" << lattice << std::endl; // Precondition 2: No coiteration of modes (i.e., merge lattice has // only one iterator) @@ -660,6 +673,7 @@ IndexStmt Parallelize::apply(IndexStmt stmt, std::string* reason) const { MergeLattice underivedLattice = MergeLattice::make(underivedForall, iterators, provGraph, definedIndexVars); + std::cout << "iter: " << i << ", underivedLattice: \n" << lattice << std::endl; // Precondition 3: Every result iterator must have insert capability for (Iterator iterator : underivedLattice.results()) { @@ -721,6 +735,7 @@ IndexStmt Parallelize::apply(IndexStmt stmt, std::string* reason) const { // build consumer that writes from temporary to output, mark consumer as parallel reduction ParallelUnit reductionUnit = ParallelUnit::CPUThreadGroupReduction; if (should_use_CUDA_codegen()) { + std::cout << "should_use_CUDA_codegen() true\n"; if (parentParallelUnits.count(ParallelUnit::GPUWarp)) { reductionUnit = ParallelUnit::GPUWarpReduction; } @@ -728,6 +743,9 @@ IndexStmt Parallelize::apply(IndexStmt stmt, std::string* reason) const { reductionUnit = ParallelUnit::GPUBlockReduction; } } + else { + std::cout << "should_use_CUDA_codegen() false\n"; + } IndexStmt consumer = forall(i, Assignment(assignment->lhs, w(i), assignment->op), reductionUnit, OutputRaceStrategy::ParallelReduction); precomputed_stmt = where(consumer, producer); } @@ -746,8 +764,9 @@ IndexStmt Parallelize::apply(IndexStmt stmt, std::string* reason) const { return; } - + std::cout << "updated stmt: \n"; stmt = forall(i, foralli.getStmt(), parallelize.getParallelUnit(), parallelize.getOutputRaceStrategy(), foralli.getUnrollFactor()); + std::cout << stmt << std::endl; return; } @@ -1181,6 +1200,7 @@ std::ostream& operator<<(std::ostream& os, IndexStmt parallelizeOuterLoop(IndexStmt stmt) { // get outer ForAll + std::cout << "get outer ForAll ----------------- \n"; Forall forall; bool matched = false; match(stmt, @@ -1215,7 +1235,19 @@ IndexStmt parallelizeOuterLoop(IndexStmt stmt) { } return parallelized256; } + else if (should_use_ISPC_codegen()) { + std::cout << "outer loop parallelization for ISPC codegen\n"; + // IndexStmt parallelized = Parallelize(forall.getIndexVar(), ParallelUnit::CPUSpmd, OutputRaceStrategy::NoRaces).apply(stmt, &reason); + // if (parallelized == IndexStmt()) { + // // can't parallelize + // return stmt; + // } + // return parallelized; + + return stmt; + } else { + std::cout << "outer loop parallelization for CPU codgen index statement\n"; IndexStmt parallelized = Parallelize(forall.getIndexVar(), ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces).apply(stmt, &reason); if (parallelized == IndexStmt()) { // can't parallelize @@ -1320,8 +1352,25 @@ topologicallySort(map> hardDeps, return sortedVars; } +IndexStmt justTraverseThroughTheIndexStmt(IndexStmt stmt) { + struct IndexStatementTraverse : public IndexNotationPrinter { + IndexStatementTraverse(std::ostream& os) : IndexNotationPrinter(os) {}; + using IndexNotationPrinter::visit; + map forallParallelUnit; + map forallOutputRaceStrategy; + }; + + std::cout << "traversing through the index statement\n"; + IndexNotationPrinter printer(std::cout); + std::cout << std::endl; + stmt.accept(&printer); + return stmt; + +} + IndexStmt reorderLoopsTopologically(IndexStmt stmt) { + std::cout << "executing reorderLoopsTopologically\n"; // Collect tensorLevelVars which stores the pairs of IndexVar and tensor // level that each tensor is accessed at struct DAGBuilder : public IndexNotationVisitor { @@ -1384,6 +1433,8 @@ IndexStmt reorderLoopsTopologically(IndexStmt stmt) { Iterators iterators(stmt); DAGBuilder dagBuilder(iterators); stmt.accept(&dagBuilder); + std::cout << "After DAGBuilder\n"; + std::cout << stmt << std::endl; // Construct tensor dependencies (sorted list of IndexVars) from tensorLevelVars map>> tensorVarOrders; @@ -1414,6 +1465,8 @@ IndexStmt reorderLoopsTopologically(IndexStmt stmt) { }; CollectSoftDependencies collectSoftDeps; stmt.accept(&collectSoftDeps); + std::cout << "After CollectSoftDependencies\n"; + std::cout << stmt << std::endl; const auto sortedVars = topologicallySort(hardDeps, collectSoftDeps.softDeps, dagBuilder.indexVarOriginalOrder); @@ -1450,7 +1503,11 @@ IndexStmt reorderLoopsTopologically(IndexStmt stmt) { }; TopoReorderRewriter rewriter(sortedVars, dagBuilder.innerBody, dagBuilder.forallParallelUnit, dagBuilder.forallOutputRaceStrategy); - return rewriter.rewrite(stmt); + IndexStmt stmtChanged = rewriter.rewrite(stmt); + std::cout << "After TopoReorderRewriter\n"; + std::cout << stmtChanged << std::endl; + + return stmtChanged; } IndexStmt scalarPromote(IndexStmt stmt, ProvenanceGraph provGraph, @@ -1478,6 +1535,7 @@ IndexStmt scalarPromote(IndexStmt stmt, ProvenanceGraph provGraph, void visit(const ForallNode* node) { Forall foralli(node); + std::cout << "scalar promote: " << foralli << std::endl; IndexVar i = foralli.getIndexVar(); // Don't allow hoisting out of forall's for GPU warp and block reduction diff --git a/src/ir/ir_printer.cpp b/src/ir/ir_printer.cpp index f96251c5a..ba2bc894b 100644 --- a/src/ir/ir_printer.cpp +++ b/src/ir/ir_printer.cpp @@ -750,50 +750,7 @@ void IRPrinter::visit(const VarDecl* op) { void IRPrinter::visit(const Assign* op) { if (is_ISPC_code_stream_enabled()) { - doIndent(); - op->lhs.accept(this); - parentPrecedence = Precedence::TOP; - bool printed = false; - if (simplify) { - if (isa(op->rhs)) { - auto add = to(op->rhs); - if (add->a == op->lhs) { - const Literal* lit = add->b.as(); - if (lit != nullptr && ((lit->type.isInt() && lit->equalsScalar(1)) || - (lit->type.isUInt() && lit->equalsScalar(1)))) { - stream2 << "++"; - } - else { - stream2 << " += "; - add->b.accept(this); - } - printed = true; - } - } - else if (isa(op->rhs)) { - auto mul = to(op->rhs); - if (mul->a == op->lhs) { - stream2 << " *= "; - mul->b.accept(this); - printed = true; - } - } - else if (isa(op->rhs)) { - auto bitOr = to(op->rhs); - if (bitOr->a == op->lhs) { - stream2 << " |= "; - bitOr->b.accept(this); - printed = true; - } - } - } - if (!printed) { - stream2 << " = "; - op->rhs.accept(this); - } - stream2 << ";"; - stream2 << endl; } diff --git a/src/ir_tags.cpp b/src/ir_tags.cpp index af3dbd775..e7365d6c2 100644 --- a/src/ir_tags.cpp +++ b/src/ir_tags.cpp @@ -2,7 +2,7 @@ namespace taco { -const char *ParallelUnit_NAMES[] = {"NotParallel", "DefaultUnit", "GPUBlock", "GPUWarp", "GPUThread", "CPUThread", "CPUVector", "CPUThreadGroupReduction", "GPUBlockReduction", "GPUWarpReduction"}; +const char *ParallelUnit_NAMES[] = {"NotParallel", "DefaultUnit", "GPUBlock", "GPUWarp", "GPUThread", "CPUThread", "CPUVector", "CPUThreadGroupReduction", "GPUBlockReduction", "GPUWarpReduction", "CPUSimd", "CPUSpmd"}; const char *OutputRaceStrategy_NAMES[] = {"IgnoreRaces", "NoRaces", "Atomics", "Temporary", "ParallelReduction"}; const char *BoundType_NAMES[] = {"MinExact", "MinConstraint", "MaxExact", "MaxConstraint"}; const char *AssembleStrategy_NAMES[] = {"Append", "Insert"}; diff --git a/src/lower/lowerer_impl_imperative.cpp b/src/lower/lowerer_impl_imperative.cpp index 53ffd936f..28bd6c7c2 100644 --- a/src/lower/lowerer_impl_imperative.cpp +++ b/src/lower/lowerer_impl_imperative.cpp @@ -1,5 +1,6 @@ #include #include "taco/cuda.h" +#include "taco/ir_tags.h" #include "taco/lower/lowerer_impl_imperative.h" #include "taco/lower/lowerer_impl.h" @@ -417,6 +418,7 @@ LowererImplImperative::lower(IndexStmt stmt, string name, Stmt LowererImplImperative::lowerAssignment(Assignment assignment) { + std::cout << "\n\n converting assignment IndexStmt============================================ Assignment\n"; taco_iassert(generateAssembleCode() || generateComputeCode()); Stmt computeStmt; @@ -424,7 +426,7 @@ Stmt LowererImplImperative::lowerAssignment(Assignment assignment) Expr var = getTensorVar(result); const bool needComputeAssign = util::contains(needCompute, result); - + std::cout << "does assignment need compute assign: " << needComputeAssign << std::endl; Expr rhs; if (needComputeAssign) { rhs = lower(assignment.getRhs()); @@ -432,20 +434,51 @@ Stmt LowererImplImperative::lowerAssignment(Assignment assignment) // Assignment to scalar variables. if (isScalar(result.getType())) { + std::cout << "assignment to scalar variables\n"; if (needComputeAssign) { + std::cout << "compute assign\n"; if (!assignment.getOperator().defined()) { + std::cout << "assignment operator is not defined\n"; + std::cout << "var: " << var << ", rhs, : " << rhs << std::endl; computeStmt = Assign::make(var, rhs); } else { taco_iassert(isa(assignment.getOperator())); - bool useAtomics = markAssignsAtomicDepth > 0 && - !util::contains(whereTemps, result); + + std::cout << "assignment depth -- loopDepth: " << loopDepth << std::endl; + std::cout << "is markAssignsAtomicDepth > 0: " << (markAssignsAtomicDepth > 0) << std::endl; + for (auto &tensors_ : whereTemps) { + std::cout << tensors_ << ", "; + } + std::cout << std::endl; + std::cout << result << std::endl; + int tempVarInitLoopDepth = whereTempsWithLoopDepth.find(result)->second; + std::cout << "tempInitLoopDepth: " << tempVarInitLoopDepth << std::endl; + + bool reduction = false; + std::map::iterator itr; + for (itr = forUnits.begin(); itr!=forUnits.end(); ++itr) { + if (itr->first<=loopDepth && itr->first>tempVarInitLoopDepth && itr->second == ParallelUnit::CPUSimd) { + reduction = true; + } + std::cout << itr->first << "\t" << ParallelUnit_NAMES[(int) itr->second] << std::endl; + } + + // less than or equal to loopDepth but greater than temp variable initialized loop depth + bool useAtomics = markAssignsAtomicDepth > 0 && (!util::contains(whereTemps, result) || reduction); + std::cout << "whereTemps and result: " << !util::contains(whereTemps, result) << std::endl; + std::cout << "assignment to scalar variables useAtomics: " << useAtomics << std::endl; computeStmt = compoundAssign(var, rhs, useAtomics, atomicParallelUnit); + std::cout << "computeStatment: " << computeStmt << std::endl; } } + else { + std::cout << "not compute assign\n"; + } } // Assignments to tensor variables (non-scalar). else { + std::cout << "assignment to tensor variables\n"; Expr values = getValuesArray(result); Expr loc = generateValueLocExpr(assignment.getLhs()); @@ -479,6 +512,7 @@ Stmt LowererImplImperative::lowerAssignment(Assignment assignment) } if (needComputeAssign && values.defined()) { + std::cout << "assign compute statement\n"; if (!assignment.getOperator().defined()) { computeStmt = Store::make(values, loc, rhs); } @@ -595,9 +629,20 @@ LowererImplImperative::splitAppenderAndInserters(const vector& results */ Stmt LowererImplImperative::lowerForall(Forall forall) { + loopDepth++; + forUnits.insert(std::pair(loopDepth,forall.getParallelUnit())); std::cout << "doing lowerForall: " << forall << std::endl; bool hasExactBound = provGraph.hasExactBound(forall.getIndexVar()); bool forallNeedsUnderivedGuards = !hasExactBound && emitUnderivedGuards; + + + std::cout << "printing temporary variables with their atomic depths\n"; + map::iterator itr; + for (itr = whereTempsWithLoopDepth.begin(); itr != whereTempsWithLoopDepth.end(); ++itr) { + std::cout << itr->first << "\t" << itr->second << "\n"; + } + + if (!ignoreVectorize && forallNeedsUnderivedGuards && (forall.getParallelUnit() == ParallelUnit::CPUVector || forall.getUnrollFactor() > 0)) { @@ -852,6 +897,8 @@ Stmt LowererImplImperative::lowerForall(Forall forall) parallelUnitSizes.erase(forall.getParallelUnit()); } + forUnits.erase(loopDepth); + loopDepth--; return Block::blanks(preInitValues, temporaryValuesInitFree[0], loops, @@ -1157,12 +1204,18 @@ Stmt LowererImplImperative::lowerForallDimension(Forall forall, ir::Stmt recoveryStmt) { std::cout << "1 Stmt LowererImplImperative::lowerForallDimension\n"; + std::cout << "1 Stmt LowererImplImperative::lowerForallDimension markAssignsAtomicDepth: " << markAssignsAtomicDepth << std::endl; Expr coordinate = getCoordinateVar(forall.getIndexVar()); if (forall.getParallelUnit() != ParallelUnit::NotParallel && forall.getOutputRaceStrategy() == OutputRaceStrategy::Atomics) { markAssignsAtomicDepth++; + std::cout << "1 Stmt LowererImplImperative::lowerForallDimension getParallelUnit() is Not NotParallel and outputRaceStrategy is Atomics\n"; + std::cout << "markAssignsAtomicDepth: " << markAssignsAtomicDepth << std::endl; atomicParallelUnit = forall.getParallelUnit(); } + else { + std::cout << "1 Stmt LowererImplImperative::lowerForallDimension getParallelUnit() is NotParallel or outputRaceStrategy is not Atomics\n"; + } std::cout << "original forall : " << forall << std::endl; std::cout << "inside IndexStmt: " << forall.getStmt() << std::endl; @@ -1183,9 +1236,14 @@ Stmt LowererImplImperative::lowerForallDimension(Forall forall, LoopKind kind = LoopKind::Serial; if (should_use_ISPC_codegen()) { std::cout << "Foreach compatible loop\n"; - if (forall.getParallelUnit() == ParallelUnit::CPUVector && !ignoreVectorize) { + if (forall.getParallelUnit() == ParallelUnit::CPUSimd) { kind = LoopKind::Foreach; } + else if (forall.getParallelUnit() == ParallelUnit::CPUSpmd + && forall.getOutputRaceStrategy() != OutputRaceStrategy::ParallelReduction + ) { + kind = LoopKind::Mul_Thread; + } } else if (forall.getParallelUnit() == ParallelUnit::CPUVector && !ignoreVectorize) { kind = LoopKind::Vectorized; @@ -1250,7 +1308,7 @@ Stmt LowererImplImperative::lowerForallDimension(Forall forall, LoopKind kind = LoopKind::Serial; if (should_use_ISPC_codegen()) { - if (forall.getParallelUnit() == ParallelUnit::CPUVector && !ignoreVectorize) { + if (forall.getParallelUnit() == ParallelUnit::CPUSimd) { kind = LoopKind::Foreach; } } @@ -2201,6 +2259,7 @@ vector LowererImplImperative::codeToInitializeTemporary(Where where) { } Stmt LowererImplImperative::lowerWhere(Where where) { + std::cout << "\n--------------------------------------- lowering where statement: " << where << "\n\n\n"; TensorVar temporary = where.getTemporary(); bool accelerateDenseWorkSpace, sortAccelerator; std::tie(accelerateDenseWorkSpace, sortAccelerator) = @@ -2237,6 +2296,7 @@ Stmt LowererImplImperative::lowerWhere(Where where) { }) ); + std::cout << "\ninitiating lowering of where consumer: " << where.getConsumer() << std::endl; Stmt consumer = lower(where.getConsumer()); if (accelerateDenseWorkSpace && sortAccelerator) { // We need to sort the indices array @@ -2266,6 +2326,7 @@ Stmt LowererImplImperative::lowerWhere(Where where) { } whereConsumers.push_back(consumer); + std::cout << "\nwhere temporaries: " << where.getTemporary() << std::endl; whereTemps.push_back(where.getTemporary()); captureNextLocatePos = true; @@ -2276,6 +2337,9 @@ Stmt LowererImplImperative::lowerWhere(Where where) { restoreAtomicDepth = true; } + whereTempsWithLoopDepth.insert(std::pair(where.getTemporary(), loopDepth)); + + std::cout << "\ninitiating lowering of where producer: " << where.getConsumer() << std::endl; Stmt producer = lower(where.getProducer()); if (accelerateDenseWorkSpace) { const Expr indexListSizeExpr = tempToIndexListSize.at(temporary); @@ -2283,6 +2347,8 @@ Stmt LowererImplImperative::lowerWhere(Where where) { initializeTemporary = Block::make(indexListSizeDecl, initializeTemporary); } + whereTempsWithLoopDepth.erase(where.getTemporary()); + if (restoreAtomicDepth) { markAssignsAtomicDepth++; } diff --git a/src/tensor.cpp b/src/tensor.cpp index 3519456c9..dac2c3fd2 100644 --- a/src/tensor.cpp +++ b/src/tensor.cpp @@ -621,6 +621,7 @@ void TensorBase::compile() { IndexStmt stmt = makeConcreteNotation(makeReductionNotation(assignment)); stmt = reorderLoopsTopologically(stmt); stmt = insertTemporaries(stmt); + std::cout << "calling parallelizeOuterLoop(stmt)\n"; stmt = parallelizeOuterLoop(stmt); compile(stmt, content->assembleWhileCompute); } diff --git a/test/tests-scheduling-eval.cpp b/test/tests-scheduling-eval.cpp index 6a228f38b..93ba7b01e 100644 --- a/test/tests-scheduling-eval.cpp +++ b/test/tests-scheduling-eval.cpp @@ -65,14 +65,31 @@ IndexStmt scheduleSpMMCPU(IndexStmt stmt, Tensor A, int CHUNK_SIZE=16, i .parallelize(k, ParallelUnit::CPUVector, OutputRaceStrategy::IgnoreRaces); } -IndexStmt scheduleSpMMISPC(IndexStmt stmt, Tensor A, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { +IndexStmt scheduleSpMMISPC1(IndexStmt stmt, Tensor A, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { IndexVar i0("i0"), i1("i1"), kbounded("kbounded"), k0("k0"), k1("k1"), jpos("jpos"), jpos0("jpos0"), jpos1("jpos1"); return stmt.split(i, i0, i1, CHUNK_SIZE) .pos(j, jpos, A(i,j)) .split(jpos, jpos0, jpos1, UNROLL_FACTOR) .reorder({i0, i1, jpos0, k, jpos1}) .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces) - .parallelize(k, ParallelUnit::CPUVector, OutputRaceStrategy::IgnoreRaces); + .parallelize(k, ParallelUnit::CPUSimd, OutputRaceStrategy::IgnoreRaces); +} + +IndexStmt scheduleSpMMISPC2(IndexStmt stmt, Tensor A, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { + IndexVar i0("i0"), i1("i1"), kbounded("kbounded"), k0("k0"), k1("k1"), jpos("jpos"), jpos0("jpos0"), jpos1("jpos1"); + return stmt + .parallelize(k, ParallelUnit::CPUSimd, OutputRaceStrategy::IgnoreRaces); +} + +IndexStmt scheduleSpMMISPC3(IndexStmt stmt, Tensor A, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { + IndexVar i0("i0"), i1("i1"), kbounded("kbounded"), k0("k0"), k1("k1"), jpos("jpos"), jpos0("jpos0"), jpos1("jpos1"); + return stmt + // .split(i, i0, i1, CHUNK_SIZE) + // .pos(j, jpos, A(i,j)) + // .split(jpos, jpos0, jpos1, UNROLL_FACTOR) + .reorder({j, k}) + .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces) + .parallelize(k, ParallelUnit::CPUSimd, OutputRaceStrategy::IgnoreRaces); } IndexStmt scheduleSpGEMMCPU(IndexStmt stmt, bool doPrecompute) { @@ -128,6 +145,27 @@ IndexStmt scheduleSDDMMCPU(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, .parallelize(kpos1, ParallelUnit::CPUVector, OutputRaceStrategy::ParallelReduction); } +IndexStmt scheduleSDDMMISPC1(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { + IndexVar i0("i0"), i1("i1"), kpos("kpos"), kpos0("kpos0"), kpos1("kpos1"); + return stmt.split(i, i0, i1, CHUNK_SIZE) + .pos(k, kpos, B(i,k)) + .split(kpos, kpos0, kpos1, UNROLL_FACTOR) + .reorder({i0, i1, kpos0, j, kpos1}) + .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces) + .parallelize(kpos1, ParallelUnit::CPUSimd, OutputRaceStrategy::ParallelReduction); +} + +IndexStmt scheduleSDDMMISPC2(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { + IndexVar i0("i0"), i1("i1"), kpos("kpos"), kpos0("kpos0"), kpos1("kpos1"); + return stmt; + // .split(i, i0, i1, CHUNK_SIZE) + // .pos(k, kpos, B(i,k)) + // .split(kpos, kpos0, kpos1, UNROLL_FACTOR) + // .reorder({i0, i1, kpos0, j, kpos1}) + // .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces) + // .parallelize(kpos1, ParallelUnit::CPUSimd, OutputRaceStrategy::ParallelReduction); +} + IndexStmt scheduleTTVCPU(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16) { IndexVar f("f"), fpos("fpos"), chunk("chunk"), fpos2("fpos2"); return stmt.fuse(i, j, f) @@ -1550,24 +1588,80 @@ TEST(generate_ispc_evaluation_files, ispc) { stringstream source2; std::shared_ptr codegen = ir::CodeGen::init_default(source1, source2, ir::CodeGen::ImplementationGen); Tensor A("A", {NUM_I, NUM_J}, CSR); - Tensor B("B", {NUM_J, NUM_K}, {Dense, Dense}); - Tensor C("C", {NUM_I, NUM_K}, {Dense, Dense}); - C(i, k) = A(i, j) * B(j, k); - IndexStmt stmt = C.getAssignment().concretize(); + Tensor X("X", {NUM_J, NUM_K}, {Dense, Dense}); + Tensor Y("Y", {NUM_I, NUM_K}, {Dense, Dense}); + Y(i, k) = A(i, j) * X(j, k); + IndexStmt stmt = Y.getAssignment().concretize(); bool isFirst = true; for (auto paramSet : spmm_parameters) { - IndexStmt scheduled = scheduleSpMMISPC(stmt, A, paramSet[0], paramSet[1]); - ir::Stmt compute = lower(scheduled, string("compute_") + util::join(paramSet, "_"), false, true); + IndexStmt scheduled = scheduleSpMMISPC1(stmt, A, paramSet[0], paramSet[1]); + ir::Stmt compute = lower(scheduled, string("compute1_") + util::join(paramSet, "_"), false, true); + codegen->compile(compute, isFirst); + isFirst = false; + } + ofstream source_file; + source_file.open(file_path + "spmm_csr_ispc_taco1" + c_file_ending); + source_file << source1.str(); + source_file.close(); + + ofstream ispc_source_file; + ispc_source_file.open(file_path + "__spmm_csr_ispc_taco1" + file_ending); + ispc_source_file << source2.str(); + ispc_source_file.close(); + } + + // spmm + { + stringstream source1; + stringstream source2; + std::shared_ptr codegen = ir::CodeGen::init_default(source1, source2, ir::CodeGen::ImplementationGen); + Tensor A("A", {NUM_I, NUM_J}, CSR); + Tensor X("X", {NUM_J, NUM_K}, {Dense, Dense}); + Tensor Y("Y", {NUM_I, NUM_K}, {Dense, Dense}); + Y(i, k) = A(i, j) * X(j, k); + IndexStmt stmt = Y.getAssignment().concretize(); + bool isFirst = true; + for (auto paramSet : spmm_parameters) { + IndexStmt scheduled = scheduleSpMMISPC2(stmt, A, paramSet[0], paramSet[1]); + ir::Stmt compute = lower(scheduled, string("compute2_") + util::join(paramSet, "_"), false, true); + codegen->compile(compute, isFirst); + isFirst = false; + } + ofstream source_file; + source_file.open(file_path + "spmm_csr_ispc_taco2" + c_file_ending); + source_file << source1.str(); + source_file.close(); + + ofstream ispc_source_file; + ispc_source_file.open(file_path + "__spmm_csr_ispc_taco2" + file_ending); + ispc_source_file << source2.str(); + ispc_source_file.close(); + } + + // spmm + { + stringstream source1; + stringstream source2; + std::shared_ptr codegen = ir::CodeGen::init_default(source1, source2, ir::CodeGen::ImplementationGen); + Tensor A("A", {NUM_I, NUM_J}, CSR); + Tensor X("X", {NUM_J, NUM_K}, {Dense, Dense}); + Tensor Y("Y", {NUM_I, NUM_K}, {Dense, Dense}); + Y(i, k) = A(i, j) * X(j, k); + IndexStmt stmt = Y.getAssignment().concretize(); + bool isFirst = true; + for (auto paramSet : spmm_parameters) { + IndexStmt scheduled = scheduleSpMMISPC3(stmt, A, paramSet[0], paramSet[1]); + ir::Stmt compute = lower(scheduled, string("compute3_") + util::join(paramSet, "_"), false, true); codegen->compile(compute, isFirst); isFirst = false; } ofstream source_file; - source_file.open(file_path + "spmm_csr_ispc_taco" + c_file_ending); + source_file.open(file_path + "spmm_csr_ispc_taco3" + c_file_ending); source_file << source1.str(); source_file.close(); ofstream ispc_source_file; - ispc_source_file.open(file_path + "__spmm_csr_ispc_taco" + file_ending); + ispc_source_file.open(file_path + "__spmm_csr_ispc_taco3" + file_ending); ispc_source_file << source2.str(); ispc_source_file.close(); } @@ -1576,6 +1670,99 @@ TEST(generate_ispc_evaluation_files, ispc) { return; } + + +TEST(generate_ispc_sddmm_evaluation_files, ispc) { + std::cout << "Hi Adhitha!\n" << std::endl ; + set_CUDA_codegen_enabled(false); + set_ISPC_codegen_enabled(true); + + vector> spmv_parameters = {{32}}; + vector> spmspv_parameters = {{8}}; + + // 4 to 512 and 4, 8, 16 + vector> spmm_dcsr_parameters = {{16, 8}}; + vector> spmm_parameters = {{16,4}}; + + vector> mttkrp_parameters = {}; + mttkrp_parameters.push_back({64,0}); + + vector> sddmm_parameters = {{8, 8}}; + vector> ttv_parameters = {{32}}; + + int NUM_I = 100; + int NUM_J = 100; + int NUM_K = 100; + + string c_file_ending = ".h"; + string file_ending = ".ispc"; + string file_path = "eval_prepared_ispc/sddmm/"; + mkdir(file_path.c_str(), 0777); + + // sddmm + { + stringstream source1; + stringstream source2; + std::shared_ptr codegen = ir::CodeGen::init_default(source1, source2, ir::CodeGen::ImplementationGen); + Tensor A("A", {NUM_I, NUM_K}, {Dense, Dense}); + Tensor B("B", {NUM_I, NUM_K}, CSR); + Tensor C("C", {NUM_I, NUM_J}, {Dense, Dense}); + Tensor D("D", {NUM_J, NUM_K}, {Dense, Dense}); + A(i,k) = B(i,k) * C(i,j) * D(j,k); + IndexStmt stmt = A.getAssignment().concretize(); + bool isFirst = true; + for (auto paramSet : sddmm_parameters) { + IndexStmt scheduled = scheduleSDDMMISPC1(stmt, B, paramSet[0], paramSet[1]); + ir::Stmt compute = lower(scheduled, string("compute1_") + util::join(paramSet, "_"), false, true); + codegen->compile(compute, isFirst); + isFirst = false; + } + ofstream source_file; + source_file.open(file_path + "sddmm_cpu_ispc_taco1" + file_ending); + source_file << source1.str(); + source_file.close(); + + ofstream ispc_source_file; + ispc_source_file.open(file_path + "__sddmm_cpu_ispc_taco1" + file_ending); + ispc_source_file << source2.str(); + ispc_source_file.close(); + } + + + // sddmm + { + stringstream source1; + stringstream source2; + std::shared_ptr codegen = ir::CodeGen::init_default(source1, source2, ir::CodeGen::ImplementationGen); + Tensor Y("Y", {NUM_I, NUM_K}, {Dense, Dense}); + Tensor A("A", {NUM_I, NUM_K}, CSR); + Tensor X("X", {NUM_I, NUM_J}, {Dense, Dense}); + Y(i,j) = A(i,j) * X(i,k) * X(j,k); + IndexStmt stmt = Y.getAssignment().concretize(); + bool isFirst = true; + for (auto paramSet : sddmm_parameters) { + IndexStmt scheduled = scheduleSDDMMISPC2(stmt, A, paramSet[0], paramSet[1]); + ir::Stmt compute = lower(scheduled, string("compute2_") + util::join(paramSet, "_"), false, true); + codegen->compile(compute, isFirst); + isFirst = false; + } + ofstream source_file; + source_file.open(file_path + "sddmm_cpu_ispc_taco2" + file_ending); + source_file << source1.str(); + source_file.close(); + + ofstream ispc_source_file; + ispc_source_file.open(file_path + "__sddmm_cpu_ispc_taco2" + file_ending); + ispc_source_file << source2.str(); + ispc_source_file.close(); + } + + + return; +} + + + TEST(generate_evaluation_files, cpu) { if (should_use_CUDA_codegen()) { return; diff --git a/tools/taco.cpp b/tools/taco.cpp index 9a864a699..bf7e7c9dc 100644 --- a/tools/taco.cpp +++ b/tools/taco.cpp @@ -265,7 +265,7 @@ static void printSchedulingHelp() { "an output race strategy `strat`. Since the other transformations " "expect serial code, parallelize must come last in a series of " "transformations. Possible parallel hardware units are: " - "NotParallel, GPUBlock, GPUWarp, GPUThread, CPUThread, CPUVector. " + "NotParallel, GPUBlock, GPUWarp, GPUThread, CPUThread, CPUVector, CPUSimd, CPUSimd. " "Possible output race strategies are: " "IgnoreRaces, NoRaces, Atomics, Temporary, ParallelReduction."); } @@ -313,7 +313,8 @@ static void printCommandLine(ostream& os, int argc, char* argv[]) { } } -static bool setSchedulingCommands(vector> scheduleCommands, parser::Parser& parser, IndexStmt& stmt) { +static int setSchedulingCommands(vector> scheduleCommands, parser::Parser& parser, IndexStmt& stmt) { + std::cout << "setting scheduling commands\n"; auto findVar = [&stmt](string name) { ProvenanceGraph graph(stmt); for (auto v : graph.getAllIndexVars()) { @@ -326,9 +327,15 @@ static bool setSchedulingCommands(vector> scheduleCommands, parse abort(); // to silence a warning: control reaches end of non-void function }; - bool isGPU = false; + int isGPU = 0; + int isISPC = 0; for(vector scheduleCommand : scheduleCommands) { + std::cout << "running schedluing command: "; + for (auto &command : scheduleCommand) { + std::cout << command << " "; + } + std::cout << std::endl; string command = scheduleCommand[0]; scheduleCommand.erase(scheduleCommand.begin()); @@ -541,7 +548,15 @@ static bool setSchedulingCommands(vector> scheduleCommands, parse parallel_unit = ParallelUnit::CPUThread; } else if (unit == "CPUVector") { parallel_unit = ParallelUnit::CPUVector; - } else { + } else if (unit == "CPUSimd") { + isISPC = true; + parallel_unit = ParallelUnit::CPUSimd; + } + else if (unit == "CPUSpmd") { + parallel_unit = ParallelUnit::CPUSpmd; + isISPC = true; + } + else { taco_uerror << "Parallel hardware not defined."; goto end; } @@ -562,6 +577,8 @@ static bool setSchedulingCommands(vector> scheduleCommands, parse goto end; } + std::cout << "stmt before parallelizing the statement: " << stmt << endl; + std::cout << "ParallelUnit: " << ParallelUnit_NAMES[(int) parallel_unit] << ", outputRaceStrategy: " << OutputRaceStrategy_NAMES[(int) output_race_strategy] << std::endl; stmt = stmt.parallelize(findVar(i), parallel_unit, output_race_strategy); } else if (command == "assemble") { @@ -617,7 +634,13 @@ static bool setSchedulingCommands(vector> scheduleCommands, parse end:; } - return isGPU; + if (isGPU) { + return 1; + } + else if (isISPC) { + return 2; + } + return 0; } int main(int argc, char* argv[]) { @@ -1011,6 +1034,8 @@ int main(int argc, char* argv[]) { } } + std::cout << "cuda: " << cuda << ", ispc: " << ispc << std::endl; + // Print compute is the default if nothing else was asked for if (!printAssemble && !printEvaluate && !printIterationGraph && !writeCompute && !writeAssemble && !writeKernels && !readKernels && @@ -1019,6 +1044,7 @@ int main(int argc, char* argv[]) { } // pre-parse expression, to determine existence and order of loaded tensors + std::cout << "pre-parse expression, to determine existence and order of loaded tensors\n"; map loadedTensors; TensorBase temp_tensor; parser::Parser temp_parser(exprStr, formats, dataTypes, tensorsDimensions, loadedTensors, 42); @@ -1124,15 +1150,22 @@ int main(int argc, char* argv[]) { IndexStmt stmt = makeConcreteNotation(makeReductionNotation(tensor.getAssignment())); + std::cout << "concrete index statement: " << stmt << std::endl; + stmt = justTraverseThroughTheIndexStmt(stmt); stmt = reorderLoopsTopologically(stmt); + std::cout << "topologically reordered loops statement: " << stmt << std::endl; if (setSchedule) { - cuda |= setSchedulingCommands(scheduleCommands, parser, stmt); + int val = setSchedulingCommands(scheduleCommands, parser, stmt); + cuda |= (val==1); + ispc |= (val==2); } else { stmt = insertTemporaries(stmt); stmt = parallelizeOuterLoop(stmt); } + std::cout << "after setting the scheduling commands\n"; + std::cout << stmt << std::endl; if (cuda) { if (!CUDA_BUILT && benchmark) { @@ -1153,6 +1186,7 @@ int main(int argc, char* argv[]) { set_ISPC_codegen_enabled(false); } + std::cout << "running scalar promote\n" << std::endl; stmt = scalarPromote(stmt); if (printConcrete) { cout << stmt << endl; @@ -1240,6 +1274,7 @@ int main(int argc, char* argv[]) { } } else { + std::cout << "lowering stmt: " << stmt << std::endl; compute = lower(stmt, prefix+"compute", computeWithAssemble, true); assemble = lower(stmt, prefix+"assemble", true, false); evaluate = lower(stmt, prefix+"evaluate", true, true); From 0a4169728d9d6bcdfc1b1dabc40a0daf7e7e1e0a Mon Sep 17 00:00:00 2001 From: Adhhitha Dias Date: Mon, 26 Jul 2021 19:43:37 -0400 Subject: [PATCH 5/8] add tests and ispc compilation --- include/taco/codegen/module.h | 1 + src/codegen/codegen.cpp | 4 +- src/codegen/codegen_ispc.cpp | 249 ++++++++++---- src/codegen/codegen_ispc.h | 4 +- src/codegen/module.cpp | 79 ++++- src/tensor.cpp | 6 +- taco-uml.wsd | 411 +++++++++++++++++++++++ test/test.cpp | 14 + test/test.h | 1 + test/tests-scheduling-eval.cpp | 575 ++++++++++++++++++++++++++++++++- 10 files changed, 1263 insertions(+), 81 deletions(-) create mode 100644 taco-uml.wsd diff --git a/include/taco/codegen/module.h b/include/taco/codegen/module.h index 36eb34f1a..3df7c8e0f 100644 --- a/include/taco/codegen/module.h +++ b/include/taco/codegen/module.h @@ -68,6 +68,7 @@ class Module { private: std::stringstream source; + std::stringstream additional_source; std::stringstream header; std::string libname; std::string tmpdir; diff --git a/src/codegen/codegen.cpp b/src/codegen/codegen.cpp index 7081bc195..6ec54a2f8 100644 --- a/src/codegen/codegen.cpp +++ b/src/codegen/codegen.cpp @@ -265,9 +265,9 @@ string CodeGen::getUnpackedTensorArgument(string varname, const GetProperty* op, // all others are int* if (op->property == TensorProperty::Dimension) { if (op->type == Int32) { - ret << "int32 "; + ret << "uniform int32 "; } else if (op->type == Int64) { - ret << "int64 "; + ret << "uniform int64 "; } else { ret << "int "; } diff --git a/src/codegen/codegen_ispc.cpp b/src/codegen/codegen_ispc.cpp index c8480cd25..237bc822d 100644 --- a/src/codegen/codegen_ispc.cpp +++ b/src/codegen/codegen_ispc.cpp @@ -6,10 +6,12 @@ #include #include "taco/cuda.h" +#include "taco/ir/ir_printer.h" #include "taco/ir/ir_visitor.h" #include "taco/ir/ir_rewriter.h" #include "taco/ir/simplify.h" +#include "codegen_c.h" #include "codegen_ispc.h" #include "taco/error.h" #include "taco/util/strings.h" @@ -295,6 +297,7 @@ class CodeGen_ISPC::DeviceFunctionCollector : public IRVisitor { virtual void visit(const For *op) { if (op->parallel_unit == ParallelUnit::CPUSpmd) { std::cout << "ParallelUnit::CPUSpmd directive found\n"; + inDeviceFunction = false; op->var.accept(this); inDeviceFunction = true; @@ -380,6 +383,8 @@ void CodeGen_ISPC::compile(Stmt stmt, bool isFirst) { stmt.accept(this); } + + string CodeGen_ISPC::printCallISPCFunc(const Function *func, map varMap, vector &sortedProps) { std::stringstream ret; @@ -388,9 +393,6 @@ string CodeGen_ISPC::printCallISPCFunc(const Function *func, mapname << "("; - vector inputs = func->inputs; - vector outputs = func->outputs; - getSortedProps(varMap, sortedProps, inputs, outputs); for (unsigned long i=0; i < sortedProps.size(); i++) { ret << varMap[sortedProps[i]]; @@ -404,50 +406,123 @@ string CodeGen_ISPC::printCallISPCFunc(const Function *func, map varMap, +// varMap is already sorted <- make sure to pass the sorted varMap +void CodeGen_ISPC::printISPCFunc(const Function *func, map varMap, vector &sortedProps) { DeviceFunctionCollector deviceFunctionCollector(func->inputs, func->outputs, this); func->body.accept(&deviceFunctionCollector); - - std::stringstream ret; - ret << "export void "; - unordered_set propsAlreadyGenerated; - - ret << "__" << func->name << "("; - + std::stringstream variables; vector inputs = func->inputs; vector outputs = func->outputs; - // getSortedProps(varMap, sortedProps, inputs, outputs); + unordered_set propsAlreadyGenerated; - for (unsigned long i=0; i < sortedProps.size(); i++) { - auto prop = sortedProps[i]; - bool isOutputProp = (find(outputs.begin(), outputs.end(), - prop->tensor) != outputs.end()); - - auto var = prop->tensor.as(); - if (var->is_parameter) { - if (isOutputProp) { - ret << " " << printTensorProperty(varMap[prop], prop, false) << ";" << endl; + for (unsigned long i=0; i < sortedProps.size(); i++) { + auto prop = sortedProps[i]; + bool isOutputProp = (find(outputs.begin(), outputs.end(), + prop->tensor) != outputs.end()); + + auto var = prop->tensor.as(); + if (var->is_parameter) { + if (isOutputProp) { + variables << " " << printTensorProperty(varMap[prop], prop, false) << ";" << endl; + } else { + break; + } } else { - break; + variables << getUnpackedTensorArgument(varMap[prop], prop, isOutputProp); } - } else { - ret << getUnpackedTensorArgument(varMap[prop], prop, isOutputProp); - } - propsAlreadyGenerated.insert(varMap[prop]); + propsAlreadyGenerated.insert(varMap[prop]); - if (i!=sortedProps.size()-1) { - ret << ", "; - } - if (i%2==0) { - ret << "\n\t"; + if (i!=sortedProps.size()-1) { + variables << ", "; + } + if (i%2==0) { + variables << "\n\t"; + } } + + resetUniqueNameCounters(); + for (size_t i = 0; i < deviceFunctionCollector.threadFors.size(); i++) { + + const For *threadloop = to(deviceFunctionCollector.threadFors[i]); + taco_iassert(threadloop->parallel_unit == ParallelUnit::CPUSpmd); + Stmt function = threadloop->contents; + std::cout << "threadloop function: " << function << std::endl; + + out2 << "static task void __" << func->name << "__ ("; + out2 << variables.str(); + out2 << "\n) {\n\n"; + + indent++; + doIndent(); + // output body + print(threadloop); + indent--; + out2 << "}\n"; + + out2 << "export void __" << func->name << "("; + out2 << variables.str(); + out2 << "\n) {\n\n"; + indent++; + doIndent(); + out2 << "launch[4] " << printCallISPCFunc(func, varMap, sortedProps) << "\n"; + indent--; + out2 << "}\n"; + } - ret << "\n) {\n\n"; - return ret.str(); + if (deviceFunctionCollector.threadFors.size()==0) { + out2 << "export void __" << func->name << " ("; + out2 << variables.str(); + out2 << "\n) {\n\n"; + + indent++; + doIndent(); + // output body + print(func->body); + indent--; + out2 << "}\n"; + } + + // out2 << "export void "; + + // out2 << "__" << func->name << "("; + + // for (unsigned long i=0; i < sortedProps.size(); i++) { + // auto prop = sortedProps[i]; + // bool isOutputProp = (find(outputs.begin(), outputs.end(), + // prop->tensor) != outputs.end()); + + // auto var = prop->tensor.as(); + // if (var->is_parameter) { + // if (isOutputProp) { + // out2 << " " << printTensorProperty(varMap[prop], prop, false) << ";" << endl; + // } else { + // break; + // } + // } else { + // out2 << getUnpackedTensorArgument(varMap[prop], prop, isOutputProp); + // } + // propsAlreadyGenerated.insert(varMap[prop]); + + // if (i!=sortedProps.size()-1) { + // out2 << ", "; + // } + // if (i%2==0) { + // out2 << "\n\t"; + // } + // } + // out2 << "\n) {\n\n"; + + // indent++; + // doIndent(); + // // output body + // print(func->body); + // indent--; + // out2 << "}\n"; + } void CodeGen_ISPC::sendToStream(std::stringstream &stream) { @@ -461,6 +536,75 @@ void CodeGen_ISPC::sendToStream(std::stringstream &stream) { void CodeGen_ISPC::visit(const Function* func) { // if generating a header, protect the function declaration with a guard + if (func->name == "assemble") { + if (outputKind == HeaderGen) { + out << "#ifndef TACO_GENERATED_" << func->name << "\n"; + out << "#define TACO_GENERATED_" << func->name << "\n"; + } + + int numYields = countYields(func); + emittingCoroutine = (numYields > 0); + funcName = func->name; + labelCount = 0; + + resetUniqueNameCounters(); + FindVars inputVarFinder(func->inputs, {}, this); + func->body.accept(&inputVarFinder); + FindVars outputVarFinder({}, func->outputs, this); + func->body.accept(&outputVarFinder); + + // output function declaration + doIndent(); + out << printFuncName(func, inputVarFinder.varDecls, outputVarFinder.varDecls); + + // if we're just generating a header, this is all we need to do + if (outputKind == HeaderGen) { + out << ";\n"; + out << "#endif\n"; + return; + } + + out << " {\n"; + + indent++; + + // find all the vars that are not inputs or outputs and declare them + resetUniqueNameCounters(); + FindVars varFinder(func->inputs, func->outputs, this); + func->body.accept(&varFinder); + varMap = varFinder.varMap; + localVars = varFinder.localVars; + + // Print variable declarations + out << printDecls(varFinder.varDecls, func->inputs, func->outputs) << endl; + + if (emittingCoroutine) { + out << printContextDeclAndInit(varMap, localVars, numYields, func->name) + << endl; + } + + // output body + print(func->body); + + // output repack only if we allocated memory + if (checkForAlloc(func)) + out << endl << printPack(varFinder.outputProperties, func->outputs); + + if (emittingCoroutine) { + out << printCoroutineFinish(numYields, funcName); + } + + doIndent(); + out << "return 0;\n"; + indent--; + + doIndent(); + out << "}\n"; + return; + + } + + if (outputKind == HeaderGen) { out << "#ifndef TACO_GENERATED_" << func->name << "\n"; out << "#define TACO_GENERATED_" << func->name << "\n"; @@ -503,6 +647,9 @@ void CodeGen_ISPC::visit(const Function* func) { out << printDecls(varFinder.varDecls, func->inputs, func->outputs) << endl; vector sortedProps; + vector inputs = func->inputs; + vector outputs = func->outputs; + getSortedProps(varFinder.varDecls, sortedProps, inputs, outputs); out << printCallISPCFunc(func, varFinder.varDecls, sortedProps); if (emittingCoroutine) { @@ -526,13 +673,7 @@ void CodeGen_ISPC::visit(const Function* func) { out << "}\n\n"; set_ISPC_code_stream_enabled(true); - out2 << printISPCFunc(func, varFinder.varDecls, sortedProps); - indent++; - doIndent(); - // output body - print(func->body); - indent--; - out2 << "}\n"; + printISPCFunc(func, varFinder.varDecls, sortedProps); set_ISPC_code_stream_enabled(false); } @@ -655,20 +796,20 @@ void CodeGen_ISPC::visit(const For* op) { case LoopKind::Runtime: case LoopKind::Static_Chunked: case LoopKind::Mul_Thread: - op->start.accept(this); - stream2 << std::endl; - op->start.accept(this); - stream2 << std::endl; - op->start.accept(this); - stream2 << std::endl; - op->start.accept(this); - stream2 << std::endl; - op->end.accept(this); - stream2 << std::endl; - op->end.accept(this); - stream2 << std::endl; - op->end.accept(this); - stream2 << std::endl; + // op->start.accept(this); + // stream2 << std::endl; + // op->start.accept(this); + // stream2 << std::endl; + // op->start.accept(this); + // stream2 << std::endl; + // op->start.accept(this); + // stream2 << std::endl; + // op->end.accept(this); + // stream2 << std::endl; + // op->end.accept(this); + // stream2 << std::endl; + // op->end.accept(this); + // stream2 << std::endl; default: break; } diff --git a/src/codegen/codegen_ispc.h b/src/codegen/codegen_ispc.h index 279d0db7a..08e73b252 100644 --- a/src/codegen/codegen_ispc.h +++ b/src/codegen/codegen_ispc.h @@ -5,7 +5,7 @@ #include "taco/ir/ir.h" #include "taco/ir/ir_printer.h" -#include "codegen.h" +#include "codegen_c.h" namespace taco { namespace ir { @@ -46,7 +46,7 @@ class CodeGen_ISPC : public CodeGen { Stmt simplifyFunctionBodies(Stmt stmt); std::string printCallISPCFunc(const Function *func, std::map varMap, std::vector &sortedProps); - std::string printISPCFunc(const Function *func, std::map varMap, + void printISPCFunc(const Function *func, std::map varMap, std::vector &sortedProps); std::map varMap; diff --git a/src/codegen/module.cpp b/src/codegen/module.cpp index d9cbe2edc..82b736a13 100644 --- a/src/codegen/module.cpp +++ b/src/codegen/module.cpp @@ -43,6 +43,7 @@ void Module::addFunction(Stmt func) { void Module::compileToSource(string path, string prefix) { if (!moduleFromUserSource) { + std::cout << "module not from user source\n"; // create a codegen instance and add all the funcs bool didGenRuntime = false; @@ -51,11 +52,13 @@ void Module::compileToSource(string path, string prefix) { header.clear(); source.str(""); source.clear(); + additional_source.str(""); + additional_source.clear(); taco_tassert(target.arch == Target::C99) << "Only C99 codegen supported currently"; std::shared_ptr sourcegen = - CodeGen::init_default(source, CodeGen::ImplementationGen); + CodeGen::init_default(source, additional_source, CodeGen::ImplementationGen); std::shared_ptr headergen = CodeGen::init_default(header, CodeGen::HeaderGen); @@ -69,8 +72,17 @@ void Module::compileToSource(string path, string prefix) { ofstream source_file; string file_ending = should_use_CUDA_codegen() ? ".cu" : ".c"; source_file.open(path+prefix+file_ending); + if (should_use_ISPC_codegen()) { + source_file << "#include \"" << path+prefix+"_ispc.h\"\n"; + } source_file << source.str(); source_file.close(); + + ofstream additional_source_file; + string file_ending2 = ".ispc"; + additional_source_file.open(path+prefix+file_ending2); + additional_source_file << additional_source.str(); + additional_source_file.close(); ofstream header_file; header_file.open(path+prefix+".h"); @@ -90,9 +102,9 @@ void writeShims(vector funcs, string path, string prefix) { if (should_use_CUDA_codegen()) { CodeGen_CUDA::generateShim(func, shims); } - else if (should_use_ISPC_codegen()) { - CodeGen_ISPC::generateShim(func, shims); - } + // else if (should_use_ISPC_codegen()) { + // CodeGen_ISPC::generateShim(func, shims); + // } else { CodeGen_C::generateShim(func, shims); } @@ -102,9 +114,9 @@ void writeShims(vector funcs, string path, string prefix) { if (should_use_CUDA_codegen()) { shims_file.open(path+prefix+"_shims.cpp"); } - else if (should_use_ISPC_codegen()) { - shims_file.open(path+prefix+".ispc", ios::app); - } + // else if (should_use_ISPC_codegen()) { + // shims_file.open(path+prefix+".c", ios::app); + // } else { shims_file.open(path+prefix+".c", ios::app); } @@ -131,12 +143,13 @@ string Module::compile() { file_ending = ".cu"; shims_file = prefix + "_shims.cpp"; } - else if (should_use_ISPC_codegen()) { - cc = util::getFromEnv(target.compiler_env, target.compiler); - cflags = util::getFromEnv("TACO_CFLAGS", - "-O3 -ffast-math -std=c99") + " -shared -fPIC"; - - } + // else if (should_use_ISPC_codegen()) { + // cc = util::getFromEnv("TACO_ISPC", "ispc"); + // cflags = util::getFromEnv("TACO_ISPC_FLAGS", + // " --target=sse2-i32x4,sse4-i32x8,avx1-i32x8,avx2-i32x8,avx512knl-i32x16,avx512skx-i32x16 --pic -O3 --addressing=64 --arch=x86-64" + // ) + " "; + + // } else { cc = util::getFromEnv(target.compiler_env, target.compiler); cflags = util::getFromEnv("TACO_CFLAGS", @@ -151,9 +164,15 @@ string Module::compile() { string cmd = cc + " " + cflags + " " + prefix + file_ending + " " + shims_file + " " + "-o " + fullpath + " -lm"; + std::cout << "--------------------------------------------------------------------------------tmpdir: " << tmpdir << std::endl; + std::cout << "--------------------------------------------------------------------------------libname: " << libname << std::endl; + std::cout << "--------------------------------------------------------------------------------prefix: " << prefix << std::endl; + std::cout << "--------------------------------------------------------------------------------fullpath: " << fullpath << std::endl; + std::cout << "--------------------------------------------------------------------------------cmd: " << cmd << std::endl; // open the output file & write out the source compileToSource(tmpdir, libname); + // write out the shims writeShims(funcs, tmpdir, libname); @@ -164,10 +183,36 @@ string Module::compile() { } std::cout << tmpdir << std::endl << libname << std::endl; - // now compile it - int err = system(cmd.data()); - taco_uassert(err == 0) << "Compilation command failed:\n" << cmd - << "\nreturned " << err; + if (should_use_ISPC_codegen()) { + string ispc = util::getFromEnv("TACO_ISPC", "ispc"); + string ispcflags = util::getFromEnv("TACO_ISPC_FLAGS", + " --target=sse2-i32x4,sse4-i32x8,avx1-i32x8,avx2-i32x8,avx512knl-i32x16,avx512skx-i32x16 --pic -O3 --addressing=64 --arch=x86-64" + ) + " "; + string cmd = ispc + " " + ispcflags + " -o " + prefix + ".ispc.o " + " --emit-obj " + prefix + ".ispc " + "-h " + prefix + "_ispc.h"; + + // now compile the ispc file to generate the object file and the ispc header file + std::cout << "--------------------------------------------------------------------------------cmd: " << cmd << std::endl; + int err = system(cmd.data()); + taco_uassert(err == 0) << "Compilation command failed:\n" << cmd + << "\nreturned " << err; + + string ispc_object_file = " " + prefix + ".ispc.o "; + string ispc_object_files_for_diff_targets = " " + prefix + ".ispc_* "; + cmd = cc + " " + cflags + " " + + prefix + file_ending + " " + ispc_object_file + ispc_object_files_for_diff_targets + shims_file + " " + + "-o " + fullpath + " -lm -lrt "; + + // now compile the c file linking the ispc object file. ispc header is added to the top of the c file + std::cout << "--------------------------------------------------------------------------------cmd: " << cmd << std::endl; + err = system(cmd.data()); + taco_uassert(err == 0) << "Compilation command failed:\n" << cmd + << "\nreturned " << err; + } else { + // now compile it + int err = system(cmd.data()); + taco_uassert(err == 0) << "Compilation command failed:\n" << cmd + << "\nreturned " << err; + } // use dlsym() to open the compiled library if (lib_handle) { diff --git a/src/tensor.cpp b/src/tensor.cpp index dac2c3fd2..5e02d2660 100644 --- a/src/tensor.cpp +++ b/src/tensor.cpp @@ -808,9 +808,9 @@ void TensorBase::assemble() { void TensorBase::compute() { taco_uassert(!needsCompile()) << error::compute_without_compile; - if (!needsCompute()) { - return; - } + // if (!needsCompute()) { + // return; + // } setNeedsCompute(false); // Sync operand tensors if needed. auto operands = getTensors(getAssignment().getRhs()); diff --git a/taco-uml.wsd b/taco-uml.wsd new file mode 100644 index 000000000..4b8e39802 --- /dev/null +++ b/taco-uml.wsd @@ -0,0 +1,411 @@ +@startuml taco +scale 1 + + +class IntrusivePtr { + +T *ptr +} +class Uncopyable {} + +class IRNode { + +virtual void accept(IRVisitorStrict *v) const = 0 + +virtual IRNodeType type_info() const = 0; +} + +class BaseStmtNode {} +class BaseExprNode { + +Datatype type +} + +class StmtNode { + +void accept(IRVisitorStrict *v) const +} +class ExprNode { + +void accept(IRVisitorStrict *v) const +} + +Uncopyable <|-- IRNode +IRNode <|-- BaseStmtNode +IRNode <|-- BaseExprNode +BaseStmtNode <|-- StmtNode +BaseExprNode <|-- ExprNode + +class IRHandle { + +void accept(IRVisitorStrict *v) const +} +class Expr {} +class Stmt {} + +IntrusivePtr <|-- IRHandle +IRHandle <|-- Expr +IRHandle <|-- Stmt + +IRHandle "1" *-- "1" IRNode : contains + + + +' this class is abstract but plantuml version does not support interface keyword +interface IRVisitorStrict { + +virtual void visit(const IRNode*) const = 0 +} + +/' +IRVisitor is not an interface or abstract because it +has not pure virtual methods +'/ +class IRVisitor { + +virtual void visit(const IRNode*) +} + +class IRRewriter { + ' protected fields and methods + #Expr expr + #Stmt stmt + + #virtual void visit(const ExprNode* op) + #virtual void visit(const StmtNode* op) + + ' public fields and methods + +Expr rewrite(Expr) + +Stmt rewrite(Stmt) +} +class IRPrinter { + #std::ostream &stream + #std::ostream &stream2 + #int indent + #bool color + #bool simplify + #enum Precedence + #Precedence parentPrecedence = BOTTOM + #NameGenerator varNameGenerator + #scopedMap varNames + + #void doIndent() + #void printBinOp(Expr a, Expr b, std::string op, Precedence precedence) + #void fewMoreMethods() + + #virtual void visit(const ExprNode*) + #virtual void visit(const StmtNode*) + + +setColor(bool color) + +print(Stmt) +} +class IRVerifier {} + +IRVisitorStrict <|-- IRVisitor +IRVisitorStrict <|-- IRPrinter +IRVisitorStrict <|-- IRRewriter +IRVisitor <|-- IRVerifier + +' Inheritance from IRRewriter +' simplifier for ir::Expr +class ExpressionSimplifier {} +IRRewriter <|-- ExpressionSimplifier + +' simplifiers for ir::Stmt +class RemoveRedundantStatements {} +class RemoveRedundantLoops {} +class RemoveDuplicateBody {} + +IRRewriter <|-- RemoveRedundantStatements +IRRewriter <|-- RemoveRedundantLoops +IRRewriter <|-- RemoveDuplicateBody + + +' Inheritance from IRPrinter +class CodeGen {} +class CodeGen_C {} +class CodeGen_CUDA {} +class CodeGen_ISPC { + -class FindVars +} + +class FindVars {} + +IRPrinter <|-- CodeGen +CodeGen <|-- CodeGen_C +CodeGen <|-- CodeGen_ISPC +CodeGen <|-- CodeGen_CUDA + +IRVisitor <|-- FindVars +CodeGen_ISPC +-- FindVars + +class Manageable {} +class IndexStmtNode { + -virtual void accept(IndexStmtVisitorStrict*) const = 0 +} +class IndexExprNode { + -virtual void accept(IndexStmtVisitorStrict*) const = 0 +} + + +Manageable <|-- IndexStmtNode +Uncopyable <|-- IndexStmtNode +Manageable <|-- IndexExprNode +Uncopyable <|-- IndexExprNode + +class IndexStmt {} +class IndexExpr {} + +IntrusivePtr <|-- IndexStmt +IndexStmt "1" *-- "1" IndexStmtNode +IntrusivePtr <|-- IndexExpr +IndexExpr "1" *-- "1" IndexExprNode + + +abstract class IndexExprVisitorStrict { + +void visit(const IndexStmt&) + +virtual void visit(const AccessNode*) = 0 + +virtual void visit(const LiteralNode*) = 0 + +virtual void visit(const NegNode*) = 0 + +virtual void visit(const AddNode*) = 0 + +virtual void visit(const SubNode*) = 0 + +virtual void visit(const MulNode*) = 0 + +virtual void visit(const DivNode*) = 0 + +virtual void visit(const SqrtNode*) = 0 + +virtual void visit(const CastNode*) = 0 + +virtual void visit(const CallIntrinsicNode*) = 0 + +virtual void visit(const ReductionNode*) = 0 +} +abstract class IndexStmtVisitorStrict { + +void visit(const IndexStmt&) + +virtual void visit(const AssignmentNode*) = 0 + +virtual void visit(const YieldNode*) = 0 + +virtual void visit(const ForallNode*) = 0 + +virtual void visit(const WhereNode*) = 0 + +virtual void visit(const SequenceNode*) = 0 + +virtual void visit(const AssembleNode*) = 0 + +virtual void visit(const MultiNode*) = 0 + +virtual void visit(const SuchThatNode*) = 0 +} + +abstract class IndexNotationVisitorStrict {} +class IndexNotationPrinter { + +void print(const IndexExpr& expr) + +void print(const IndexStmt& expr) + + ' Index Expressions visit() + +void visit(const AccessNode* node) + +void visit(const LiteralNode* node) + + void visit(const NegNode* node) + + void visit(const AddNode* node) + + void visit(const SubNode* node) + + void visit(const MulNode* node) + + void visit(const DivNode* node) + + void visit(const SqrtNode* node) + + void visit(const CastNode* node) + + void visit(const CallIntrinsicNode* node) + + void visit(const UnaryExprNode* node) + + void visit(const BinaryExprNode* node) + + void visit(const ReductionNode* node) + + ' Index Statement visit() + + void visit(const AssignmentNode* node) + + void visit(const YieldNode* node) + + void visit(const ForallNode* node) + + void visit(const WhereNode* node) + + void visit(const SequenceNode* node) + + void visit(const AssembleNode* node) + + void visit(const MultiNode* node) + + void visit(const SuchThatNode* node) +} +class IndexNotationVisitor { + ' Index Expressions visit() + +virtual void visit(const AccessNode* node) + +virtual void visit(const LiteralNode* node) + +virtual void visit(const NegNode* node) + +virtual void visit(const AddNode* node) + +virtual void visit(const SubNode* node) + +virtual void visit(const MulNode* node) + +virtual void visit(const DivNode* node) + +virtual void visit(const SqrtNode* node) + +virtual void visit(const CastNode* node) + +virtual void visit(const CallIntrinsicNode* node) + +virtual void visit(const UnaryExprNode* node) + +virtual void visit(const BinaryExprNode* node) + +virtual void visit(const ReductionNode* node) + + ' Index Statement visit() + +virtual void visit(const AssignmentNode* node) + +virtual void visit(const YieldNode* node) + +virtual void visit(const ForallNode* node) + +virtual void visit(const WhereNode* node) + +virtual void visit(const SequenceNode* node) + +virtual void visit(const AssembleNode* node) + +virtual void visit(const MultiNode* node) + +virtual void visit(const SuchThatNode* node) +} +class Matcher { + +} + +abstract class IndexExprRewriterStrict { + +IndexExpr rewrite(IndexExpr) + + #IndexExpr expr + + #virtual void visit(const AccessNode* op) = 0 + #virtual void visit(const LiteralNode* op) = 0 + #virtual void visit(const NegNode* op) = 0 + #virtual void visit(const SqrtNode* op) = 0 + #virtual void visit(const AddNode* op) = 0 + #virtual void visit(const SubNode* op) = 0 + #virtual void visit(const MulNode* op) = 0 + #virtual void visit(const DivNode* op) = 0 + #virtual void visit(const CastNode* op) = 0 + #virtual void visit(const CallIntrinsicNode* op) = 0 + #virtual void visit(const ReductionNode* op) = 0 +} +abstract class IndexStmtRewriterStrict { + +IndexStmt rewrite(IndexStmt) + + #IndexStmt stmt + + #virtual void visit(const AssignmentNode* op) = 0 + #virtual void visit(const YieldNode* op) = 0 + #virtual void visit(const ForallNode* op) = 0 + #virtual void visit(const WhereNode* op) = 0 + #virtual void visit(const SequenceNode* op) = 0 + #virtual void visit(const AssembleNode* op) = 0 + #virtual void visit(const MultiNode* op) = 0 + #virtual void visit(const SuchThatNode* op) = 0 +} +abstract class IndexNotationRewriterStrict {} +class IndexNotationRewriter { + ' Index Expressions visit() + +virtual void visit(const AccessNode* node) + +virtual void visit(const LiteralNode* node) + +virtual void visit(const NegNode* node) + +virtual void visit(const AddNode* node) + +virtual void visit(const SubNode* node) + +virtual void visit(const MulNode* node) + +virtual void visit(const DivNode* node) + +virtual void visit(const SqrtNode* node) + +virtual void visit(const CastNode* node) + +virtual void visit(const CallIntrinsicNode* node) + +virtual void visit(const UnaryExprNode* node) + +virtual void visit(const BinaryExprNode* node) + +virtual void visit(const ReductionNode* node) + + ' Index Statement visit() + +virtual void visit(const AssignmentNode* node) + +virtual void visit(const YieldNode* node) + +virtual void visit(const ForallNode* node) + +virtual void visit(const WhereNode* node) + +virtual void visit(const SequenceNode* node) + +virtual void visit(const AssembleNode* node) + +virtual void visit(const MultiNode* node) + +virtual void visit(const SuchThatNode* node) +} + + +IndexExprVisitorStrict <|-- IndexNotationVisitorStrict +IndexStmtVisitorStrict <|-- IndexNotationVisitorStrict +IndexNotationVisitorStrict <|-- IndexNotationVisitor +IndexNotationVisitorStrict <|-- IndexNotationPrinter +IndexNotationVisitor <|-- Matcher + +IndexExprVisitorStrict <|-- IndexExprRewriterStrict +IndexStmtVisitorStrict <|-- IndexStmtRewriterStrict +IndexExprRewriterStrict <|-- IndexNotationRewriterStrict +IndexStmtRewriterStrict <|-- IndexNotationRewriterStrict + +IndexNotationRewriterStrict <|-- IndexNotationRewriter + +' - private +' # protected +' ~ package private +' + public + +' {static} +' {abstract} virtual methods + +' lowering part -- convertion from IndexExpr and IndexStmt to ir::Expr and ir::Stmt +class Lowerer { + +std::shared_ptr impl; +} +abstract class LowererImpl { + ' protected fields and methods + #class Visitor; + #friend class Visitor; + #std::shared_ptr visitor; + + #virtual ir::Stmt lower(IndexStmt stmt); + #virtual ir::Expr lower(IndexExpr expr); + + #virtual ir::Expr lowerExpr(IndexExpr expr) = 0; + #virtual ir::Stmt lowerStmt(IndexStmt stmt) = 0; + + ' public fields and methods + +virtual ir::Stmt lower(IndexStmt stmt, std::string name, + bool assemble, bool compute, bool pack, bool unpack) = 0; +} + +class LowererImplImperative { + ' private fields and methods + -class Visitor + -fiend class Visitor + -std::shared_ptr visitor + -bool assemble + -bool compute + -vars a_bunch_of_other_fields + + ' protected fields and methods + #virtual ir::Stmt lowerExpr(IndexExpr expr); + #virtual ir::Stmt lowerStmt(IndexStmt stmt); + + ' public fields and methods + +ir::Stmt lower(IndexStmt stmt, std::string name, + bool assemble, bool compute, bool pack, bool unpack) + +} +note bottom of LowererImplImperative : Stmt LowererImplImperative::lower(IndexStmt stmt) {\n return visitor->lower(stmt);\n} + +Uncopyable <|-- LowererImpl +Lowerer "1" *-- "1" LowererImpl : contains + + +' visitor that does the lowering +class Visitor { + ' private fields and methods + -LowererImpl* impl + -Expr expr + -Stmt stmt + + -void visit(const AssignmentNode* node) + -void visit(const YieldNode* node) + -void visit(const ForallNode* node) + -void visit(const WhereNode* node) + -void visit(const MultiNode* node) + -void visit(const SuchThatNode* node) + -void visit(const SequenceNode* node) + -void visit(const AssembleNode* node) + -void visit(const AccessNode* node) + -void visit(const LiteralNode* node) + -void visit(const NegNode* node) + -void visit(const AddNode* node) + -void visit(const SubNode* node) + -void visit(const MulNode* node) + -void visit(const DivNode* node) + -void visit(const SqrtNode* node) + -void visit(const CastNode* node) + -void visit(const CallIntrinsicNode* node) + -void visit(const ReductionNode* node) + + ' public fields and methods + +Visitor(LowererImplImperative* impl) + +Stmt lower(IndexStmt stmt) + +Expr lower(IndexExpr expr) +} + +note bottom of Visitor: Stmt lower(IndexStmt stmt) {\n this->stmt = Stmt();\n impl->accessibleIterators.scope();\n IndexStmtVisitorStrict::visit(stmt);\n impl->accessibleIterators.unscope();\n return this->stmt;\n} + +IndexNotationVisitorStrict <|-- Visitor +LowererImpl "1" +-- "1" Visitor : contains +Visitor "1" *-- "1" LowererImpl : contains + +LowererImpl <|-- LowererImplImperative +LowererImplImperative "1" +-- "1" Visitor : contains +Visitor "1" *-- "1" LowererImplImperative : contains + +@enduml \ No newline at end of file diff --git a/test/test.cpp b/test/test.cpp index a49f10ff7..851493b7f 100644 --- a/test/test.cpp +++ b/test/test.cpp @@ -38,6 +38,20 @@ void ASSERT_TENSOR_EQ(TensorBase expected, TensorBase actual) { ASSERT_TRUE(equals(expected, actual)); } +// void ASSERT_TENSOR_VAL(TensorBase expected, TensorBase actual) { +// std::cout << "order: " << expected.getOrder(); +// std::vector modes{}; +// for (int mode = 0; mode < expected.getOrder(); mode++) { +// if (expected.getDimension(mode) != actual.getDimension(mode)) { +// ASSERT_TRUE(false); +// } + +// for (int i=0; i expected, void ASSERT_STORAGE_EQ(TensorStorage expected, TensorStorage actual); void ASSERT_TENSOR_EQ(TensorBase expected, TensorBase actual); +// void ASSERT_TENSOR_VAL(TensorBase expected, TensorBase actual); template void ASSERT_COMPONENTS_EQUALS(vector>> expectedIndices, diff --git a/test/tests-scheduling-eval.cpp b/test/tests-scheduling-eval.cpp index 93ba7b01e..4957418e0 100644 --- a/test/tests-scheduling-eval.cpp +++ b/test/tests-scheduling-eval.cpp @@ -12,6 +12,23 @@ #include "taco/index_notation/transformations.h" #include "codegen/codegen.h" #include "taco/lower/lower.h" +#include "taco/util/timers.h" + + +#define TOOL_BENCHMARK_TIMER(CODE,NAME,TIMER) { \ + if (time) { \ + taco::util::Timer timer; \ + timer.start(); \ + CODE; \ + timer.stop(); \ + taco::util::TimeResults result = timer.getResult(); \ + cout << NAME << " " << result << " ms" << endl; \ + TIMER=result; \ + } \ + else { \ + CODE; \ + } \ +} using namespace taco; const IndexVar i("i"), j("j"), k("k"), l("l"), m("m"), n("n"); @@ -52,7 +69,7 @@ IndexStmt scheduleSpMVISPC(IndexStmt stmt, int CHUNK_SIZE=16) { // return stmt; return stmt.split(i, i0, i1, CHUNK_SIZE) .reorder({i0, i1, j}) - .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces); + .parallelize(i0, ParallelUnit::CPUSimd, OutputRaceStrategy::NoRaces); } IndexStmt scheduleSpMMCPU(IndexStmt stmt, Tensor A, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { @@ -71,16 +88,42 @@ IndexStmt scheduleSpMMISPC1(IndexStmt stmt, Tensor A, int CHUNK_SIZE=16, .pos(j, jpos, A(i,j)) .split(jpos, jpos0, jpos1, UNROLL_FACTOR) .reorder({i0, i1, jpos0, k, jpos1}) - .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces) + // .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces) .parallelize(k, ParallelUnit::CPUSimd, OutputRaceStrategy::IgnoreRaces); } +IndexStmt scheduleSpMMISPC1_2(IndexStmt stmt, Tensor A, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { + IndexVar i0("i0"), i1("i1"), kbounded("kbounded"), k0("k0"), k1("k1"), jpos("jpos"), jpos0("jpos0"), jpos1("jpos1"); + return stmt.split(i, i0, i1, CHUNK_SIZE) + .pos(j, jpos, A(i,j)) + .split(jpos, jpos0, jpos1, UNROLL_FACTOR) + .reorder({i0, i1, jpos0, k, jpos1}) + // .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces) + .parallelize(i0, ParallelUnit::CPUSimd, OutputRaceStrategy::IgnoreRaces); +} + +IndexStmt scheduleSpMMISPC1_3(IndexStmt stmt, Tensor A, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { + IndexVar i0("i0"), i1("i1"), kbounded("kbounded"), k0("k0"), k1("k1"), jpos("jpos"), jpos0("jpos0"), jpos1("jpos1"); + return stmt.split(i, i0, i1, CHUNK_SIZE) + .pos(j, jpos, A(i,j)) + .split(jpos, jpos0, jpos1, UNROLL_FACTOR) + .reorder({i0, i1, jpos0, k, jpos1}) + // .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces) + .parallelize(i1, ParallelUnit::CPUSimd, OutputRaceStrategy::IgnoreRaces); +} + IndexStmt scheduleSpMMISPC2(IndexStmt stmt, Tensor A, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { IndexVar i0("i0"), i1("i1"), kbounded("kbounded"), k0("k0"), k1("k1"), jpos("jpos"), jpos0("jpos0"), jpos1("jpos1"); return stmt .parallelize(k, ParallelUnit::CPUSimd, OutputRaceStrategy::IgnoreRaces); } +IndexStmt scheduleSpMMISPC2_2(IndexStmt stmt, Tensor A, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { + IndexVar i0("i0"), i1("i1"), kbounded("kbounded"), k0("k0"), k1("k1"), jpos("jpos"), jpos0("jpos0"), jpos1("jpos1"); + return stmt + .parallelize(i, ParallelUnit::CPUSimd, OutputRaceStrategy::IgnoreRaces); +} + IndexStmt scheduleSpMMISPC3(IndexStmt stmt, Tensor A, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { IndexVar i0("i0"), i1("i1"), kbounded("kbounded"), k0("k0"), k1("k1"), jpos("jpos"), jpos0("jpos0"), jpos1("jpos1"); return stmt @@ -88,10 +131,21 @@ IndexStmt scheduleSpMMISPC3(IndexStmt stmt, Tensor A, int CHUNK_SIZE=16, // .pos(j, jpos, A(i,j)) // .split(jpos, jpos0, jpos1, UNROLL_FACTOR) .reorder({j, k}) - .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces) + // .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces) .parallelize(k, ParallelUnit::CPUSimd, OutputRaceStrategy::IgnoreRaces); } +IndexStmt scheduleSpMMISPC3_2(IndexStmt stmt, Tensor A, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { + IndexVar i0("i0"), i1("i1"), kbounded("kbounded"), k0("k0"), k1("k1"), jpos("jpos"), jpos0("jpos0"), jpos1("jpos1"); + return stmt + // .split(i, i0, i1, CHUNK_SIZE) + // .pos(j, jpos, A(i,j)) + // .split(jpos, jpos0, jpos1, UNROLL_FACTOR) + .reorder({j, k}) + // .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces) + .parallelize(i, ParallelUnit::CPUSimd, OutputRaceStrategy::IgnoreRaces); +} + IndexStmt scheduleSpGEMMCPU(IndexStmt stmt, bool doPrecompute) { Assignment assign = stmt.as().getStmt().as().getStmt() .as().getStmt().as(); @@ -145,6 +199,16 @@ IndexStmt scheduleSDDMMCPU(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, .parallelize(kpos1, ParallelUnit::CPUVector, OutputRaceStrategy::ParallelReduction); } +IndexStmt scheduleSDDMMISPC(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { + IndexVar i0("i0"), i1("i1"), kpos("kpos"), kpos0("kpos0"), kpos1("kpos1"); + return stmt.split(i, i0, i1, CHUNK_SIZE) + .pos(k, kpos, B(i,k)) + .split(kpos, kpos0, kpos1, UNROLL_FACTOR) + .reorder({i0, i1, kpos0, j, kpos1}) + // .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces); + .parallelize(kpos1, ParallelUnit::CPUSimd, OutputRaceStrategy::ParallelReduction); +} + IndexStmt scheduleSDDMMISPC1(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { IndexVar i0("i0"), i1("i1"), kpos("kpos"), kpos0("kpos0"), kpos1("kpos1"); return stmt.split(i, i0, i1, CHUNK_SIZE) @@ -175,6 +239,16 @@ IndexStmt scheduleTTVCPU(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16) { .parallelize(chunk, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces); } +IndexStmt scheduleTTVISPC(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16) { + IndexVar f("f"), fpos("fpos"), chunk("chunk"), fpos2("fpos2"); + return stmt; + // return stmt.fuse(i, j, f) + // .pos(f, fpos, B(i,j,k)) + // .split(fpos, chunk, fpos2, CHUNK_SIZE) + // .reorder({chunk, fpos2, k}) + // .parallelize(chunk, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces); +} + IndexStmt scheduleTTVCPUCSR(IndexStmt stmt) { TensorVar result = stmt.as().getStmt().as().getStmt() .as().getStmt().as().getLhs() @@ -635,6 +709,92 @@ TEST(scheduling_eval, spmmCPU) { ASSERT_TENSOR_EQ(expected, C); } +TEST(scheduling_eval, spmmISPC) { + taco::util::TimeResults timevalue; + bool time = true; + + set_ISPC_codegen_enabled(false); + set_CUDA_codegen_enabled(false); + + int NUM_I = 1021/10; + int NUM_J = 1039/10; + int NUM_K = 128; + float SPARSITY = .1; + Tensor A("A", {NUM_I, NUM_J}, CSR); + Tensor B("B", {NUM_J, NUM_K}, {Dense, Dense}); + Tensor C("C", {NUM_I, NUM_K}, {Dense, Dense}); + + srand(75883); + for (int i = 0; i < NUM_I; i++) { + for (int j = 0; j < NUM_J; j++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + if (rand_float < SPARSITY) { + A.insert({i, j}, (double) ((int) (rand_float*3/SPARSITY))); + } + } + } + + for (int j = 0; j < NUM_J; j++) { + for (int k = 0; k < NUM_K; k++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + B.insert({j, k}, (double) ((int) (rand_float*3/SPARSITY))); + } + } + + A.pack(); + B.pack(); + + set_ISPC_codegen_enabled(true); + C(i, k) = A(i, j) * B(j, k); + + IndexStmt stmt = C.getAssignment().concretize(); + // stmt = scheduleSpMMISPC1(stmt, A); + // stmt = scheduleSpMMISPC1_2(stmt, A); + stmt = scheduleSpMMISPC1_3(stmt, A); + + // stmt = scheduleSpMMISPC2(stmt, A); + // stmt = scheduleSpMMISPC2_2(stmt, A); + + // stmt = scheduleSpMMISPC3(stmt, A); + // stmt = scheduleSpMMISPC3_2(stmt, A); + + //printToFile("spmm_cpu", stmt); + + C.compile(stmt); + C.assemble(); + C.compute(); + + set_ISPC_codegen_enabled(false); + Tensor expected("expected", {NUM_I, NUM_K}, {Dense, Dense}); + expected(i, k) = A(i, j) * B(j, k); + IndexStmt stmt_taco = expected.getAssignment().concretize(); + stmt_taco = scheduleSpMMCPU(stmt_taco, A); + + expected.compile(stmt_taco); + expected.assemble(); + expected.compute(); + ASSERT_TENSOR_EQ(expected, C); + + float ERROR_MARGIN = 0.01; + // ASSERT_TENSOR_VAL(expected, y); + for (int i = 0; i < NUM_I; i++) { + for (int k = 0; k < NUM_K; k++) { + if (expected(i,k) <= C(i,k) + ERROR_MARGIN && expected(i,k) >= C(i,k) - ERROR_MARGIN) { + // std::cout << "matched values: expected -> " << expected(j) << " == " << y(j) << " <- actual\n"; + } + else { + std::cout << "unmatched values: expected -> " << expected(i,k) << " != " << C(i,k) << " <- actual\n"; + ASSERT_TRUE(false); + }; + } + } + + for (int i=0; i<10; i++) { + TOOL_BENCHMARK_TIMER(C.compute(), "Compute ISPC: ", timevalue); + TOOL_BENCHMARK_TIMER(expected.compute(), "Compute TACO: ", timevalue); + } +} + struct spgemm : public TestWithParam> {}; TEST_P(spgemm, scheduling_eval) { @@ -878,6 +1038,96 @@ TEST(scheduling_eval, sddmmCPU) { ASSERT_TENSOR_EQ(expected, A); } +// bin/taco-test --gtest_filter=scheduling_eval.sddmmISPC +TEST(scheduling_eval, sddmmISPC) { + + taco::util::TimeResults timevalue; + bool time = true; + + set_CUDA_codegen_enabled(false); + set_ISPC_codegen_enabled(false); + + int NUM_I = 1021/10; + int NUM_J = 1039/10; + int NUM_K = 1057/10; + float SPARSITY = .3; + Tensor A("A", {NUM_I, NUM_K}, {Dense, Dense}); + Tensor B("B", {NUM_I, NUM_K}, CSR); + Tensor C("C", {NUM_I, NUM_J}, {Dense, Dense}); + Tensor D("D", {NUM_J, NUM_K}, {Dense, Dense}); + + srand(268238); + for (int i = 0; i < NUM_I; i++) { + for (int j = 0; j < NUM_J; j++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + C.insert({i, j}, (double) ((int) (rand_float*3/SPARSITY))); + } + } + + for (int i = 0; i < NUM_I; i++) { + for (int k = 0; k < NUM_K; k++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + if (rand_float < SPARSITY) { + B.insert({i, k}, (double) ((int) (rand_float*3/SPARSITY))); + } + } + } + + for (int j = 0; j < NUM_J; j++) { + for (int k = 0; k < NUM_K; k++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + D.insert({j, k}, (double) ((int) (rand_float*3/SPARSITY))); + } + } + + B.pack(); + C.pack(); + D.pack(); + + set_ISPC_codegen_enabled(true); + A(i,k) = B(i,k) * C(i,j) * D(j,k); + + IndexStmt stmt = A.getAssignment().concretize(); + stmt = scheduleSDDMMISPC(stmt, B); + + //printToFile("sddmm_cpu", stmt); + + A.compile(stmt); + A.assemble(); + // A.compute(); + + set_ISPC_codegen_enabled(false); + Tensor expected("expected", {NUM_I, NUM_K}, {Dense, Dense}); + expected(i,k) = B(i,k) * C(i,j) * D(j,k); + IndexStmt stmt_taco = A.getAssignment().concretize(); + stmt_taco = scheduleSDDMMCPU(stmt_taco, B); + expected.compile(stmt_taco); + expected.assemble(); + // expected.compute(); + + TOOL_BENCHMARK_TIMER(A.compute(), "Compute ISPC: ", timevalue); + TOOL_BENCHMARK_TIMER(expected.compute(), "Compute TACO: ", timevalue); + + ASSERT_TENSOR_EQ(expected, A); + + + float ERROR_MARGIN = 0.01; + // ASSERT_TENSOR_VAL(expected, y); + for (int i = 0; i < NUM_I; i++) { + for (int k = 0; k < NUM_K; k++) { + if (expected(i,k) <= A(i,k) + ERROR_MARGIN && expected(i,k) >= A(i,k) - ERROR_MARGIN) { + // std::cout << "matched values: expected -> " << expected(j) << " == " << y(j) << " <- actual\n"; + } + else { + std::cout << "unmatched values: expected -> " << expected(i,k) << " != " << A(i,k) << " <- actual\n"; + ASSERT_TRUE(false); + }; + } + } + std::cout << "test scheduling_eval.sddmmISPC passed\n"; + +} + TEST(scheduling_eval, spmvCPU) { if (should_use_CUDA_codegen()) { return; @@ -926,6 +1176,100 @@ TEST(scheduling_eval, spmvCPU) { ASSERT_TENSOR_EQ(expected, y); } + +TEST(scheduling_eval, spmvISPC) { + + taco::util::TimeResults timevalue; + bool time = true; + + set_ISPC_codegen_enabled(false); + set_CUDA_codegen_enabled(false); + + int NUM_I = 200021/10; + int NUM_J = 200039/10; + float SPARSITY = .2; + Tensor A("A", {NUM_I, NUM_J}, CSR); + Tensor x("x", {NUM_J}, Format({Dense})); + Tensor y("y", {NUM_I}, Format({Dense})); + + srand(120); + for (int i = 0; i < NUM_I; i++) { + for (int j = 0; j < NUM_J; j++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + if (rand_float < SPARSITY) { + A.insert({i, j}, (double) ((int) (rand_float * 3 / SPARSITY))); + } + } + } + + for (int j = 0; j < NUM_J; j++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + x.insert({j}, (double) ((int) (rand_float*3/SPARSITY))); + } + + x.pack(); + A.pack(); + + set_ISPC_codegen_enabled(true); + + y(i) = A(i, j) * x(j); + + IndexStmt stmt = y.getAssignment().concretize(); + stmt = scheduleSpMVISPC(stmt); + + //printToFile("spmv_cpu", stmt); + + y.compile(stmt); + y.assemble(); + // y.compile(); + + set_ISPC_codegen_enabled(false); + + // Tensor expected("expected", {NUM_I}, Format({Dense})); + // expected(i) = A(i, j) * x(j); + // expected.compile(); + // expected.assemble(); + // expected.compute(); + + + Tensor expected("expected", {NUM_I}, Format({Dense})); + expected(i) = A(i, j) * x(j); + IndexStmt stmt_taco = expected.getAssignment().concretize(); + stmt_taco = scheduleSpMVCPU(stmt_taco); + + expected.compile(stmt_taco); + expected.assemble(); + // expected.compile(); + + + TOOL_BENCHMARK_TIMER(y.compute(), "Compute ISPC: ", timevalue); + TOOL_BENCHMARK_TIMER(expected.compute(), "Compute TACO: ", timevalue); + + + ASSERT_TENSOR_EQ(expected, y); + + float ERROR_MARGIN = 0.01; + // ASSERT_TENSOR_VAL(expected, y); + for (int j = 0; j < NUM_J; j++) { + if (expected(j) <= y(j) + ERROR_MARGIN && expected(j) >= y(j) - ERROR_MARGIN) { + // std::cout << "matched values: expected -> " << expected(j) << " == " << y(j) << " <- actual\n"; + } + else { + std::cout << "unmatched values: expected -> " << expected(j) << " != " << y(j) << " <- actual\n"; + ASSERT_TRUE(false); + }; + } + + std::cout << "test scheduling_eval.spmvISPC passed\n"; + + for (int i=0; i<10; i++) { + TOOL_BENCHMARK_TIMER(y.compute(), "Compute ISPC: ", timevalue); + TOOL_BENCHMARK_TIMER(expected.compute(), "Compute TACO: ", timevalue); + } + + +} + TEST(scheduling_eval, ttvCPU) { if (should_use_CUDA_codegen()) { return; @@ -977,6 +1321,65 @@ TEST(scheduling_eval, ttvCPU) { ASSERT_TENSOR_EQ(expected, A); } + +TEST(scheduling_eval, ttvISPC) { + if (should_use_CUDA_codegen()) { + return; + } + set_CUDA_codegen_enabled(false); + set_ISPC_codegen_enabled(false); + int NUM_I = 1021/10; + int NUM_J = 1039/10; + int NUM_K = 1057/10; + float SPARSITY = .3; + Tensor A("A", {NUM_I, NUM_J}, {Dense, Dense}); // TODO: change to sparse outputs + Tensor B("B", {NUM_I, NUM_J, NUM_K}, {Sparse, Sparse, Sparse}); + Tensor c("c", {NUM_K}, Format({Dense})); + + srand(9536); + for (int i = 0; i < NUM_I; i++) { + for (int j = 0; j < NUM_J; j++) { + for (int k = 0; k < NUM_K; k++) { + float rand_float = (float) rand() / (float) (RAND_MAX); + if (rand_float < SPARSITY) { + B.insert({i, j, k}, (double) ((int) (rand_float * 3 / SPARSITY))); + } + } + } + } + + for (int k = 0; k < NUM_K; k++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + c.insert({k}, (double) ((int) (rand_float*3))); + } + + B.pack(); + c.pack(); + + set_ISPC_codegen_enabled(true); + A(i,j) = B(i,j,k) * c(k); + + IndexStmt stmt = A.getAssignment().concretize(); + stmt = scheduleTTVISPC(stmt, B); + + //printToFile("ttv_cpu", stmt); + + A.compile(stmt); + A.assemble(); + A.compute(); + + set_ISPC_codegen_enabled(false); + Tensor expected("expected", {NUM_I, NUM_J}, {Dense, Dense}); + expected(i,j) = B(i,j,k) * c(k); + IndexStmt stmt_taco = expected.getAssignment().concretize(); + stmt_taco = scheduleTTVCPU(stmt_taco, B); + expected.compile(); + expected.assemble(); + expected.compute(); + ASSERT_TENSOR_EQ(expected, A); +} + + TEST(scheduling_eval, ttvCPU_CSR) { if (should_use_CUDA_codegen()) { return; @@ -1081,6 +1484,60 @@ TEST(scheduling_eval, ttmCPU) { ASSERT_TENSOR_EQ(expected, A); } +TEST(scheduling_eval, ttmISPC) { + if (should_use_CUDA_codegen()) { + return; + } + int NUM_I = 1021/40; + int NUM_J = 1039/40; + int NUM_K = 1057/40; + int NUM_L = 1232/40; + float SPARSITY = .1; + Tensor A("A", {NUM_I, NUM_J, NUM_L}, {Dense, Dense, Dense}); // TODO: change to sparse outputs + Tensor B("B", {NUM_I, NUM_J, NUM_K}, {Sparse, Sparse, Sparse}); + Tensor C("C", {NUM_K, NUM_L}, {Dense, Dense}); + + srand(935); + for (int i = 0; i < NUM_I; i++) { + for (int j = 0; j < NUM_J; j++) { + for (int k = 0; k < NUM_K; k++) { + float rand_float = (float) rand() / (float) (RAND_MAX); + if (rand_float < SPARSITY) { + B.insert({i, j, k}, (double) ((int) (rand_float * 3 / SPARSITY))); + } + } + } + } + + for (int k = 0; k < NUM_K; k++) { + for (int l = 0; l < NUM_L; l++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + C.insert({k, l}, (double) ((int) (rand_float*3))); + } + } + + B.pack(); + C.pack(); + + A(i,j,l) = B(i,j,k) * C(k,l); + + IndexStmt stmt = A.getAssignment().concretize(); + stmt = scheduleTTMCPU(stmt, B); + + //printToFile("ttm_cpu", stmt); + + A.compile(stmt); + A.assemble(); + A.compute(); + + Tensor expected("expected", {NUM_I, NUM_J, NUM_L}, {Dense, Dense, Dense}); + expected(i,j,l) = B(i,j,k) * C(k,l); + expected.compile(); + expected.assemble(); + expected.compute(); + ASSERT_TENSOR_EQ(expected, A); +} + TEST(scheduling_eval, mttkrpCPU) { if (should_use_CUDA_codegen()) { return; @@ -1143,6 +1600,69 @@ TEST(scheduling_eval, mttkrpCPU) { ASSERT_TENSOR_EQ(expected, A); } + +TEST(scheduling_eval, mttkrpISPC) { + if (should_use_CUDA_codegen()) { + return; + } + int NUM_I = 1021/20; + int NUM_J = 1039/20; + int NUM_K = 1057/20; + int NUM_L = 1232/20; + float SPARSITY = .1; + Tensor A("A", {NUM_I, NUM_J}, {Dense, Dense}); + Tensor B("B", {NUM_I, NUM_K, NUM_L}, {Dense, Sparse, Sparse}); + Tensor C("C", {NUM_K, NUM_J}, {Dense, Dense}); + Tensor D("D", {NUM_L, NUM_J}, {Dense, Dense}); + + srand(549694); + for (int i = 0; i < NUM_I; i++) { + for (int k = 0; k < NUM_K; k++) { + for (int l = 0; l < NUM_L; l++) { + float rand_float = (float) rand() / (float) (RAND_MAX); + if (rand_float < SPARSITY) { + B.insert({i, k, l}, (double) ((int) (rand_float * 3 / SPARSITY))); + } + } + } + } + + for (int k = 0; k < NUM_K; k++) { + for (int j = 0; j < NUM_J; j++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + C.insert({k, j}, (double) ((int) (rand_float*3))); + } + } + + for (int l = 0; l < NUM_L; l++) { + for (int j = 0; j < NUM_J; j++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + D.insert({l, j}, (double) ((int) (rand_float*3))); + } + } + + B.pack(); + C.pack(); + D.pack(); + + A(i,j) = B(i,k,l) * C(k,j) * D(l,j); + + IndexStmt stmt = A.getAssignment().concretize(); + stmt = scheduleMTTKRPCPU(stmt, B); + //printToFile("mttkrp_cpu", stmt); + + A.compile(stmt); + A.assemble(); + A.compute(); + + Tensor expected("expected", {NUM_I, NUM_J}, {Dense, Dense}); + expected(i,j) = B(i,k,l) * C(k,j) * D(l,j); + expected.compile(); + expected.assemble(); + expected.compute(); + ASSERT_TENSOR_EQ(expected, A); +} + TEST(scheduling_eval, spmvGPU) { if (!should_use_CUDA_codegen()) { return; @@ -2079,6 +2599,55 @@ TEST(generate_evaluation_files, cpu) { } } +TEST(generate_evaluation_files_spmv, ispc) { + set_CUDA_codegen_enabled(false); + set_ISPC_codegen_enabled(true); + + std::cout << "executing generate_evaluation_file.ispc\n"; + + int NUM_I = 100; + int NUM_J = 100; + + vector> spmv_parameters = {}; // {NNZ_PER_THREAD, BLOCK_SIZE} + for (int i = 3; i <= 20; i++) { + spmv_parameters.push_back({i, 512}); + } + + string file_ending_c = ".c"; + string file_ending_ispc = ".ispc"; + string file_path = "eval_prepared_ispc/spmv/"; + mkdir(file_path.c_str(), 0777); + + // spmv + { + stringstream source1; + stringstream source2; + std::shared_ptr codegen = ir::CodeGen::init_default(source1, source2, ir::CodeGen::ImplementationGen); + Tensor A("A", {NUM_I, NUM_J}, CSR); + Tensor x("x", {NUM_J}, Format({Dense})); + Tensor y("y", {NUM_I}, Format({Dense})); + IndexExpr precomputed = A(i, j) * x(j); + y(i) = precomputed; + IndexStmt stmt = y.getAssignment().concretize(); + bool isFirst = true; + for (auto paramSet : spmv_parameters) { + IndexStmt scheduled = scheduleSpMVCPU(stmt); + ir::Stmt compute = lower(scheduled, string("compute_") + util::join(paramSet, "_"), false, true); + codegen->compile(compute, isFirst); + isFirst = false; + } + ofstream source_file1; + source_file1.open(file_path + "spmv_ispc" + file_ending_c); + source_file1 << source1.str(); + source_file1.close(); + + ofstream source_file2; + source_file2.open(file_path + "__spmv_ispc" + file_ending_ispc); + source_file2 << source2.str(); + source_file2.close(); + } +} + TEST(generate_evaluation_files, gpu) { // if (!should_use_CUDA_codegen()) { // return; From a5c3a8cea4c8c736d7bf0c4cf976095cbed11401 Mon Sep 17 00:00:00 2001 From: Adhhitha Dias Date: Wed, 8 Sep 2021 10:26:47 -0400 Subject: [PATCH 6/8] add class diagram --- .gitignore | 1 + out/taco-uml/._taco.svg | Bin 0 -> 4096 bytes out/taco-uml/taco.svg | 878 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 879 insertions(+) create mode 100755 out/taco-uml/._taco.svg create mode 100644 out/taco-uml/taco.svg diff --git a/.gitignore b/.gitignore index 9abc3adc7..215b56e9a 100644 --- a/.gitignore +++ b/.gitignore @@ -14,4 +14,5 @@ doc apps/tensor_times_vector/tensor_times_vector .cache +.vscode compile_commands.json diff --git a/out/taco-uml/._taco.svg b/out/taco-uml/._taco.svg new file mode 100755 index 0000000000000000000000000000000000000000..e88dbd51b684b39e4ea0b0f4425ef9bc02f5d445 GIT binary patch literal 4096 zcmZQz6=P>$Vqox1Ojhs@R)|o50+1L3ClDJkFz{^v(m+1nBL)UWIUt(=a103vVwlIZ z9-@O`0Z_RBnifVNA1W@DoS&IntrusivePtrT *ptrUncopyableIRNodevirtual void accept(IRVisitorStrict *v) const = 0virtual IRNodeType type_info() const = 0;BaseStmtNodeBaseExprNodeDatatype typeStmtNodevoid accept(IRVisitorStrict *v) constExprNodevoid accept(IRVisitorStrict *v) constIRHandlevoid accept(IRVisitorStrict *v) constExprStmtIRVisitorStrictvirtual void visit(const IRNode*) const = 0IRVisitorvirtual void visit(const IRNode*)IRRewriterExpr exprStmt stmtvirtual void visit(const ExprNode* op)virtual void visit(const StmtNode* op)Expr rewrite(Expr)Stmt rewrite(Stmt)IRPrinterstd::ostream &streamstd::ostream &stream2int indentbool colorbool simplifyenum PrecedencePrecedence parentPrecedence = BOTTOMNameGenerator varNameGeneratorscopedMap<Expr, std::String> varNamesvoid doIndent()void printBinOp(Expr a, Expr b, std::string op, Precedence precedence)void fewMoreMethods()virtual void visit(const ExprNode*)virtual void visit(const StmtNode*)setColor(bool color)print(Stmt)IRVerifierExpressionSimplifierRemoveRedundantStatementsRemoveRedundantLoopsRemoveDuplicateBodyCodeGenCodeGen_CCodeGen_CUDACodeGen_ISPCManageableIndexStmtNodevirtual void accept(IndexStmtVisitorStrict*) const = 0IndexExprNodevirtual void accept(IndexStmtVisitorStrict*) const = 0IndexStmtIndexExprIndexExprVisitorStrictvoid visit(const IndexStmt&)virtual void visit(const AccessNode*) = 0virtual void visit(const LiteralNode*) = 0virtual void visit(const NegNode*) = 0virtual void visit(const AddNode*) = 0virtual void visit(const SubNode*) = 0virtual void visit(const MulNode*) = 0virtual void visit(const DivNode*) = 0virtual void visit(const SqrtNode*) = 0virtual void visit(const CastNode*) = 0virtual void visit(const CallIntrinsicNode*) = 0virtual void visit(const ReductionNode*) = 0IndexStmtVisitorStrictvoid visit(const IndexStmt&)virtual void visit(const AssignmentNode*) = 0virtual void visit(const YieldNode*) = 0virtual void visit(const ForallNode*) = 0virtual void visit(const WhereNode*) = 0virtual void visit(const SequenceNode*) = 0virtual void visit(const AssembleNode*) = 0virtual void visit(const MultiNode*) = 0virtual void visit(const SuchThatNode*) = 0IndexNotationVisitorStrictIndexNotationPrintervoid print(const IndexExpr& expr)void print(const IndexStmt& expr)void visit(const AccessNode* node)void visit(const LiteralNode* node)void visit(const NegNode* node)void visit(const AddNode* node)void visit(const SubNode* node)void visit(const MulNode* node)void visit(const DivNode* node)void visit(const SqrtNode* node)void visit(const CastNode* node)void visit(const CallIntrinsicNode* node)void visit(const UnaryExprNode* node)void visit(const BinaryExprNode* node)void visit(const ReductionNode* node)void visit(const AssignmentNode* node)void visit(const YieldNode* node)void visit(const ForallNode* node)void visit(const WhereNode* node)void visit(const SequenceNode* node)void visit(const AssembleNode* node)void visit(const MultiNode* node)void visit(const SuchThatNode* node)IndexNotationVisitorvirtual void visit(const AccessNode* node)virtual void visit(const LiteralNode* node)virtual void visit(const NegNode* node)virtual void visit(const AddNode* node)virtual void visit(const SubNode* node)virtual void visit(const MulNode* node)virtual void visit(const DivNode* node)virtual void visit(const SqrtNode* node)virtual void visit(const CastNode* node)virtual void visit(const CallIntrinsicNode* node)virtual void visit(const UnaryExprNode* node)virtual void visit(const BinaryExprNode* node)virtual void visit(const ReductionNode* node)virtual void visit(const AssignmentNode* node)virtual void visit(const YieldNode* node)virtual void visit(const ForallNode* node)virtual void visit(const WhereNode* node)virtual void visit(const SequenceNode* node)virtual void visit(const AssembleNode* node)virtual void visit(const MultiNode* node)virtual void visit(const SuchThatNode* node)MatcherIndexExprRewriterStrictIndexExpr exprIndexExpr rewrite(IndexExpr)virtual void visit(const AccessNode* op) = 0virtual void visit(const LiteralNode* op) = 0virtual void visit(const NegNode* op) = 0virtual void visit(const SqrtNode* op) = 0virtual void visit(const AddNode* op) = 0virtual void visit(const SubNode* op) = 0virtual void visit(const MulNode* op) = 0virtual void visit(const DivNode* op) = 0virtual void visit(const CastNode* op) = 0virtual void visit(const CallIntrinsicNode* op) = 0virtual void visit(const ReductionNode* op) = 0IndexStmtRewriterStrictIndexStmt stmtIndexStmt rewrite(IndexStmt)virtual void visit(const AssignmentNode* op) = 0virtual void visit(const YieldNode* op) = 0virtual void visit(const ForallNode* op) = 0virtual void visit(const WhereNode* op) = 0virtual void visit(const SequenceNode* op) = 0virtual void visit(const AssembleNode* op) = 0virtual void visit(const MultiNode* op) = 0virtual void visit(const SuchThatNode* op) = 0IndexNotationRewriterStrictIndexNotationRewritervirtual void visit(const AccessNode* node)virtual void visit(const LiteralNode* node)virtual void visit(const NegNode* node)virtual void visit(const AddNode* node)virtual void visit(const SubNode* node)virtual void visit(const MulNode* node)virtual void visit(const DivNode* node)virtual void visit(const SqrtNode* node)virtual void visit(const CastNode* node)virtual void visit(const CallIntrinsicNode* node)virtual void visit(const UnaryExprNode* node)virtual void visit(const BinaryExprNode* node)virtual void visit(const ReductionNode* node)virtual void visit(const AssignmentNode* node)virtual void visit(const YieldNode* node)virtual void visit(const ForallNode* node)virtual void visit(const WhereNode* node)virtual void visit(const SequenceNode* node)virtual void visit(const AssembleNode* node)virtual void visit(const MultiNode* node)virtual void visit(const SuchThatNode* node)Lowererstd::shared_ptr<LowererImpl> impl;LowererImplclass Visitor;friend class Visitor;std::shared_ptr<Visitor> visitor;virtual ir::Stmt lower(IndexStmt stmt);virtual ir::Expr lower(IndexExpr expr);virtual ir::Expr lowerExpr(IndexExpr expr) = 0;virtual ir::Stmt lowerStmt(IndexStmt stmt) = 0;virtual ir::Stmt lower(IndexStmt stmt, std::string name,bool assemble, bool compute, bool pack, bool unpack) = 0;LowererImplImperativeclass Visitorfiend class Visitorstd::shared_ptr<Visitor> visitorbool assemblebool computevars a_bunch_of_other_fieldsvirtual ir::Stmt lowerExpr(IndexExpr expr);virtual ir::Stmt lowerStmt(IndexStmt stmt);ir::Stmt lower(IndexStmt stmt, std::string name,bool assemble, bool compute, bool pack, bool unpack)Stmt LowererImplImperative::lower(IndexStmt stmt) {return visitor->lower(stmt);}VisitorLowererImpl* implExpr exprStmt stmtvoid visit(const AssignmentNode* node)void visit(const YieldNode* node)void visit(const ForallNode* node)void visit(const WhereNode* node)void visit(const MultiNode* node)void visit(const SuchThatNode* node)void visit(const SequenceNode* node)void visit(const AssembleNode* node)void visit(const AccessNode* node)void visit(const LiteralNode* node)void visit(const NegNode* node)void visit(const AddNode* node)void visit(const SubNode* node)void visit(const MulNode* node)void visit(const DivNode* node)void visit(const SqrtNode* node)void visit(const CastNode* node)void visit(const CallIntrinsicNode* node)void visit(const ReductionNode* node)Visitor(LowererImplImperative* impl)Stmt lower(IndexStmt stmt)Expr lower(IndexExpr expr)Stmt lower(IndexStmt stmt) {this->stmt = Stmt();impl->accessibleIterators.scope();IndexStmtVisitorStrict::visit(stmt);impl->accessibleIterators.unscope();return this->stmt;}contains111111contains11contains11contains11contains11contains11 \ No newline at end of file From 4a4a569f83b7acf5656eff290fd004c62bdc38b9 Mon Sep 17 00:00:00 2001 From: Adhhitha Dias Date: Wed, 8 Sep 2021 10:35:14 -0400 Subject: [PATCH 7/8] add ispc headers for binary search and fix compile errors --- include/taco/ir/ir.h | 2 +- src/codegen/codegen_ispc.cpp | 397 +++++++++++++++++++++-------------- src/codegen/codegen_ispc.h | 8 +- src/ir/ir_printer.cpp | 40 +++- 4 files changed, 277 insertions(+), 170 deletions(-) diff --git a/include/taco/ir/ir.h b/include/taco/ir/ir.h index 651faff4e..96dc7d034 100644 --- a/include/taco/ir/ir.h +++ b/include/taco/ir/ir.h @@ -591,7 +591,7 @@ struct Switch : public StmtNode { static const IRNodeType _type_info = IRNodeType::Switch; }; -enum class LoopKind {Serial, Static, Dynamic, Runtime, Vectorized, Static_Chunked, Foreach, Mul_Thread}; +enum class LoopKind {Serial, Static, Dynamic, Runtime, Vectorized, Static_Chunked, Foreach, Mul_Thread, Init}; /** A for loop from start to end by increment. * A vectorized loop will require the increment to be 1 and the diff --git a/src/codegen/codegen_ispc.cpp b/src/codegen/codegen_ispc.cpp index 237bc822d..d35af1748 100644 --- a/src/codegen/codegen_ispc.cpp +++ b/src/codegen/codegen_ispc.cpp @@ -145,8 +145,61 @@ const string cHeaders = " free(t);\n" "}\n" "#endif\n"; + +const string ispcHeaders = + "#define __TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b))\n" + "#define __TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b))\n" + "#define __TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a)\n" + "int __cmp(const void *a, const void *b) {\n" + " return *((const int*)a) - *((const int*)b);\n" + "}\n" + "int __taco_binarySearchAfter(int *array, int arrayStart, int arrayEnd, int target) {\n" + " if (array[arrayStart] >= target) {\n" + " return arrayStart;\n" + " }\n" + " int lowerBound = arrayStart; // always < target\n" + " int upperBound = arrayEnd; // always >= target\n" + " while (upperBound - lowerBound > 1) {\n" + " int mid = (upperBound + lowerBound) / 2;\n" + " int midValue = array[mid];\n" + " if (midValue < target) {\n" + " lowerBound = mid;\n" + " }\n" + " else if (midValue > target) {\n" + " upperBound = mid;\n" + " }\n" + " else {\n" + " return mid;\n" + " }\n" + " }\n" + " return upperBound;\n" + "}\n" + "int __taco_binarySearchBefore(int *array, int arrayStart, int arrayEnd, int target) {\n" + " if (array[arrayEnd] <= target) {\n" + " return arrayEnd;\n" + " }\n" + " int lowerBound = arrayStart; // always <= target\n" + " int upperBound = arrayEnd; // always > target\n" + " while (upperBound - lowerBound > 1) {\n" + " int mid = (upperBound + lowerBound) / 2;\n" + " int midValue = array[mid];\n" + " if (midValue < target) {\n" + " lowerBound = mid;\n" + " }\n" + " else if (midValue > target) {\n" + " upperBound = mid;\n" + " }\n" + " else {\n" + " return mid;\n" + " }\n" + " }\n" + " return lowerBound;\n" + "}\n\n\n"; + } // anonymous namespace + + // find variables for generating declarations // generates a single var for each GetProperty class CodeGen_ISPC::FindVars : public IRVisitor { @@ -249,11 +302,10 @@ class CodeGen_ISPC::FindVars : public IRVisitor { // Finds all for loops tagged with accelerator and adds statements to deviceFunctions // Also tracks scope of when device function is called and // tracks which variables must be passed to function. -class CodeGen_ISPC::DeviceFunctionCollector : public IRVisitor { +class CodeGen_ISPC::FunctionCollector : public IRVisitor { public: - vector blockFors; vector threadFors; // contents is device function - vector warpFors; + vector initFors; // for loops to initialize statements map scopeMap; // the variables to pass to each device function @@ -271,7 +323,7 @@ class CodeGen_ISPC::DeviceFunctionCollector : public IRVisitor { CodeGen_ISPC *codeGen; // copy inputs and outputs into the map - DeviceFunctionCollector(vector inputs, vector outputs, CodeGen_ISPC *codeGen) : codeGen(codeGen) { + FunctionCollector(vector inputs, vector outputs, CodeGen_ISPC *codeGen) : codeGen(codeGen) { inDeviceFunction = false; for (auto v: inputs) { auto var = v.as(); @@ -310,7 +362,11 @@ class CodeGen_ISPC::DeviceFunctionCollector : public IRVisitor { } else if (op->parallel_unit == ParallelUnit::CPUSimd) { - + std::cout << "************************************************************************** CPUSimd For node\n"; + } + else if (op->kind == LoopKind::Init) { + std::cout << "************************************************************************* Init loop kind found\n"; + initFors.push_back(op); } else{ op->var.accept(this); @@ -376,6 +432,10 @@ void CodeGen_ISPC::compile(Stmt stmt, bool isFirst) { if (isFirst) { // output the headers out << cHeaders; + + if (&out != &out2) { + out2 << ispcHeaders; + } } out << endl; // generate code for the Stmt @@ -385,13 +445,13 @@ void CodeGen_ISPC::compile(Stmt stmt, bool isFirst) { -string CodeGen_ISPC::printCallISPCFunc(const Function *func, map varMap, +string CodeGen_ISPC::printCallISPCFunc(const std::string& funcName, map varMap, vector &sortedProps) { std::stringstream ret; ret << " "; unordered_set propsAlreadyGenerated; - ret << "__" << func->name << "("; + ret << "__" << funcName << "("; for (unsigned long i=0; i < sortedProps.size(); i++) { @@ -410,118 +470,71 @@ string CodeGen_ISPC::printCallISPCFunc(const Function *func, map varMap, vector &sortedProps) { - DeviceFunctionCollector deviceFunctionCollector(func->inputs, func->outputs, this); - func->body.accept(&deviceFunctionCollector); + FunctionCollector functionCollector(func->inputs, func->outputs, this); + func->body.accept(&functionCollector); - std::stringstream variables; vector inputs = func->inputs; vector outputs = func->outputs; unordered_set propsAlreadyGenerated; - for (unsigned long i=0; i < sortedProps.size(); i++) { - auto prop = sortedProps[i]; - bool isOutputProp = (find(outputs.begin(), outputs.end(), - prop->tensor) != outputs.end()); - - auto var = prop->tensor.as(); - if (var->is_parameter) { - if (isOutputProp) { - variables << " " << printTensorProperty(varMap[prop], prop, false) << ";" << endl; - } else { - break; - } + for (unsigned long i=0; i < sortedProps.size(); i++) { + auto prop = sortedProps[i]; + bool isOutputProp = (find(outputs.begin(), outputs.end(), + prop->tensor) != outputs.end()); + + auto var = prop->tensor.as(); + if (var->is_parameter) { + if (isOutputProp) { + funcVariables << " " << printTensorProperty(varMap[prop], prop, false) << ";" << endl; } else { - variables << getUnpackedTensorArgument(varMap[prop], prop, isOutputProp); + break; } - propsAlreadyGenerated.insert(varMap[prop]); + } else { + funcVariables << getUnpackedTensorArgument(varMap[prop], prop, isOutputProp); + } + propsAlreadyGenerated.insert(varMap[prop]); - if (i!=sortedProps.size()-1) { - variables << ", "; - } - if (i%2==0) { - variables << "\n\t"; - } + if (i!=sortedProps.size()-1) { + funcVariables << ", "; + } + if (i%2==0) { + funcVariables << "\n\t"; } + } resetUniqueNameCounters(); - for (size_t i = 0; i < deviceFunctionCollector.threadFors.size(); i++) { - const For *threadloop = to(deviceFunctionCollector.threadFors[i]); + // threadFors code generation + for (size_t i = 0; i < functionCollector.threadFors.size(); i++) { + + const For *threadloop = to(functionCollector.threadFors[i]); taco_iassert(threadloop->parallel_unit == ParallelUnit::CPUSpmd); Stmt function = threadloop->contents; std::cout << "threadloop function: " << function << std::endl; - out2 << "static task void __" << func->name << "__ ("; - out2 << variables.str(); + out2 << "\nstatic task void __" << func->name << "__ ("; + out2 << funcVariables.str(); out2 << "\n) {\n\n"; indent++; - doIndent(); - // output body + // output body of the threadloop + taskCode = true; print(threadloop); indent--; - out2 << "}\n"; - - out2 << "export void __" << func->name << "("; - out2 << variables.str(); - out2 << "\n) {\n\n"; - indent++; - doIndent(); - out2 << "launch[4] " << printCallISPCFunc(func, varMap, sortedProps) << "\n"; - indent--; - out2 << "}\n"; - - } - - if (deviceFunctionCollector.threadFors.size()==0) { - out2 << "export void __" << func->name << " ("; - out2 << variables.str(); - out2 << "\n) {\n\n"; + out2 << "}\n\n"; - indent++; - doIndent(); - // output body - print(func->body); - indent--; - out2 << "}\n"; } - // out2 << "export void "; - - // out2 << "__" << func->name << "("; + taskCode = false; + out2 << "export void __" << func->name << " ("; + out2 << funcVariables.str(); + out2 << "\n) {\n\n"; - // for (unsigned long i=0; i < sortedProps.size(); i++) { - // auto prop = sortedProps[i]; - // bool isOutputProp = (find(outputs.begin(), outputs.end(), - // prop->tensor) != outputs.end()); - - // auto var = prop->tensor.as(); - // if (var->is_parameter) { - // if (isOutputProp) { - // out2 << " " << printTensorProperty(varMap[prop], prop, false) << ";" << endl; - // } else { - // break; - // } - // } else { - // out2 << getUnpackedTensorArgument(varMap[prop], prop, isOutputProp); - // } - // propsAlreadyGenerated.insert(varMap[prop]); - - // if (i!=sortedProps.size()-1) { - // out2 << ", "; - // } - // if (i%2==0) { - // out2 << "\n\t"; - // } - // } - // out2 << "\n) {\n\n"; - - // indent++; - // doIndent(); - // // output body - // print(func->body); - // indent--; - // out2 << "}\n"; + indent++; + // output body + print(func->body); + indent--; + out2 << "}\n"; } @@ -535,6 +548,8 @@ void CodeGen_ISPC::sendToStream(std::stringstream &stream) { } void CodeGen_ISPC::visit(const Function* func) { + set_ISPC_code_stream_enabled(false); + // if generating a header, protect the function declaration with a guard if (func->name == "assemble") { if (outputKind == HeaderGen) { @@ -646,11 +661,11 @@ void CodeGen_ISPC::visit(const Function* func) { // Print variable declarations out << printDecls(varFinder.varDecls, func->inputs, func->outputs) << endl; - vector sortedProps; + sortedProps = {}; vector inputs = func->inputs; vector outputs = func->outputs; getSortedProps(varFinder.varDecls, sortedProps, inputs, outputs); - out << printCallISPCFunc(func, varFinder.varDecls, sortedProps); + out << printCallISPCFunc(func->name, varFinder.varDecls, sortedProps); if (emittingCoroutine) { out << printContextDeclAndInit(varMap, localVars, numYields, func->name) @@ -788,51 +803,84 @@ static string getAtomicPragma() { // Docs for vectorization pragmas: // http://clang.llvm.org/docs/LanguageExtensions.html#extensions-for-loop-hint-optimizations void CodeGen_ISPC::visit(const For* op) { - switch (op->kind) { - // TODO - add ISPC based multi threaded execution handling - case LoopKind::Vectorized: - case LoopKind::Static: - case LoopKind::Dynamic: - case LoopKind::Runtime: - case LoopKind::Static_Chunked: - case LoopKind::Mul_Thread: - // op->start.accept(this); - // stream2 << std::endl; - // op->start.accept(this); - // stream2 << std::endl; - // op->start.accept(this); - // stream2 << std::endl; - // op->start.accept(this); - // stream2 << std::endl; - // op->end.accept(this); - // stream2 << std::endl; - // op->end.accept(this); - // stream2 << std::endl; - // op->end.accept(this); - // stream2 << std::endl; - default: - break; + if (!is_ISPC_code_stream_enabled()) { + CodeGen::visit(op); + return; } - doIndent(); - if (op->kind == LoopKind::Foreach) { - stream2 << keywordString("foreach") << " ("; - // if (!emittingCoroutine) { - // if (op->var.type() == Int32) { - // stream << "int32 "; - // } - // else if (op->var.type() == Int64) { - // stream << "int64 "; - // } + if (op->kind == LoopKind::Mul_Thread) { + if (!taskCode) { + out2 << "launch[4] " << printCallISPCFunc(funcName+"__", varMap, sortedProps) << "\n"; + return; + } + stream2 << "uniform unsigned int chunk_size = ("; + op->end.accept(this); + stream2 << " - "; + op->start.accept(this); + stream2 << ") / taskCount;\n"; + stream2 << " uniform unsigned int modulo = ("; + op->end.accept(this); + stream2 << " - "; + op->start.accept(this); + stream2 << ") % taskCount;\n"; + + stream2 << " uniform unsigned int start = "; + op->start.accept(this); + stream2 << " + chunk_size * taskIndex;\n"; + + stream2 << " if (taskIndex != 0) {\n"; + stream2 << " start += modulo;\n"; + stream2 << " }\n"; + + stream2 << " uniform unsigned int end = start + chunk_size;\n"; + stream2 << " if (taskIndex == 0) {\n"; + stream2 << " end += modulo;\n"; + stream2 << " }\n\n"; + + stream2 << keywordString(" for") << " ("; + if (!emittingCoroutine) { + if (op->var.type() == Int32) { + stream2 << "int32 "; + } + else if (op->var.type() == Int64) { + stream2 << "int64 "; + } - // } + } + op->var.accept(this); + stream2 << " = "; + stream2 << "start"; + // op->start.accept(this); + stream2 << keywordString("; "); + op->var.accept(this); + stream2 << " < "; + parentPrecedence = BOTTOM; + stream2 << "end"; + // op->end.accept(this); + stream2 << keywordString("; "); + op->var.accept(this); + + auto lit = op->increment.as(); + if (lit != nullptr && ((lit->type.isInt() && lit->equalsScalar(1)) || + (lit->type.isUInt() && lit->equalsScalar(1)))) { + stream2 << "++"; + } + else { + stream2 << " += "; + op->increment.accept(this); + } + + } + + else if (op->kind == LoopKind::Foreach) { + stream2 << keywordString("foreach") << " ("; + op->var.accept(this); stream2 << " = "; op->start.accept(this); stream2 << keywordString(" ... "); op->end.accept(this); - stream2 << ") {\n"; } else { stream2 << keywordString("for") << " ("; @@ -865,9 +913,10 @@ void CodeGen_ISPC::visit(const For* op) { stream2 << " += "; op->increment.accept(this); } - stream2 << ") {\n"; + } + stream2 << ") {\n"; op->contents.accept(this); doIndent(); stream2 << "}"; @@ -934,33 +983,69 @@ void CodeGen_ISPC::visit(const Max* op) { void CodeGen_ISPC::visit(const Allocate* op) { string elementType = printCType(op->var.type(), false); - doIndent(); - op->var.accept(this); - stream << " = ("; - stream << elementType << "*"; - stream << ")"; - if (op->is_realloc) { - stream << "realloc("; + + if (is_ISPC_code_stream_enabled()) { + op->var.accept(this); - stream << ", "; - } - else { - // If the allocation was requested to clear the allocated memory, - // use calloc instead of malloc. - if (op->clear) { - stream << "calloc(1, "; - } else { - stream << "malloc("; + stream2 << " = "; + // stream2 << " = ("; + // stream2 << elementType << "*"; + // stream2 << ")"; + if (op->is_realloc) { + stream2 << "realloc("; + op->var.accept(this); + stream2 << ", "; } - } - stream << "sizeof(" << elementType << ")"; - stream << " * "; - parentPrecedence = MUL; - op->num_elements.accept(this); - parentPrecedence = TOP; - stream << ");"; + else { + // If the allocation was requested to clear the allocated memory, + // use calloc instead of malloc. + if (op->clear) { + stream2 << "calloc(1, "; + } else { + stream2 << "new "; + } + } + stream2 << elementType << "["; + parentPrecedence = MUL; + op->num_elements.accept(this); + parentPrecedence = TOP; + stream2 << "];"; + stream2 << endl; + + + } else { + + op->var.accept(this); + stream << " = ("; + stream << elementType << "*"; + stream << ")"; + if (op->is_realloc) { + stream << "realloc("; + op->var.accept(this); + stream << ", "; + } + else { + // If the allocation was requested to clear the allocated memory, + // use calloc instead of malloc. + if (op->clear) { + stream << "calloc(1, "; + } else { + stream << "malloc("; + } + } + stream << "sizeof(" << elementType << ")"; + stream << " * "; + parentPrecedence = MUL; + op->num_elements.accept(this); + parentPrecedence = TOP; + stream << ");"; stream << endl; + + + } + + } void CodeGen_ISPC::visit(const Sqrt* op) { diff --git a/src/codegen/codegen_ispc.h b/src/codegen/codegen_ispc.h index 08e73b252..2e440abc0 100644 --- a/src/codegen/codegen_ispc.h +++ b/src/codegen/codegen_ispc.h @@ -2,6 +2,7 @@ #define TACO_BACKEND_ISPC_H #include #include +#include #include "taco/ir/ir.h" #include "taco/ir/ir_printer.h" @@ -44,24 +45,27 @@ class CodeGen_ISPC : public CodeGen { void visit(const Assign*); Stmt simplifyFunctionBodies(Stmt stmt); - std::string printCallISPCFunc(const Function *func, std::map varMap, + std::string printCallISPCFunc(const std::string& funcName, std::map varMap, std::vector &sortedProps); void printISPCFunc(const Function *func, std::map varMap, std::vector &sortedProps); std::map varMap; std::vector localVars; + bool taskCode = false; std::ostream &out; std::ostream &out2; OutputKind outputKind; std::string funcName; + std::stringstream funcVariables; + std::vector sortedProps; int labelCount; bool emittingCoroutine; class FindVars; - class DeviceFunctionCollector; + class FunctionCollector; private: virtual std::string restrictKeyword() const { return "restrict"; } diff --git a/src/ir/ir_printer.cpp b/src/ir/ir_printer.cpp index ba2bc894b..fa224bde4 100644 --- a/src/ir/ir_printer.cpp +++ b/src/ir/ir_printer.cpp @@ -333,10 +333,18 @@ void IRPrinter::visit(const Cast* op) { } void IRPrinter::visit(const Call* op) { - stream << op->func << "("; - parentPrecedence = Precedence::CALL; - acceptJoin(this, stream, op->args, ", "); - stream << ")"; + if (!is_ISPC_code_stream_enabled()) { + stream << op->func << "("; + parentPrecedence = Precedence::CALL; + acceptJoin(this, stream, op->args, ", "); + stream << ")"; + } else { + // statically added function to the ispc file has __ in the front + stream2 << "__" << op->func << "("; + parentPrecedence = Precedence::CALL; + acceptJoin(this, stream2, op->args, ", "); + stream2 << ")"; + } } void IRPrinter::visit(const IfThenElse* op) { @@ -716,7 +724,7 @@ void IRPrinter::visit(const VarDecl* op) { } taco_iassert(isa(op->var)); if (to(op->var)->is_ptr) { - stream2 << "* restrict"; + stream2 << "* "; // removed restrict keyword from here } stream2 << " "; string varName = varNameGenerator.getUniqueName(util::toString(op->var)); @@ -829,12 +837,22 @@ void IRPrinter::visit(const Allocate* op) { } void IRPrinter::visit(const Free* op) { - doIndent(); - stream << "free("; - parentPrecedence = Precedence::TOP; - op->var.accept(this); - stream << ");"; - stream << endl; + if (is_ISPC_code_stream_enabled()) { + doIndent(); + stream2 << "delete[] "; + parentPrecedence = Precedence::TOP; + op->var.accept(this); + stream2 << ";"; + stream2 << endl; + } + else { + doIndent(); + stream << "free("; + parentPrecedence = Precedence::TOP; + op->var.accept(this); + stream << ");"; + stream << endl; + } } void IRPrinter::visit(const Comment* op) { From 8a42b2f226cece4a8da21f06e548fe46bfc2e124 Mon Sep 17 00:00:00 2001 From: Adhhitha Dias Date: Wed, 8 Sep 2021 10:37:00 -0400 Subject: [PATCH 8/8] add test kernels sddmm, mttkrp, ttv, etc.. --- test/tests-scheduling-eval.cpp | 727 +++++++++++++++++++++++++++++++-- 1 file changed, 695 insertions(+), 32 deletions(-) diff --git a/test/tests-scheduling-eval.cpp b/test/tests-scheduling-eval.cpp index 4957418e0..59debc88e 100644 --- a/test/tests-scheduling-eval.cpp +++ b/test/tests-scheduling-eval.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include "taco/cuda.h" #include "test.h" #include "test_tensors.h" @@ -57,6 +58,31 @@ void printToFile(string filename, IndexStmt stmt) { source_file.close(); } +void printToFile(string filename, string additional_filename, IndexStmt stmt) { + stringstream source1; + stringstream source2; + + string file_path = "eval_generated/"; + mkdir(file_path.c_str(), 0777); + + std::shared_ptr codegen = ir::CodeGen::init_default(source1, source2, ir::CodeGen::ImplementationGen); + ir::Stmt compute = lower(stmt, "compute", false, true); + codegen->compile(compute, true); + + ofstream source_file; + string file_ending = should_use_CUDA_codegen() ? ".cu" : ".c"; + source_file.open(file_path+filename+file_ending); + source_file << source1.str(); + source_file.close(); + + ofstream additional_source_file; + string additional_file_ending = ".ispc"; + additional_source_file.open(file_path+additional_filename+additional_file_ending); + additional_source_file << source2.str(); + additional_source_file.close(); + +} + IndexStmt scheduleSpMVCPU(IndexStmt stmt, int CHUNK_SIZE=16) { IndexVar i0("i0"), i1("i1"), kpos("kpos"), kpos0("kpos0"), kpos1("kpos1"); return stmt.split(i, i0, i1, CHUNK_SIZE) @@ -92,6 +118,16 @@ IndexStmt scheduleSpMMISPC1(IndexStmt stmt, Tensor A, int CHUNK_SIZE=16, .parallelize(k, ParallelUnit::CPUSimd, OutputRaceStrategy::IgnoreRaces); } +IndexStmt scheduleSpMMISPCOMP1(IndexStmt stmt, Tensor A, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { + IndexVar i0("i0"), i1("i1"), kbounded("kbounded"), k0("k0"), k1("k1"), jpos("jpos"), jpos0("jpos0"), jpos1("jpos1"); + return stmt.split(i, i0, i1, CHUNK_SIZE) + .pos(j, jpos, A(i,j)) + .split(jpos, jpos0, jpos1, UNROLL_FACTOR) + .reorder({i0, i1, jpos0, k, jpos1}) + .parallelize(i0, ParallelUnit::CPUSpmd, OutputRaceStrategy::NoRaces) + .parallelize(k, ParallelUnit::CPUSimd, OutputRaceStrategy::IgnoreRaces); +} + IndexStmt scheduleSpMMISPC1_2(IndexStmt stmt, Tensor A, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { IndexVar i0("i0"), i1("i1"), kbounded("kbounded"), k0("k0"), k1("k1"), jpos("jpos"), jpos0("jpos0"), jpos1("jpos1"); return stmt.split(i, i0, i1, CHUNK_SIZE) @@ -199,6 +235,27 @@ IndexStmt scheduleSDDMMCPU(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, .parallelize(kpos1, ParallelUnit::CPUVector, OutputRaceStrategy::ParallelReduction); } +IndexStmt scheduleSDDMMCSRCPU(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { + IndexVar i0("i0"), i1("i1"), kpos("kpos"), kpos0("kpos0"), kpos1("kpos1"); + return stmt; + // return stmt.split(i, i0, i1, CHUNK_SIZE) + // .pos(k, kpos, B(i,k)) + // .split(kpos, kpos0, kpos1, UNROLL_FACTOR) + // .reorder({i0, i1, kpos0, j, kpos1}); + // .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces); + // .parallelize(k, ParallelUnit::CPUVector, OutputRaceStrategy::IgnoreRaces); +} + +IndexStmt scheduleSDDMM2CPU(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { + IndexVar i0("i0"), i1("i1"), jpos("jpos"), jpos0("jpos0"), jpos1("jpos1"); + return stmt.split(i, i0, i1, CHUNK_SIZE) + .pos(j, jpos, B(i,j)) + .split(jpos, jpos0, jpos1, UNROLL_FACTOR) + .reorder({i0, i1, jpos0, k, jpos1}) + .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces) + .parallelize(jpos1, ParallelUnit::CPUVector, OutputRaceStrategy::ParallelReduction); +} + IndexStmt scheduleSDDMMISPC(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { IndexVar i0("i0"), i1("i1"), kpos("kpos"), kpos0("kpos0"), kpos1("kpos1"); return stmt.split(i, i0, i1, CHUNK_SIZE) @@ -209,6 +266,16 @@ IndexStmt scheduleSDDMMISPC(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, .parallelize(kpos1, ParallelUnit::CPUSimd, OutputRaceStrategy::ParallelReduction); } +IndexStmt scheduleSDDMM2ISPC(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { + IndexVar i0("i0"), i1("i1"), jpos("jpos"), jpos0("jpos0"), jpos1("jpos1"); + return stmt.split(i, i0, i1, CHUNK_SIZE) + .pos(j, jpos, B(i,j)) + .split(jpos, jpos0, jpos1, UNROLL_FACTOR) + .reorder({i0, i1, jpos0, k, jpos1}) + // .parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces); + .parallelize(jpos1, ParallelUnit::CPUSimd, OutputRaceStrategy::ParallelReduction); +} + IndexStmt scheduleSDDMMISPC1(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { IndexVar i0("i0"), i1("i1"), kpos("kpos"), kpos0("kpos0"), kpos1("kpos1"); return stmt.split(i, i0, i1, CHUNK_SIZE) @@ -241,12 +308,12 @@ IndexStmt scheduleTTVCPU(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16) { IndexStmt scheduleTTVISPC(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16) { IndexVar f("f"), fpos("fpos"), chunk("chunk"), fpos2("fpos2"); - return stmt; - // return stmt.fuse(i, j, f) - // .pos(f, fpos, B(i,j,k)) - // .split(fpos, chunk, fpos2, CHUNK_SIZE) - // .reorder({chunk, fpos2, k}) - // .parallelize(chunk, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces); + // return stmt; + return stmt.fuse(i, j, f) + .pos(f, fpos, B(i,j,k)) + .split(fpos, chunk, fpos2, CHUNK_SIZE) + .reorder({chunk, fpos2, k}) + .parallelize(chunk, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces); } IndexStmt scheduleTTVCPUCSR(IndexStmt stmt) { @@ -258,6 +325,25 @@ IndexStmt scheduleTTVCPUCSR(IndexStmt stmt) { OutputRaceStrategy::NoRaces); } +IndexStmt scheduleTTVCPUCSR_ST(IndexStmt stmt) { + TensorVar result = stmt.as().getStmt().as().getStmt() + .as().getStmt().as().getLhs() + .getTensorVar(); + return stmt.assemble(result, AssembleStrategy::Insert); +} + +IndexStmt scheduleTTVISPCCSR(IndexStmt stmt) { + TensorVar result = stmt.as().getStmt().as().getStmt() + .as().getStmt().as().getLhs() + .getTensorVar(); + return stmt.assemble(result, AssembleStrategy::Insert) + .parallelize(i, ParallelUnit::CPUSimd, OutputRaceStrategy::NoRaces); +} + +IndexStmt scheduleTTVISPCCSR2(IndexStmt stmt) { + return stmt; +} + IndexStmt scheduleTTMCPU(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { IndexVar f("f"), fpos("fpos"), chunk("chunk"), fpos2("fpos2"), kpos("kpos"), kpos1("kpos1"), kpos2("kpos2"); return stmt.fuse(i, j, f) @@ -282,12 +368,47 @@ IndexStmt scheduleMTTKRPCPU(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, .parallelize(i1, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces); } +IndexStmt scheduleMTTKRPCPU_ST(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { + IndexVar i1("i1"), i2("i2"); + IndexExpr precomputeExpr = stmt.as().getStmt().as().getStmt() + .as().getStmt().as().getStmt() + .as().getRhs().as().getA(); + TensorVar w("w", Type(Float64, {Dimension(j)}), taco::dense); + return stmt.split(i, i1, i2, CHUNK_SIZE) + .reorder({i1, i2, k, l, j}) + .precompute(precomputeExpr, j, j, w); + // .parallelize(j, ParallelUnit::CPUVector, OutputRaceStrategy::Atomics); // gives error when lowering for IgnoreRaces, NoRaces and Atomics + // .parallelize(i1, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces); +} + +IndexStmt scheduleMTTKRPISPC(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { + IndexVar i1("i1"), i2("i2"); + IndexExpr precomputeExpr = stmt.as().getStmt().as().getStmt() + .as().getStmt().as().getStmt() + .as().getRhs().as().getA(); + TensorVar w("w", Type(Float64, {Dimension(j)}), taco::dense); + return stmt.split(i, i1, i2, CHUNK_SIZE) + .reorder({i1, i2, k, l, j}) + .precompute(precomputeExpr, j, j, w) + .parallelize(j, ParallelUnit::CPUSimd, OutputRaceStrategy::NoRaces); +} + IndexStmt scheduleMTTKRPPrecomputedCPU(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { IndexVar i1("i1"), i2("i2"), j_pre("j_pre"); return stmt.split(i, i1, i2, CHUNK_SIZE) .parallelize(i1, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces); } +IndexStmt scheduleMTTKRPPrecomputedCPU_ST(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { + IndexVar i1("i1"), i2("i2"), j_pre("j_pre"); + return stmt.split(i, i1, i2, CHUNK_SIZE); +} + +IndexStmt scheduleMTTKRPPrecomputedISPC_ST(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { + IndexVar i1("i1"), i2("i2"), j_pre("j_pre"); + return stmt.parallelize(j, ParallelUnit::CPUSimd, OutputRaceStrategy::NoRaces); +} + IndexStmt scheduleMTTKRP4CPU(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { IndexVar i1("i1"), i2("i2"); return stmt.split(i, i1, i2, CHUNK_SIZE) @@ -295,6 +416,19 @@ IndexStmt scheduleMTTKRP4CPU(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16 .parallelize(i1, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces); } +IndexStmt scheduleMTTKRP4CPU_ST(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { + IndexVar i1("i1"), i2("i2"); + return stmt.split(i, i1, i2, CHUNK_SIZE) + .reorder({i1, i2, k, l, m, j}); +} + +IndexStmt scheduleMTTKRP4ISPC_ST(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { + IndexVar i1("i1"), i2("i2"); + return stmt.split(i, i1, i2, CHUNK_SIZE) + .reorder({i1, i2, k, l, m, j}) + .parallelize(j, ParallelUnit::CPUSimd, OutputRaceStrategy::NoRaces); +} + IndexStmt scheduleMTTKRP5CPU(IndexStmt stmt, Tensor B, int CHUNK_SIZE=16, int UNROLL_FACTOR=8) { IndexVar i1("i1"), i2("i2"); return stmt.split(i, i1, i2, CHUNK_SIZE) @@ -1024,7 +1158,7 @@ TEST(scheduling_eval, sddmmCPU) { IndexStmt stmt = A.getAssignment().concretize(); stmt = scheduleSDDMMCPU(stmt, B); - //printToFile("sddmm_cpu", stmt); + printToFile("sddmm_cpu_ryan2", stmt); A.compile(stmt); A.assemble(); @@ -1038,6 +1172,126 @@ TEST(scheduling_eval, sddmmCPU) { ASSERT_TENSOR_EQ(expected, A); } + +TEST(scheduling_eval, sddmmcsrCPU) { + if (should_use_CUDA_codegen()) { + return; + } + int NUM_I = 1021/10; + int NUM_J = 1039/10; + int NUM_K = 1057/10; + float SPARSITY = .3; + Tensor A("A", {NUM_I, NUM_K}, CSR); + Tensor B("B", {NUM_I, NUM_K}, CSR); + Tensor C("C", {NUM_I, NUM_J}, {Dense, Dense}); + Tensor D("D", {NUM_J, NUM_K}, {Dense, Dense}); + + srand(268238); + for (int i = 0; i < NUM_I; i++) { + for (int j = 0; j < NUM_J; j++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + C.insert({i, j}, (double) ((int) (rand_float*3/SPARSITY))); + } + } + + for (int i = 0; i < NUM_I; i++) { + for (int k = 0; k < NUM_K; k++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + if (rand_float < SPARSITY) { + B.insert({i, k}, (double) ((int) (rand_float*3/SPARSITY))); + } + } + } + + for (int j = 0; j < NUM_J; j++) { + for (int k = 0; k < NUM_K; k++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + D.insert({j, k}, (double) ((int) (rand_float*3/SPARSITY))); + } + } + + B.pack(); + C.pack(); + D.pack(); + + A(i,k) = B(i,k) * C(i,j) * D(j,k); + + IndexStmt stmt = A.getAssignment().concretize(); + stmt = scheduleSDDMMCSRCPU(stmt, B); + + printToFile("sddmm_cpu", stmt); + + A.compile(stmt); + A.assemble(); + A.compute(); + + Tensor expected("expected", {NUM_I, NUM_K}, CSR); + expected(i,k) = B(i,k) * C(i,j) * D(j,k); + + IndexStmt stmt_ref = expected.getAssignment().concretize(); + printToFile("sddmm_cpu_ref", stmt_ref); + + expected.compile(stmt_ref); + expected.assemble(); + expected.compute(); + ASSERT_TENSOR_EQ(expected, A); +} + + +TEST(scheduling_eval, sddmm2CPU) { + if (should_use_CUDA_codegen()) { + return; + } + int NUM_I = 1021/10; + int NUM_J = 1021/10; + int NUM_K = 18; + float SPARSITY = .3; + Tensor Y("Y", {NUM_I, NUM_J}, CSR); + Tensor A("A", {NUM_I, NUM_J}, CSR); + Tensor X("X", {NUM_I, NUM_K}, {Dense, Dense}); + + srand(268238); + + for (int i = 0; i < NUM_I; i++) { + for (int j = 0; j < NUM_J; j++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + if (rand_float < SPARSITY) { + A.insert({i, j}, (double) ((int) (rand_float*3/SPARSITY))); + } + } + } + + for (int i = 0; i < NUM_J; i++) { + for (int k = 0; k < NUM_K; k++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + X.insert({i, k}, (double) ((int) (rand_float*3/SPARSITY))); + } + } + + A.pack(); + X.pack(); + + Y(i,j) = A(i,j) * X(i,k) * X(j,k); + + IndexStmt stmt = A.getAssignment().concretize(); + // stmt = scheduleSDDMMCPU(stmt, B); + + //printToFile("sddmm_cpu", stmt); + + A.compile(stmt); + A.assemble(); + A.compute(); + + Tensor expected("expected", {NUM_I, NUM_J}, {Dense, Dense}); + expected(i,j) = A(i,j) * X(i,k) * X(j,k); + expected.compile(); + expected.assemble(); + expected.compute(); + ASSERT_TENSOR_EQ(expected, A); +} + + + // bin/taco-test --gtest_filter=scheduling_eval.sddmmISPC TEST(scheduling_eval, sddmmISPC) { @@ -1128,6 +1382,89 @@ TEST(scheduling_eval, sddmmISPC) { } + +// bin/taco-test --gtest_filter=scheduling_eval.sddmmISPC +TEST(scheduling_eval, sddmm2ISPC) { + + taco::util::TimeResults timevalue; + bool time = true; + + set_CUDA_codegen_enabled(false); + set_ISPC_codegen_enabled(false); + + int NUM_I = 1021/10; + int NUM_K = 1039/10; + int NUM_J = 1021/10; + float SPARSITY = .3; + Tensor A("A", {NUM_I, NUM_J}, {Dense, Dense}); + Tensor B("B", {NUM_I, NUM_J}, CSR); + Tensor C("C", {NUM_I, NUM_K}, {Dense, Dense}); + + srand(268238); + for (int i = 0; i < NUM_I; i++) { + for (int k = 0; k < NUM_K; k++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + C.insert({i, k}, (double) ((int) (rand_float*3/SPARSITY))); + } + } + + for (int i = 0; i < NUM_I; i++) { + for (int j = 0; j < NUM_J; j++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + if (rand_float < SPARSITY) { + B.insert({i, j}, (double) ((int) (rand_float*3/SPARSITY))); + } + } + } + + B.pack(); + C.pack(); + + set_ISPC_codegen_enabled(true); + A(i,j) = B(i,j) * C(i,k) * C(j,k); + + IndexStmt stmt = A.getAssignment().concretize(); + stmt = scheduleSDDMM2ISPC(stmt, B); + + //printToFile("sddmm_cpu", stmt); + + A.compile(stmt); + A.assemble(); + // A.compute(); + + set_ISPC_codegen_enabled(false); + Tensor expected("expected", {NUM_I, NUM_J}, {Dense, Dense}); + expected(i,j) = B(i,j) * C(i,k) * C(j,k); + IndexStmt stmt_taco = A.getAssignment().concretize(); + stmt_taco = scheduleSDDMM2CPU(stmt_taco, B); + expected.compile(stmt_taco); + expected.assemble(); + // expected.compute(); + + TOOL_BENCHMARK_TIMER(A.compute(), "Compute ISPC: ", timevalue); + TOOL_BENCHMARK_TIMER(expected.compute(), "Compute TACO: ", timevalue); + + ASSERT_TENSOR_EQ(expected, A); + + + float ERROR_MARGIN = 0.01; + // ASSERT_TENSOR_VAL(expected, y); + for (int i = 0; i < NUM_I; i++) { + for (int j = 0; j < NUM_J; j++) { + if (expected(i,j) <= A(i,j) + ERROR_MARGIN && expected(i,j) >= A(i,j) - ERROR_MARGIN) { + // std::cout << "matched values: expected -> " << expected(j) << " == " << y(j) << " <- actual\n"; + } + else { + std::cout << "unmatched values: expected -> " << expected(i,j) << " != " << A(i,j) << " <- actual\n"; + ASSERT_TRUE(false); + }; + } + } + std::cout << "test scheduling_eval.sddmmISPC passed\n"; + +} + + TEST(scheduling_eval, spmvCPU) { if (should_use_CUDA_codegen()) { return; @@ -1215,9 +1552,9 @@ TEST(scheduling_eval, spmvISPC) { y(i) = A(i, j) * x(j); IndexStmt stmt = y.getAssignment().concretize(); - stmt = scheduleSpMVISPC(stmt); + // stmt = scheduleSpMVISPC(stmt); - //printToFile("spmv_cpu", stmt); + printToFile("spmv_cpu", stmt); y.compile(stmt); y.assemble(); @@ -1307,7 +1644,7 @@ TEST(scheduling_eval, ttvCPU) { IndexStmt stmt = A.getAssignment().concretize(); stmt = scheduleTTVCPU(stmt, B); - //printToFile("ttv_cpu", stmt); + printToFile("ttv_cpu", stmt); A.compile(stmt); A.assemble(); @@ -1362,7 +1699,7 @@ TEST(scheduling_eval, ttvISPC) { IndexStmt stmt = A.getAssignment().concretize(); stmt = scheduleTTVISPC(stmt, B); - //printToFile("ttv_cpu", stmt); + printToFile("ttv_ispc", "__ttv_ispc", stmt); A.compile(stmt); A.assemble(); @@ -1390,7 +1727,7 @@ TEST(scheduling_eval, ttvCPU_CSR) { int NUM_K = 1057/10; float SPARSITY = .3; Tensor A("A", {NUM_I, NUM_J}, {Dense, Sparse}); - Tensor B("B", {NUM_I, NUM_J, NUM_K}, {Sparse, Sparse, Sparse}); + Tensor B("B", {NUM_I, NUM_J, NUM_K}, {Dense, Sparse, Sparse}); Tensor c("c", {NUM_K}, Format({Dense})); srand(9536); @@ -1418,11 +1755,13 @@ TEST(scheduling_eval, ttvCPU_CSR) { IndexStmt stmt = A.getAssignment().concretize(); stmt = scheduleTTVCPUCSR(stmt); + printToFile("ttv_cpu_csr", stmt); + A.compile(stmt); A.assemble(); A.compute(); - Tensor expected("expected", {NUM_I, NUM_J}, {Dense, Dense}); + Tensor expected("expected", {NUM_I, NUM_J}, {Dense, Sparse}); expected(i,j) = B(i,j,k) * c(k); expected.compile(); expected.assemble(); @@ -1430,6 +1769,82 @@ TEST(scheduling_eval, ttvCPU_CSR) { ASSERT_TENSOR_EQ(expected, A); } +TEST(scheduling_eval, ttvISPC_CSR) { + if (should_use_CUDA_codegen()) { + return; + } + + int NUM_I = 10000; + int NUM_J = 1039/10; + int NUM_K = 128; + float SPARSITY = .3; + Tensor A("A", {NUM_I, NUM_J}, {Dense, Sparse}); + Tensor B("B", {NUM_I, NUM_J, NUM_K}, {Dense, Sparse, Sparse}); + Tensor c("c", {NUM_K}, Format({Dense})); + + srand(9536); + for (int i = 0; i < NUM_I; i++) { + for (int j = 0; j < NUM_J; j++) { + for (int k = 0; k < NUM_K; k++) { + float rand_float = (float) rand() / (float) (RAND_MAX); + if (rand_float < SPARSITY) { + B.insert({i, j, k}, (double) ((int) (rand_float * 3 / SPARSITY))); + } + } + } + } + + for (int k = 0; k < NUM_K; k++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + c.insert({k}, (double) ((int) (rand_float*3))); + } + + B.pack(); + c.pack(); + + set_ISPC_codegen_enabled(true); + A(i,j) = B(i,j,k) * c(k); + + IndexStmt stmt = A.getAssignment().concretize(); + stmt = scheduleTTVISPCCSR(stmt); + printToFile("ttv_ispc_csr", "__ttv_ispc_csr", stmt); + + A.compile(stmt); + A.assemble(); + A.compute(); + + set_ISPC_codegen_enabled(false); + Tensor expected("expected", {NUM_I, NUM_J}, {Dense, Sparse}); + expected(i,j) = B(i,j,k) * c(k); + IndexStmt taco_stmt = expected.getAssignment().concretize(); + taco_stmt = scheduleTTVCPUCSR_ST(taco_stmt); + expected.compile(taco_stmt); + expected.assemble(); + expected.compute(); + ASSERT_TENSOR_EQ(expected, A); + + Tensor A2("A2", {NUM_I, NUM_J}, {Dense, Sparse}); + set_ISPC_codegen_enabled(true); + A2(i,j) = B(i,j,k) * c(k); + + IndexStmt stmt2 = A2.getAssignment().concretize(); + + A2.compile(stmt2); + A2.assemble(); + A2.compute(); + + taco::util::TimeResults timevalue; + bool time = true; + + for (int i=0; i<3; i++) { + TOOL_BENCHMARK_TIMER(expected.compute(), "Compute TACO1: ", timevalue); + TOOL_BENCHMARK_TIMER(A.compute(), "Compute ISPC1: ", timevalue); + TOOL_BENCHMARK_TIMER(A2.compute(), "Compute ISPC2: ", timevalue); + } + + +} + TEST(scheduling_eval, ttmCPU) { if (should_use_CUDA_codegen()) { return; @@ -1605,12 +2020,13 @@ TEST(scheduling_eval, mttkrpISPC) { if (should_use_CUDA_codegen()) { return; } - int NUM_I = 1021/20; - int NUM_J = 1039/20; + set_ISPC_codegen_enabled(false); + set_CUDA_codegen_enabled(false); + int NUM_I = 10000; // 1021/20; + int NUM_J = 256; int NUM_K = 1057/20; int NUM_L = 1232/20; float SPARSITY = .1; - Tensor A("A", {NUM_I, NUM_J}, {Dense, Dense}); Tensor B("B", {NUM_I, NUM_K, NUM_L}, {Dense, Sparse, Sparse}); Tensor C("C", {NUM_K, NUM_J}, {Dense, Dense}); Tensor D("D", {NUM_L, NUM_J}, {Dense, Dense}); @@ -1645,24 +2061,183 @@ TEST(scheduling_eval, mttkrpISPC) { C.pack(); D.pack(); - A(i,j) = B(i,k,l) * C(k,j) * D(l,j); + set_ISPC_codegen_enabled(true); - IndexStmt stmt = A.getAssignment().concretize(); - stmt = scheduleMTTKRPCPU(stmt, B); - //printToFile("mttkrp_cpu", stmt); + Tensor A1("A1", {NUM_I, NUM_J}, {Dense, Dense}); + A1(i,j) = B(i,k,l) * C(k,j) * D(l,j); + IndexStmt stmt1 = A1.getAssignment().concretize(); + stmt1 = scheduleMTTKRPISPC(stmt1, B); + // printToFile("mttkrp1_cpu_ispc", stmt1); + A1.compile(stmt1); + A1.assemble(); + A1.compute(); - A.compile(stmt); - A.assemble(); - A.compute(); + set_ISPC_codegen_enabled(false); + Tensor expected1("expected1", {NUM_I, NUM_J}, {Dense, Dense}); + expected1(i,j) = B(i,k,l) * C(k,j) * D(l,j); + IndexStmt taco_stmt1 = expected1.getAssignment().concretize(); + taco_stmt1 = scheduleMTTKRPCPU(taco_stmt1, B); + expected1.compile(taco_stmt1); + expected1.assemble(); + expected1.compute(); + ASSERT_TENSOR_EQ(expected1, A1); - Tensor expected("expected", {NUM_I, NUM_J}, {Dense, Dense}); - expected(i,j) = B(i,k,l) * C(k,j) * D(l,j); - expected.compile(); - expected.assemble(); - expected.compute(); - ASSERT_TENSOR_EQ(expected, A); + set_ISPC_codegen_enabled(true); + Tensor A2("A2", {NUM_I, NUM_J}, {Dense, Dense}); + A2(i,j) = B(i,k,l) * C(k,j) * D(l,j); + IndexStmt stmt2 = A1.getAssignment().concretize(); + stmt2 = scheduleMTTKRPPrecomputedISPC_ST(stmt2, B); + // printToFile("mttkrp_cpu_ispc", stmt); + A2.compile(stmt2); + A2.assemble(); + A2.compute(); + ASSERT_TENSOR_EQ(expected1, A2); + + set_ISPC_codegen_enabled(false); + Tensor expected2("expected2", {NUM_I, NUM_J}, {Dense, Dense}); + expected2(i,j) = B(i,k,l) * C(k,j) * D(l,j); + IndexStmt taco_stmt2 = expected2.getAssignment().concretize(); + taco_stmt2 = scheduleMTTKRPPrecomputedCPU_ST(taco_stmt2, B); + expected2.compile(taco_stmt2); + expected2.assemble(); + expected2.compute(); + ASSERT_TENSOR_EQ(expected1, expected2); + + taco::util::TimeResults timevalue; + bool time = true; + + for (int i=0; i<3; i++) { + TOOL_BENCHMARK_TIMER(expected1.compute(), "Compute TACO1: ", timevalue); + TOOL_BENCHMARK_TIMER(A1.compute(), "Compute ISPC1: ", timevalue); + TOOL_BENCHMARK_TIMER(expected2.compute(), "Compute TACO2: ", timevalue); + TOOL_BENCHMARK_TIMER(A2.compute(), "Compute ISPC2: ", timevalue); + } } + +TEST(scheduling_eval, mttkrp4ISPC) { + if (should_use_CUDA_codegen()) { + return; + } + set_ISPC_codegen_enabled(false); + set_CUDA_codegen_enabled(false); + int NUM_I = 1000; // 1021/20; + int NUM_J = 16; + int NUM_K = 1057/20; + int NUM_L = 1232/20; + int NUM_M = 1124/20; + float SPARSITY = .1; + Tensor B("B", {NUM_I, NUM_K, NUM_L, NUM_M}, {Dense, Sparse, Sparse, Sparse}); + Tensor C("C", {NUM_K, NUM_J}, {Dense, Dense}); + Tensor D("D", {NUM_L, NUM_J}, {Dense, Dense}); + Tensor E("E", {NUM_M, NUM_J}, {Dense, Dense}); + + srand(549694); + for (int i = 0; i < NUM_I; i++) { + for (int k = 0; k < NUM_K; k++) { + for (int l = 0; l < NUM_L; l++) { + for (int m = 0; m < NUM_M; m++) { + float rand_float = (float) rand() / (float) (RAND_MAX); + if (rand_float < SPARSITY) { + B.insert({i, k, l, m}, (double) ((int) (rand_float * 3 / SPARSITY))); + } + } + } + } + } + + for (int k = 0; k < NUM_K; k++) { + for (int j = 0; j < NUM_J; j++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + C.insert({k, j}, (double) ((int) (rand_float*3))); + } + } + + for (int l = 0; l < NUM_L; l++) { + for (int j = 0; j < NUM_J; j++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + D.insert({l, j}, (double) ((int) (rand_float*3))); + } + } + + for (int m = 0; m < NUM_M; m++) { + for (int j = 0; j < NUM_J; j++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + E.insert({m, j}, (double) ((int) (rand_float*3))); + } + } + + B.pack(); + C.pack(); + D.pack(); + E.pack(); + + set_ISPC_codegen_enabled(true); + Tensor A1("A1", {NUM_I, NUM_J}, {Dense, Dense}); + A1(i,j) = B(i,k,l,m) * C(k,j) * D(l,j) * E(m,j); + IndexStmt stmt1 = A1.getAssignment().concretize(); + stmt1 = scheduleMTTKRP4ISPC_ST(stmt1, B); + // printToFile("mttkrp1_cpu_ispc", stmt1); + A1.compile(stmt1); + A1.assemble(); + A1.compute(); + + set_ISPC_codegen_enabled(false); + Tensor expected1("expected1", {NUM_I, NUM_J}, {Dense, Dense}); + expected1(i,j) = B(i,k,l,m) * C(k,j) * D(l,j) * E(m,j); + IndexStmt taco_stmt1 = expected1.getAssignment().concretize(); + taco_stmt1 = scheduleMTTKRP4CPU_ST(taco_stmt1, B); + expected1.compile(taco_stmt1); + expected1.assemble(); + expected1.compute(); + ASSERT_TENSOR_EQ(expected1, A1); + + // set_ISPC_codegen_enabled(true); + // Tensor A2("A2", {NUM_I, NUM_J}, {Dense, Dense}); + // A2(i,j) = B(i,k,l) * C(k,j) * D(l,j); + // IndexStmt stmt2 = A1.getAssignment().concretize(); + // stmt2 = scheduleMTTKRPPrecomputedISPC_ST(stmt2, B); + // // printToFile("mttkrp_cpu_ispc", stmt); + // A2.compile(stmt2); + // A2.assemble(); + // A2.compute(); + // ASSERT_TENSOR_EQ(expected1, A2); + + set_ISPC_codegen_enabled(false); + Tensor expected2("expected2", {NUM_I, NUM_J}, {Dense, Dense}); + expected2(i,j) = B(i,k,l,m) * C(k,j) * D(l,j) * E(m,j); + + IndexExpr BE = B(i,k,l,m) * E(m,j); + IndexExpr BDE = BE * D(l, j); + expected2(i,j) = BDE * C(k,j); + IndexStmt taco_stmt2 = expected2.getAssignment().concretize(); + TensorVar BE_workspace("BE_workspace", Type(Float64, {Dimension(j)}), taco::dense); + TensorVar BDE_workspace("BDE_workspace", Type(Float64, {Dimension(j)}), taco::dense); + + IndexStmt precomputed_stmt = forall(i, forall(k, + where(forall(j, expected2(i,j) += BDE_workspace(j) * C(k,j)), + forall(l, where(forall(j, BDE_workspace(j) += BE_workspace(j) * D(l,j)), + forall(m, forall(j, BE_workspace(j) += B(i,k,l,m) * E(m,j)))))))); + + // IndexStmt scheduled2 = scheduleMTTKRPPrecomputedCPU(precomputed_stmt, B, 64); + // expected2.compile(scheduled2); + // expected2.assemble(); + // expected2.compute(); + // ASSERT_TENSOR_EQ(expected1, expected2); + + taco::util::TimeResults timevalue; + bool time = true; + + for (int i=0; i<3; i++) { + TOOL_BENCHMARK_TIMER(expected1.compute(), "Compute TACO1: ", timevalue); + TOOL_BENCHMARK_TIMER(A1.compute(), "Compute ISPC1: ", timevalue); + // TOOL_BENCHMARK_TIMER(expected2.compute(), "Compute TACO2: ", timevalue); + // TOOL_BENCHMARK_TIMER(A2.compute(), "Compute ISPC2: ", timevalue); + } +} + + + TEST(scheduling_eval, spmvGPU) { if (!should_use_CUDA_codegen()) { return; @@ -2042,7 +2617,7 @@ TEST(scheduling_eval, mttkrpGPU) { ASSERT_TENSOR_EQ(expected, A); } -TEST(generate_ispc_evaluation_files, ispc) { +TEST(generate_evaluation_files, ispc) { std::cout << "Hi Adhitha!\n" << std::endl ; set_CUDA_codegen_enabled(false); set_ISPC_codegen_enabled(true); @@ -2063,6 +2638,7 @@ TEST(generate_ispc_evaluation_files, ispc) { int NUM_I = 100; int NUM_J = 100; int NUM_K = 100; + int NUM_L = 100; string c_file_ending = ".h"; string file_ending = ".ispc"; @@ -2130,7 +2706,35 @@ TEST(generate_ispc_evaluation_files, ispc) { ispc_source_file.close(); } - // spmm + // spmm omp + { + stringstream source1; + stringstream source2; + std::shared_ptr codegen = ir::CodeGen::init_default(source1, source2, ir::CodeGen::ImplementationGen); + Tensor A("A", {NUM_I, NUM_J}, CSR); + Tensor X("X", {NUM_J, NUM_K}, {Dense, Dense}); + Tensor Y("Y", {NUM_I, NUM_K}, {Dense, Dense}); + Y(i, k) = A(i, j) * X(j, k); + IndexStmt stmt = Y.getAssignment().concretize(); + bool isFirst = true; + for (auto paramSet : spmm_parameters) { + IndexStmt scheduled = scheduleSpMMISPCOMP1(stmt, A, paramSet[0], paramSet[1]); + ir::Stmt compute = lower(scheduled, string("compute1_") + util::join(paramSet, "_"), false, true); + codegen->compile(compute, isFirst); + isFirst = false; + } + ofstream source_file; + source_file.open(file_path + "spmm_omp_ispc_taco1" + c_file_ending); + source_file << source1.str(); + source_file.close(); + + ofstream ispc_source_file; + ispc_source_file.open(file_path + "__spmm_omp_ispc_taco1" + file_ending); + ispc_source_file << source2.str(); + ispc_source_file.close(); + } + + // spmm2 { stringstream source1; stringstream source2; @@ -2186,6 +2790,64 @@ TEST(generate_ispc_evaluation_files, ispc) { ispc_source_file.close(); } + // ttv + { + stringstream source; + stringstream source2; + std::shared_ptr codegen = ir::CodeGen::init_default(source, source2, ir::CodeGen::ImplementationGen); + Tensor A("A", {NUM_I, NUM_J}, {Dense, Dense}); // TODO: change to sparse outputs + Tensor B("B", {NUM_I, NUM_J, NUM_K}, {Sparse, Sparse, Sparse}); + Tensor c("c", {NUM_K}, Format({Dense})); + A(i,j) = B(i,j,k) * c(k); + IndexStmt stmt = A.getAssignment().concretize(); + bool isFirst = true; + for (auto paramSet : ttv_parameters) { + IndexStmt scheduled = scheduleTTVCPU(stmt, B, paramSet[0]); + ir::Stmt compute = lower(scheduled, string("compute_") + util::join(paramSet, "_"), false, true); + codegen->compile(compute, isFirst); + isFirst = false; + } + ofstream source_file; + source_file.open(file_path + "ttv_cpu" + c_file_ending); + source_file << source.str(); + source_file.close(); + + ofstream ispc_source_file; + ispc_source_file.open(file_path + "__ttv_cpu" + file_ending); + ispc_source_file << source2.str(); + ispc_source_file.close(); + } + + + // mttkrp3 + { + stringstream source; + stringstream source2; + std::shared_ptr codegen = ir::CodeGen::init_default(source, source2, ir::CodeGen::ImplementationGen); + Tensor A("A", {NUM_I, NUM_J}, {Dense, Dense}); + Tensor B("B", {NUM_I, NUM_K, NUM_L}, {Dense, Sparse, Sparse}); + Tensor C("C", {NUM_K, NUM_J}, {Dense, Dense}); + Tensor D("D", {NUM_L, NUM_J}, {Dense, Dense}); + A(i,j) = B(i,k,l) * C(k,j) * D(l,j); + IndexStmt stmt = A.getAssignment().concretize(); + bool isFirst = true; + for (auto paramSet : mttkrp_parameters) { + IndexStmt scheduled = scheduleMTTKRPCPU(stmt, B, paramSet[0], paramSet[1]); + ir::Stmt compute = lower(scheduled, string("compute_") + util::join(paramSet, "_"), false, true); + codegen->compile(compute, isFirst); + isFirst = false; + } + ofstream source_file; + source_file.open(file_path + "mttkrp3_cpu" + c_file_ending); + source_file << source.str(); + source_file.close(); + + ofstream ispc_source_file; + ispc_source_file.open(file_path + "__mttkrp3_cpu" + file_ending); + ispc_source_file << source2.str(); + ispc_source_file.close(); + } + return; } @@ -2283,6 +2945,7 @@ TEST(generate_ispc_sddmm_evaluation_files, ispc) { + TEST(generate_evaluation_files, cpu) { if (should_use_CUDA_codegen()) { return; @@ -2599,7 +3262,7 @@ TEST(generate_evaluation_files, cpu) { } } -TEST(generate_evaluation_files_spmv, ispc) { +TEST(generate_evaluation_files, spmv_ispc) { set_CUDA_codegen_enabled(false); set_ISPC_codegen_enabled(true);