Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion apps/random_pipeline/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ include ../support/autoscheduler.inc
all: $(BIN)/test

PIPELINE_SEED ?= 0
PIPELINE_STAGES ?= 20
PIPELINE_STAGES ?= 5
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess making a change to this default is not needed, as we don't really use it.

HL_RANDOM_DROPOUT ?= 100
HL_SEED ?= 0
HL_BEAM_SIZE ?= 1
Expand Down
21 changes: 15 additions & 6 deletions apps/random_pipeline/autotune_loop.sh
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# set -x
set -x

# Install a watchdog to kill benchmarking processes that take too long
bash ./watchdog_bench.sh &
Expand All @@ -8,6 +8,15 @@ function finish {
}
trap finish EXIT

SDK_VER="${SDK_VER:-3.4.1}"
HEXAGON_TOOLS_VER="${HEXAGON_TOOLS_VER:-8.2.07}"
DEBUG_LEVEL="${HL_DEBUG_CODEGEN:-0}"
BATCH_SIZE=1
MAX_STAGES=12
# BATCH_SIZE=32
# MAX_STAGES=5
OUTPUT_FILES="cpp,static_library,h,stmt,assembly,registration"

# Build the generator to autotune.
GENERATOR=./bin/random_pipeline.generator
PIPELINE=random_pipeline
Expand All @@ -27,13 +36,12 @@ mkdir -p weights

# A batch of this many samples is built in parallel, and then
# benchmarked serially. Set to number of cores.
BATCH_SIZE=32

HL_TARGET=x86-64-avx2-disable_llvm_loop_vectorize-disable_llvm_loop_unroll-hvx_128

HEXAGON_SDK_PATH=${HOME}/Qualcomm/Hexagon_SDK
HEXAGON_SDK_ROOT="${HEXAGON_SDK_ROOT:-/local/mnt/workspace/Hexagon_SDK}"
HEXAGON_REMOTE_LIB_PATH=`pwd`/../../src/runtime/hexagon_remote/bin/host/
HEXAGON_LIBS="-L${HEXAGON_SDK_PATH}/3.3.3/tools/HEXAGON_Tools/8.1.05/Tools/lib/iss/ -L${HEXAGON_REMOTE_LIB_PATH} -lhalide_hexagon_host -lwrapper -Wl,-rpath,${HEXAGON_SDK_PATH}/3.3.3/tools/HEXAGON_Tools/8.1.05/Tools/lib/iss/ -Wl,-rpath,${HEXAGON_REMOTE_LIB_PATH}"
# Linker flags for the Hexagon simulator libraries and the remote-host runtime.
# Fix: removed a stray "i" that had crept into the -rpath tools path
# (".../HEXAGON_Tools/i${HEXAGON_TOOLS_VER}/..."), which made the embedded
# rpath point at a nonexistent directory even though the -L search path was
# correct, so the benchmarks would fail to find libwrapper at run time.
HEXAGON_LIBS="-L${HEXAGON_SDK_ROOT}/${SDK_VER}/tools/HEXAGON_Tools/${HEXAGON_TOOLS_VER}/Tools/lib/iss/ -L${HEXAGON_REMOTE_LIB_PATH} -lhalide_hexagon_host -lwrapper -Wl,-rpath,${HEXAGON_SDK_ROOT}/${SDK_VER}/tools/HEXAGON_Tools/${HEXAGON_TOOLS_VER}/Tools/lib/iss/ -Wl,-rpath,${HEXAGON_REMOTE_LIB_PATH}"

# Build a single sample of the pipeline with a random schedule
make_sample() {
Expand All @@ -42,10 +50,10 @@ make_sample() {
rm -f "${D}/sample.sample"
if [[ $D == */0 ]]; then
# Sample 0 in each batch is best effort beam search, with no randomness
HL_MACHINE_PARAMS=32,1,1 HL_PERMIT_FAILED_UNROLL=1 HL_SEED=${2} HL_FEATURE_FILE=${D}/sample.sample HL_WEIGHTS_DIR=${PWD}/weights HL_RANDOM_DROPOUT=100 HL_BEAM_SIZE=20 ${GENERATOR} -g ${PIPELINE} -o ${D} -e static_library,h,stmt,assembly,registration target=${HL_TARGET} auto_schedule=true max_stages=12 seed=${3} -p ${PWD}/bin/libauto_schedule.so 2> ${D}/compile_log_stderr.txt > ${D}/compile_log_stdout.txt
HL_DEBUG_CODEGEN=${DEBUG_LEVEL} HL_MACHINE_PARAMS=32,1,1 HL_PERMIT_FAILED_UNROLL=1 HL_SEED=${2} HL_FEATURE_FILE=${D}/sample.sample HL_WEIGHTS_DIR=${PWD}/weights HL_RANDOM_DROPOUT=100 HL_BEAM_SIZE=20 ${GENERATOR} -g ${PIPELINE} -o ${D} -e ${OUTPUT_FILES} target=${HL_TARGET} auto_schedule=true max_stages=${MAX_STAGES} seed=${3} -p ${PWD}/bin/libauto_schedule.so 2> ${D}/compile_log_stderr.txt > ${D}/compile_log_stdout.txt
else
# The other samples are random probes biased by the cost model
HL_MACHINE_PARAMS=32,1,1 HL_PERMIT_FAILED_UNROLL=1 HL_SEED=${2} HL_FEATURE_FILE=${D}/sample.sample HL_WEIGHTS_DIR=${PWD}/weights HL_RANDOM_DROPOUT=80 HL_BEAM_SIZE=1 ${GENERATOR} -g ${PIPELINE} -o ${D} -e static_library,h,stmt,assembly,registration target=${HL_TARGET} auto_schedule=true max_stages=12 seed=${3} -p ${PWD}/bin/libauto_schedule.so 2> ${D}/compile_log_stderr.txt > ${D}/compile_log_stdout.txt
HL_DEBUG_CODEGEN=${DEBUG_LEVEL} HL_MACHINE_PARAMS=32,1,1 HL_PERMIT_FAILED_UNROLL=1 HL_SEED=${2} HL_FEATURE_FILE=${D}/sample.sample HL_WEIGHTS_DIR=${PWD}/weights HL_RANDOM_DROPOUT=80 HL_BEAM_SIZE=1 ${GENERATOR} -g ${PIPELINE} -o ${D} -e ${OUTPUT_FILES} target=${HL_TARGET} auto_schedule=true max_stages=${MAX_STAGES} seed=${3} -p ${PWD}/bin/libauto_schedule.so 2> ${D}/compile_log_stderr.txt > ${D}/compile_log_stdout.txt
fi

c++ -std=c++11 -I ../../include ../../tools/RunGenMain.cpp ${D}/*.registration.cpp ${D}/*.a -o ${D}/bench -ljpeg -ldl -lpthread -lz -lpng ${HEXAGON_LIBS}
Expand All @@ -65,6 +73,7 @@ benchmark_sample() {
FIRST=$(ls ${SAMPLES} | cut -d_ -f2 | sort -n | tail -n1)

for ((i=$((FIRST+1));i<1000000;i++)); do
# for ((i=$((FIRST+1));i<$((FIRST+2));i++)); do
# Compile a batch of samples using the generator in parallel
DIR=${SAMPLES}/batch_${i}

Expand Down
74 changes: 49 additions & 25 deletions apps/random_pipeline/random_pipeline_generator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ std::mt19937 rng;
// Helpers to generate random values.
// Pseudo-random integer drawn uniformly-ish from [min, max], inclusive, using
// the file-global `rng` engine. (Modulo reduction carries a slight bias; that
// is acceptable here — callers only need variety, not statistical rigor.)
int rand_int(int min, int max) {
    const int span = max - min + 1;
    return min + static_cast<int>(rng() % span);
}
// Coin flip derived from the low bit of the file-global `rng` engine's output.
bool rand_bool() {
    return (rng() & 1u) == 0u;
}
float rand_float() { return rand_int(0, 1 << 30) / (float)(1 << 30); }
// float rand_float() { return rand_int(0, 1 << 30) / (float)(1 << 30); }

// Generate random expressions. Given a vector of expresions and a
// tree depth, recursively generates an expression by combining
Expand Down Expand Up @@ -183,9 +183,11 @@ Expr rand_value(Type t) {
return cast(t, rand_int(0,1));
} else if (t.is_int() || t.is_uint()) {
return cast(t, rand_int(1, 127));
#if 0
} else if (t.is_float()) {
assert(false);
return cast(t, rand_float());
#endif
} else {
// Shouldn't get here.
assert(false);
Expand All @@ -202,13 +204,13 @@ Expr random_expr(vector<Expr> inputs, int depth, int func_size) {
Expr result =
Internal::simplify(Internal::common_subexpression_elimination(random_expr_inner(inputs, depth, func_size)));

class Checker : public Internal::IRMutator2 {
class Checker : public Internal::IRMutator {
public:
Expr mutate(const Expr &e) override {
exprs_to_find.erase(e);
return IRMutator2::mutate(e);
return IRMutator::mutate(e);
}
using Internal::IRMutator2::mutate;
using Internal::IRMutator::mutate;
std::set<Expr, Internal::IRDeepCompare> exprs_to_find;
Checker(const vector<Expr> &inputs) {
for (const auto &e : inputs) {
Expand Down Expand Up @@ -463,7 +465,7 @@ class RandomPipeline : public Halide::Generator<RandomPipeline> {
activation(f.func.args()) = max(cast(output_type, 0), cast(output_type,f.func(coords)));
return {activation, f.w, f.h, f.c};
}

#if 0
Stage tanh_layer(Stage f) {
assert(false);
std::cout << "Tanh\n";
Expand All @@ -478,7 +480,7 @@ class RandomPipeline : public Halide::Generator<RandomPipeline> {
activation(f.func.args()) = (exp_pos - 1) / (exp_pos + 1);
return {activation, f.w, f.h, f.c};
}

#endif
Stage pool2D_unrolled(Stage f, int kernel_min, int kernel_max) {
vector<Var> args = f.func.args();
Func pooled2D("pooled2D" + args[0].name() + args[1].name());
Expand Down Expand Up @@ -514,7 +516,8 @@ class RandomPipeline : public Halide::Generator<RandomPipeline> {
}
}

if (!def.type().is_bool()) {
if (!def.type().is_bool() && def.type().bits() < 32) {
// TODO(aankit): Change scaling
def /= scale;
}

Expand Down Expand Up @@ -548,7 +551,11 @@ class RandomPipeline : public Halide::Generator<RandomPipeline> {
pooled2D_r(args) = const_true();
pooled2D_r(args) = pooled2D_r(args) && f.func(coords);
} else {
pooled2D_r(args) += f.func(coords) / scale;
// TODO(aankit): Change scaling
if (ty.bits() < 32)
pooled2D_r(args) += f.func(coords) / scale;
else
pooled2D_r(args) += f.func(coords);
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In case of >= 32 bit, should we cast it down to 16 bit and retain the division by a constant for now? We would risk truncation, but would not lose the scaling completely.

}

return {pooled2D_r, (f.w + stride - 1) / stride, (f.h + stride - 1) / stride, f.c};
Expand Down Expand Up @@ -577,7 +584,12 @@ class RandomPipeline : public Halide::Generator<RandomPipeline> {
#if 0
pooled2D_w(args) = sum(cast<float>(f.func(coords))) / scale;
#else
pooled2D_w(args) = sum(cast<int32_t>(f.func(coords))) / scale;
// TODO(aankit): Change scaling
Expr val = sum(cast<int32_t>(f.func(coords)));
if (val.type().bits() < 32)
pooled2D_w(args) = val / scale;
else
pooled2D_w(args) = val;
#endif
return {pooled2D_w, (f.w + stride - 1) / stride, (f.h + stride - 1) / stride, f.c};
}
Expand Down Expand Up @@ -730,7 +742,11 @@ class RandomPipeline : public Halide::Generator<RandomPipeline> {
s1 = cast(sum_type, s1);
s2 = cast(sum_type, s2);

resampled(f.func.args()) = cast(input_type, ((factor - x) * s1 + x * s2) / (2*factor));
// TODO(aankit): Change scaling
if (input_type.bits() < 32)
resampled(f.func.args()) = cast(input_type, ((factor - x) * s1 + x * s2) / (2*factor));
else
resampled(f.func.args()) = cast(input_type, ((factor - x) * s1 + x * s2));
}

Stage s {resampled, f.w, f.h, f.c};
Expand Down Expand Up @@ -803,7 +819,7 @@ class RandomPipeline : public Halide::Generator<RandomPipeline> {
binary(f.func.args()) = def;
return {binary, f.w, f.h, std::min(f.c, g.c)};
}

#if 0
Stage unary_op(Stage f) {
std::cout << "Unary op\n";
Func unary("unary_op");
Expand All @@ -822,7 +838,7 @@ class RandomPipeline : public Halide::Generator<RandomPipeline> {
}
return {unary, f.w, f.h, f.c};
}

#endif
// Generate an all-to-all communication in dimension dim,
// statically unrolled. Currently only every applied over the
// channels dimension.
Expand All @@ -832,7 +848,8 @@ class RandomPipeline : public Halide::Generator<RandomPipeline> {
if (f.c > 16) return all_to_all_r(f, dim);

vector<Expr> reduction_coords = make_arguments(f.func.args());
Expr e = 0.f;
// Expr e = 0.f;
Expr e = 0;
for (int i = 0; i < f.c; i++) {
reduction_coords[dim] = i;
e += f.func(reduction_coords) * ((i + 1) * f.c + (f.func.args()[dim] + 1));
Expand Down Expand Up @@ -1037,9 +1054,9 @@ class RandomPipeline : public Halide::Generator<RandomPipeline> {
} else if (stage_type == 12) {
int dim = rand_int(0, 2);
return scan(f, dim);
#if 0 // Uses types not available on HVX
} else if (stage_type == 13 && f.size() < 10000) {
return unary_op(f);
#if 0 // Uses types not available on HVX
} else if (stage_type == 14 && f.w > 32 && f.h > 32) {
return tiled_histogram(f);
#endif
Expand All @@ -1061,33 +1078,40 @@ class RandomPipeline : public Halide::Generator<RandomPipeline> {
Func first;
first(x, y, c) = input(x, y, c);

int W=300;
int H=300;
vector<Stage> stages;
// Assume input starts at ~2000x2000
stages.emplace_back(Stage{first, 2000, 2000, 3});
// stages.emplace_back(Stage{first, 2000, 2000, 3});
stages.emplace_back(Stage{first, W, H, 3});

for (int i = 0; i < max_stages - 2; i++) {
std::cout << "Approx size: " << stages.back().w << ", " << stages.back().h << ", " << stages.back().c << "\n";
Stage next = random_stage(stages);
stages.push_back(next);
if (!auto_schedule) {
stages.back().func.hexagon().compute_root().reorder(x, c, y).vectorize(x, 8).parallel(y, 8);
stages.back().func.compute_root().reorder(x, c, y).vectorize(x, 8).parallel(y, 8);
if (get_target().features_any_of({Target::HVX_64, Target::HVX_128}))
stages.back().func.hexagon();
}
}

Stage tail = stages.back();

// Resample back to the correct resolution
tail = resample_to(tail, 2000, 2000, 3);
// tail = resample_to(tail, 2000, 2000, 3);
tail = resample_to(tail, W, H, 3);
Stage casted = cast_stage(output.type(), tail);
output = casted.func;

if (!auto_schedule) {
output.hexagon().compute_root().reorder(x, c, y).vectorize(x, 8).parallel(y);
output.compute_root().reorder(x, c, y).vectorize(x, 8).parallel(y);
if (get_target().features_any_of({Target::HVX_64, Target::HVX_128}))
output.hexagon();
}

if (auto_schedule) {
input.dim(0).set_bounds_estimate(0, 2000)
.dim(1).set_bounds_estimate(0, 2000)
input.dim(0).set_bounds_estimate(0, W)
.dim(1).set_bounds_estimate(0, H)
.dim(2).set_bounds_estimate(0, 3);
uint8_weights.dim(0).set_bounds_estimate(0, 512)
.dim(1).set_bounds_estimate(-5, 5)
Expand Down Expand Up @@ -1120,12 +1144,12 @@ class RandomPipeline : public Halide::Generator<RandomPipeline> {
.dim(3).set_bounds_estimate(0, 512);
#endif

output.estimate(output.args()[0], 0, 2000);
output.estimate(output.args()[1], 0, 2000);
output.estimate(output.args()[0], 0, W);
output.estimate(output.args()[1], 0, H);
output.estimate(output.args()[2], 0, 3);

output.dim(0).set_bounds_estimate(0, 2000);
output.dim(1).set_bounds_estimate(0, 2000);
output.dim(0).set_bounds_estimate(0, W);
output.dim(1).set_bounds_estimate(0, H);
output.dim(2).set_bounds_estimate(0, 3);
}
}
Expand Down
12 changes: 3 additions & 9 deletions apps/random_pipeline/test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ void *my_malloc(void *ucon, size_t sz) {
}

int main(int argc, char **argv) {
Buffer<float> output(2000, 2000, 3);
Buffer<int32_t> output(300, 300, 3);

for (int y = 0; y < output.height(); y++) {
for (int x = 0; x < output.width(); x++) {
Expand All @@ -37,14 +37,13 @@ int main(int argc, char **argv) {
}
}

Buffer<float> input;
Buffer<int32_t> input;
Buffer<uint8_t> uint8_weights;
Buffer<uint16_t> uint16_weights;
Buffer<uint32_t> uint32_weights;
Buffer<int8_t> int8_weights;
Buffer<int16_t> int16_weights;
Buffer<int32_t> int32_weights;
Buffer<float> float32_weights;

assert(input.is_bounds_query());
assert(uint8_weights.is_bounds_query());
Expand All @@ -53,7 +52,6 @@ int main(int argc, char **argv) {
assert(int8_weights.is_bounds_query());
assert(int16_weights.is_bounds_query());
assert(int32_weights.is_bounds_query());
assert(float32_weights.is_bounds_query());

random_pipeline(input,
uint8_weights,
Expand All @@ -62,11 +60,10 @@ int main(int argc, char **argv) {
int8_weights,
int16_weights,
int32_weights,
float32_weights,
output);

input.allocate();
input.fill(0.0f);
input.fill(0);
uint8_weights.allocate();
uint8_weights.fill(0);
uint16_weights.allocate();
Expand All @@ -79,8 +76,6 @@ int main(int argc, char **argv) {
int16_weights.fill(0);
int32_weights.allocate();
int32_weights.fill(0);
float32_weights.allocate();
float32_weights.fill(0);

printf("Input size: %d %d %d\n", input.width(), input.height(), input.channels());

Expand All @@ -95,7 +90,6 @@ int main(int argc, char **argv) {
int8_weights,
int16_weights,
int32_weights,
float32_weights,
output);
}, config);
printf("Time: %g\n", best * 1e3);
Expand Down
3 changes: 3 additions & 0 deletions src/IRVisitor.cpp
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
#include "IRVisitor.h"
#include "IRPrinter.h"

namespace Halide {
namespace Internal {

// NOTE(review): file-scope IRPrinter bound to std::cerr. Nothing visible in
// this change references `irp`, so it looks like leftover debug scaffolding
// from development — confirm it is still needed before landing (it is also
// the only reason for the newly added IRPrinter.h include).
Halide::Internal::IRPrinter irp(std::cerr);

// Out-of-line (empty) destructor for the visitor base class.
// NOTE(review): presumably kept out-of-line to anchor the class's vtable
// emission in this translation unit — confirm against IRVisitor.h before
// ever inlining or defaulting it in the header.
IRVisitor::~IRVisitor() {
}

Expand Down
3 changes: 2 additions & 1 deletion src/Pipeline.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,8 @@ string Pipeline::auto_schedule(const Target &target, const MachineParams &arch_p
}

user_assert(target.arch == Target::X86 || target.arch == Target::ARM ||
target.arch == Target::POWERPC || target.arch == Target::MIPS)
target.arch == Target::POWERPC || target.arch == Target::MIPS ||
target.arch == Target::Hexagon)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since we are proceeding with offload mode first, Target::Hexagon is not supported here yet.

<< "Automatic scheduling is currently supported only on these architectures.";
return generate_schedules(contents->outputs, target, arch_params);
}
Expand Down
2 changes: 1 addition & 1 deletion src/runtime/hexagon_remote/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ CCFLAGS-arm-32-android = --sysroot ${ANDROID_NDK_ROOT}/platforms/android-21/arch
CCFLAGS-host := ${CCFLAGS} -I../ -I ${HEXAGON_TOOLS_ROOT}/Tools/include/iss/ -fPIC \
-L${HEXAGON_TOOLS_ROOT}/Tools/lib/iss/ -lwrapper

CCFLAGS-v60 := $(CCFLAGS-v60) -I ${HEXAGON_SDK_LIBS}/common/rtld/ship/hexagon_Release_toolv80_v62 ${COMMON_CCFLAGS} -I ${HEXAGON_SDK_INCLUDES} -I ${HEXAGON_SDK_LIBS}/common/qurt/ADSPv60MP/include
CCFLAGS-v60 := $(CCFLAGS-v60) -I ${HEXAGON_SDK_LIBS}/common/rtld/ship/hexagon_Release_toolv80_v62 ${COMMON_CCFLAGS} -I ${HEXAGON_SDK_INCLUDES} -I ${HEXAGON_SDK_LIBS}/common/qurt/ADSPv60MP/include/qurt -mhvx -mhvx-length=128B

CCFLAGS-arm-64-android := $(CCFLAGS-arm-64-android) ${COMMON_CCFLAGS} -llog -fPIE -pie
CCFLAGS-arm-32-android := $(CCFLAGS-arm-32-android) ${COMMON_CCFLAGS} -llog -fPIE -pie
Expand Down