diff --git a/apps/random_pipeline/Makefile b/apps/random_pipeline/Makefile index 072a802c0577..f753e0d7cb5c 100644 --- a/apps/random_pipeline/Makefile +++ b/apps/random_pipeline/Makefile @@ -4,7 +4,7 @@ include ../support/autoscheduler.inc all: $(BIN)/test PIPELINE_SEED ?= 0 -PIPELINE_STAGES ?= 20 +PIPELINE_STAGES ?= 5 HL_RANDOM_DROPOUT ?= 100 HL_SEED ?= 0 HL_BEAM_SIZE ?= 1 diff --git a/apps/random_pipeline/autotune_loop.sh b/apps/random_pipeline/autotune_loop.sh index c32a85e25a4d..3143590b55a2 100755 --- a/apps/random_pipeline/autotune_loop.sh +++ b/apps/random_pipeline/autotune_loop.sh @@ -1,4 +1,4 @@ -# set -x +set -x # Install a watchdog to kill benchmarking processes that take too long bash ./watchdog_bench.sh & @@ -8,6 +8,15 @@ function finish { } trap finish EXIT +SDK_VER="${SDK_VER:-3.4.1}" +HEXAGON_TOOLS_VER="${HEXAGON_TOOLS_VER:-8.2.07}" +DEBUG_LEVEL="${HL_DEBUG_CODEGEN:-0}" +BATCH_SIZE=1 +MAX_STAGES=12 +# BATCH_SIZE=32 +# MAX_STAGES=5 +OUTPUT_FILES="cpp,static_library,h,stmt,assembly,registration" + # Build the generator to autotune. GENERATOR=./bin/random_pipeline.generator PIPELINE=random_pipeline @@ -27,13 +36,12 @@ mkdir -p weights # A batch of this many samples is built in parallel, and then # benchmarked serially. Set to number of cores. 
-BATCH_SIZE=32 HL_TARGET=x86-64-avx2-disable_llvm_loop_vectorize-disable_llvm_loop_unroll-hvx_128 -HEXAGON_SDK_PATH=${HOME}/Qualcomm/Hexagon_SDK +HEXAGON_SDK_ROOT="${HEXAGON_SDK_ROOT:-/local/mnt/workspace/Hexagon_SDK}" HEXAGON_REMOTE_LIB_PATH=`pwd`/../../src/runtime/hexagon_remote/bin/host/ -HEXAGON_LIBS="-L${HEXAGON_SDK_PATH}/3.3.3/tools/HEXAGON_Tools/8.1.05/Tools/lib/iss/ -L${HEXAGON_REMOTE_LIB_PATH} -lhalide_hexagon_host -lwrapper -Wl,-rpath,${HEXAGON_SDK_PATH}/3.3.3/tools/HEXAGON_Tools/8.1.05/Tools/lib/iss/ -Wl,-rpath,${HEXAGON_REMOTE_LIB_PATH}" +HEXAGON_LIBS="-L${HEXAGON_SDK_ROOT}/${SDK_VER}/tools/HEXAGON_Tools/${HEXAGON_TOOLS_VER}/Tools/lib/iss/ -L${HEXAGON_REMOTE_LIB_PATH} -lhalide_hexagon_host -lwrapper -Wl,-rpath,${HEXAGON_SDK_ROOT}/${SDK_VER}/tools/HEXAGON_Tools/${HEXAGON_TOOLS_VER}/Tools/lib/iss/ -Wl,-rpath,${HEXAGON_REMOTE_LIB_PATH}" # Build a single sample of the pipeline with a random schedule make_sample() { @@ -42,10 +50,10 @@ make_sample() { rm -f "${D}/sample.sample" if [[ $D == */0 ]]; then # Sample 0 in each batch is best effort beam search, with no randomness - HL_MACHINE_PARAMS=32,1,1 HL_PERMIT_FAILED_UNROLL=1 HL_SEED=${2} HL_FEATURE_FILE=${D}/sample.sample HL_WEIGHTS_DIR=${PWD}/weights HL_RANDOM_DROPOUT=100 HL_BEAM_SIZE=20 ${GENERATOR} -g ${PIPELINE} -o ${D} -e static_library,h,stmt,assembly,registration target=${HL_TARGET} auto_schedule=true max_stages=12 seed=${3} -p ${PWD}/bin/libauto_schedule.so 2> ${D}/compile_log_stderr.txt > ${D}/compile_log_stdout.txt + HL_DEBUG_CODEGEN=${DEBUG_LEVEL} HL_MACHINE_PARAMS=32,1,1 HL_PERMIT_FAILED_UNROLL=1 HL_SEED=${2} HL_FEATURE_FILE=${D}/sample.sample HL_WEIGHTS_DIR=${PWD}/weights HL_RANDOM_DROPOUT=100 HL_BEAM_SIZE=20 ${GENERATOR} -g ${PIPELINE} -o ${D} -e ${OUTPUT_FILES} target=${HL_TARGET} auto_schedule=true max_stages=${MAX_STAGES} seed=${3} -p ${PWD}/bin/libauto_schedule.so 2> ${D}/compile_log_stderr.txt > ${D}/compile_log_stdout.txt else # The other samples are random probes biased by the cost
model - HL_MACHINE_PARAMS=32,1,1 HL_PERMIT_FAILED_UNROLL=1 HL_SEED=${2} HL_FEATURE_FILE=${D}/sample.sample HL_WEIGHTS_DIR=${PWD}/weights HL_RANDOM_DROPOUT=80 HL_BEAM_SIZE=1 ${GENERATOR} -g ${PIPELINE} -o ${D} -e static_library,h,stmt,assembly,registration target=${HL_TARGET} auto_schedule=true max_stages=12 seed=${3} -p ${PWD}/bin/libauto_schedule.so 2> ${D}/compile_log_stderr.txt > ${D}/compile_log_stdout.txt + HL_DEBUG_CODEGEN=${DEBUG_LEVEL} HL_MACHINE_PARAMS=32,1,1 HL_PERMIT_FAILED_UNROLL=1 HL_SEED=${2} HL_FEATURE_FILE=${D}/sample.sample HL_WEIGHTS_DIR=${PWD}/weights HL_RANDOM_DROPOUT=80 HL_BEAM_SIZE=1 ${GENERATOR} -g ${PIPELINE} -o ${D} -e ${OUTPUT_FILES} target=${HL_TARGET} auto_schedule=true max_stages=${MAX_STAGES} seed=${3} -p ${PWD}/bin/libauto_schedule.so 2> ${D}/compile_log_stderr.txt > ${D}/compile_log_stdout.txt fi c++ -std=c++11 -I ../../include ../../tools/RunGenMain.cpp ${D}/*.registration.cpp ${D}/*.a -o ${D}/bench -ljpeg -ldl -lpthread -lz -lpng ${HEXAGON_LIBS} @@ -65,6 +73,7 @@ benchmark_sample() { FIRST=$(ls ${SAMPLES} | cut -d_ -f2 | sort -n | tail -n1) for ((i=$((FIRST+1));i<1000000;i++)); do +# for ((i=$((FIRST+1));i<$((FIRST+2));i++)); do # Compile a batch of samples using the generator in parallel DIR=${SAMPLES}/batch_${i} diff --git a/apps/random_pipeline/random_pipeline_generator.cpp b/apps/random_pipeline/random_pipeline_generator.cpp index 9a0fbaa9afa3..daa63a18d6d0 100644 --- a/apps/random_pipeline/random_pipeline_generator.cpp +++ b/apps/random_pipeline/random_pipeline_generator.cpp @@ -25,7 +25,7 @@ std::mt19937 rng; // Helpers to generate random values. int rand_int(int min, int max) { return (rng() % (max - min + 1)) + min; } bool rand_bool() { return rng() % 2 == 0; } -float rand_float() { return rand_int(0, 1 << 30) / (float)(1 << 30); } +// float rand_float() { return rand_int(0, 1 << 30) / (float)(1 << 30); } // Generate random expressions. 
Given a vector of expresions and a // tree depth, recursively generates an expression by combining @@ -183,9 +183,11 @@ Expr rand_value(Type t) { return cast(t, rand_int(0,1)); } else if (t.is_int() || t.is_uint()) { return cast(t, rand_int(1, 127)); +#if 0 } else if (t.is_float()) { assert(false); return cast(t, rand_float()); +#endif } else { // Shouldn't get here. assert(false); @@ -202,13 +204,13 @@ Expr random_expr(vector inputs, int depth, int func_size) { Expr result = Internal::simplify(Internal::common_subexpression_elimination(random_expr_inner(inputs, depth, func_size))); - class Checker : public Internal::IRMutator2 { + class Checker : public Internal::IRMutator { public: Expr mutate(const Expr &e) override { exprs_to_find.erase(e); - return IRMutator2::mutate(e); + return IRMutator::mutate(e); } - using Internal::IRMutator2::mutate; + using Internal::IRMutator::mutate; std::set exprs_to_find; Checker(const vector &inputs) { for (const auto &e : inputs) { @@ -463,7 +465,7 @@ class RandomPipeline : public Halide::Generator { activation(f.func.args()) = max(cast(output_type, 0), cast(output_type,f.func(coords))); return {activation, f.w, f.h, f.c}; } - +#if 0 Stage tanh_layer(Stage f) { assert(false); std::cout << "Tanh\n"; @@ -478,7 +480,7 @@ class RandomPipeline : public Halide::Generator { activation(f.func.args()) = (exp_pos - 1) / (exp_pos + 1); return {activation, f.w, f.h, f.c}; } - +#endif Stage pool2D_unrolled(Stage f, int kernel_min, int kernel_max) { vector args = f.func.args(); Func pooled2D("pooled2D" + args[0].name() + args[1].name()); @@ -514,7 +516,8 @@ class RandomPipeline : public Halide::Generator { } } - if (!def.type().is_bool()) { + if (!def.type().is_bool() && def.type().bits() < 32) { + // TODO(aankit): Change scaling def /= scale; } @@ -548,7 +551,11 @@ class RandomPipeline : public Halide::Generator { pooled2D_r(args) = const_true(); pooled2D_r(args) = pooled2D_r(args) && f.func(coords); } else { - pooled2D_r(args) += 
f.func(coords) / scale; + // TODO(aankit): Change scaling + if (ty.bits() < 32) + pooled2D_r(args) += f.func(coords) / scale; + else + pooled2D_r(args) += f.func(coords); } return {pooled2D_r, (f.w + stride - 1) / stride, (f.h + stride - 1) / stride, f.c}; @@ -577,7 +584,12 @@ class RandomPipeline : public Halide::Generator { #if 0 pooled2D_w(args) = sum(cast(f.func(coords))) / scale; #else - pooled2D_w(args) = sum(cast(f.func(coords))) / scale; + // TODO(aankit): Change scaling + Expr val = sum(cast(f.func(coords))); + if (val.type().bits() < 32) + pooled2D_w(args) = val / scale; + else + pooled2D_w(args) = val; #endif return {pooled2D_w, (f.w + stride - 1) / stride, (f.h + stride - 1) / stride, f.c}; } @@ -730,7 +742,11 @@ class RandomPipeline : public Halide::Generator { s1 = cast(sum_type, s1); s2 = cast(sum_type, s2); - resampled(f.func.args()) = cast(input_type, ((factor - x) * s1 + x * s2) / (2*factor)); + // TODO(aankit): Change scaling + if (input_type.bits() < 32) + resampled(f.func.args()) = cast(input_type, ((factor - x) * s1 + x * s2) / (2*factor)); + else + resampled(f.func.args()) = cast(input_type, ((factor - x) * s1 + x * s2)); } Stage s {resampled, f.w, f.h, f.c}; @@ -803,7 +819,7 @@ class RandomPipeline : public Halide::Generator { binary(f.func.args()) = def; return {binary, f.w, f.h, std::min(f.c, g.c)}; } - +#if 0 Stage unary_op(Stage f) { std::cout << "Unary op\n"; Func unary("unary_op"); @@ -822,7 +838,7 @@ class RandomPipeline : public Halide::Generator { } return {unary, f.w, f.h, f.c}; } - +#endif // Generate an all-to-all communication in dimension dim, // statically unrolled. Currently only every applied over the // channels dimension. 
@@ -832,7 +848,8 @@ class RandomPipeline : public Halide::Generator { if (f.c > 16) return all_to_all_r(f, dim); vector reduction_coords = make_arguments(f.func.args()); - Expr e = 0.f; + // Expr e = 0.f; + Expr e = 0; for (int i = 0; i < f.c; i++) { reduction_coords[dim] = i; e += f.func(reduction_coords) * ((i + 1) * f.c + (f.func.args()[dim] + 1)); @@ -1037,9 +1054,9 @@ class RandomPipeline : public Halide::Generator { } else if (stage_type == 12) { int dim = rand_int(0, 2); return scan(f, dim); +#if 0 // Uses types not available on HVX } else if (stage_type == 13 && f.size() < 10000) { return unary_op(f); -#if 0 // Uses types not available on HVX } else if (stage_type == 14 && f.w > 32 && f.h > 32) { return tiled_histogram(f); #endif @@ -1061,33 +1078,40 @@ class RandomPipeline : public Halide::Generator { Func first; first(x, y, c) = input(x, y, c); + int W=300; + int H=300; vector stages; // Assume input starts at ~2000x2000 - stages.emplace_back(Stage{first, 2000, 2000, 3}); + // stages.emplace_back(Stage{first, 2000, 2000, 3}); + stages.emplace_back(Stage{first, W, H, 3}); for (int i = 0; i < max_stages - 2; i++) { std::cout << "Approx size: " << stages.back().w << ", " << stages.back().h << ", " << stages.back().c << "\n"; Stage next = random_stage(stages); stages.push_back(next); if (!auto_schedule) { - stages.back().func.hexagon().compute_root().reorder(x, c, y).vectorize(x, 8).parallel(y, 8); + stages.back().func.compute_root().reorder(x, c, y).vectorize(x, 8).parallel(y, 8); + if (get_target().features_any_of({Target::HVX_64, Target::HVX_128})) + stages.back().func.hexagon(); } } Stage tail = stages.back(); // Resample back to the correct resolution - tail = resample_to(tail, 2000, 2000, 3); + // tail = resample_to(tail, 2000, 2000, 3); + tail = resample_to(tail, W, H, 3); Stage casted = cast_stage(output.type(), tail); output = casted.func; if (!auto_schedule) { - output.hexagon().compute_root().reorder(x, c, y).vectorize(x, 8).parallel(y); + 
output.compute_root().reorder(x, c, y).vectorize(x, 8).parallel(y); + if (get_target().features_any_of({Target::HVX_64, Target::HVX_128})) + output.hexagon(); } - if (auto_schedule) { - input.dim(0).set_bounds_estimate(0, 2000) - .dim(1).set_bounds_estimate(0, 2000) + input.dim(0).set_bounds_estimate(0, W) + .dim(1).set_bounds_estimate(0, H) .dim(2).set_bounds_estimate(0, 3); uint8_weights.dim(0).set_bounds_estimate(0, 512) .dim(1).set_bounds_estimate(-5, 5) @@ -1120,12 +1144,12 @@ class RandomPipeline : public Halide::Generator { .dim(3).set_bounds_estimate(0, 512); #endif - output.estimate(output.args()[0], 0, 2000); - output.estimate(output.args()[1], 0, 2000); + output.estimate(output.args()[0], 0, W); + output.estimate(output.args()[1], 0, H); output.estimate(output.args()[2], 0, 3); - output.dim(0).set_bounds_estimate(0, 2000); - output.dim(1).set_bounds_estimate(0, 2000); + output.dim(0).set_bounds_estimate(0, W); + output.dim(1).set_bounds_estimate(0, H); output.dim(2).set_bounds_estimate(0, 3); } } diff --git a/apps/random_pipeline/test.cpp b/apps/random_pipeline/test.cpp index a59809623921..2e4e39ba1184 100644 --- a/apps/random_pipeline/test.cpp +++ b/apps/random_pipeline/test.cpp @@ -27,7 +27,7 @@ void *my_malloc(void *ucon, size_t sz) { } int main(int argc, char **argv) { - Buffer output(2000, 2000, 3); + Buffer output(300, 300, 3); for (int y = 0; y < output.height(); y++) { for (int x = 0; x < output.width(); x++) { @@ -37,14 +37,13 @@ int main(int argc, char **argv) { } } - Buffer input; + Buffer input; Buffer uint8_weights; Buffer uint16_weights; Buffer uint32_weights; Buffer int8_weights; Buffer int16_weights; Buffer int32_weights; - Buffer float32_weights; assert(input.is_bounds_query()); assert(uint8_weights.is_bounds_query()); @@ -53,7 +52,6 @@ int main(int argc, char **argv) { assert(int8_weights.is_bounds_query()); assert(int16_weights.is_bounds_query()); assert(int32_weights.is_bounds_query()); - assert(float32_weights.is_bounds_query()); 
random_pipeline(input, uint8_weights, @@ -62,11 +60,10 @@ int main(int argc, char **argv) { int8_weights, int16_weights, int32_weights, - float32_weights, output); input.allocate(); - input.fill(0.0f); + input.fill(0); uint8_weights.allocate(); uint8_weights.fill(0); uint16_weights.allocate(); @@ -79,8 +76,6 @@ int main(int argc, char **argv) { int16_weights.fill(0); int32_weights.allocate(); int32_weights.fill(0); - float32_weights.allocate(); - float32_weights.fill(0); printf("Input size: %d %d %d\n", input.width(), input.height(), input.channels()); @@ -95,7 +90,6 @@ int main(int argc, char **argv) { int8_weights, int16_weights, int32_weights, - float32_weights, output); }, config); printf("Time: %g\n", best * 1e3); diff --git a/src/IRVisitor.cpp b/src/IRVisitor.cpp index 7fc3c8480bab..c42f68ad7325 100644 --- a/src/IRVisitor.cpp +++ b/src/IRVisitor.cpp @@ -1,8 +1,11 @@ #include "IRVisitor.h" +#include "IRPrinter.h" namespace Halide { namespace Internal { +Halide::Internal::IRPrinter irp(std::cerr); + IRVisitor::~IRVisitor() { } diff --git a/src/Pipeline.cpp b/src/Pipeline.cpp index b88b93572f22..f3b1872cf205 100644 --- a/src/Pipeline.cpp +++ b/src/Pipeline.cpp @@ -164,7 +164,8 @@ string Pipeline::auto_schedule(const Target &target, const MachineParams &arch_p } user_assert(target.arch == Target::X86 || target.arch == Target::ARM || - target.arch == Target::POWERPC || target.arch == Target::MIPS) + target.arch == Target::POWERPC || target.arch == Target::MIPS || + target.arch == Target::Hexagon) << "Automatic scheduling is currently supported only on these architectures."; return generate_schedules(contents->outputs, target, arch_params); } diff --git a/src/runtime/hexagon_remote/Makefile b/src/runtime/hexagon_remote/Makefile index 3f58b7d76047..976404dfb003 100644 --- a/src/runtime/hexagon_remote/Makefile +++ b/src/runtime/hexagon_remote/Makefile @@ -60,7 +60,7 @@ CCFLAGS-arm-32-android = --sysroot ${ANDROID_NDK_ROOT}/platforms/android-21/arch CCFLAGS-host := 
${CCFLAGS} -I../ -I ${HEXAGON_TOOLS_ROOT}/Tools/include/iss/ -fPIC \ -L${HEXAGON_TOOLS_ROOT}/Tools/lib/iss/ -lwrapper -CCFLAGS-v60 := $(CCFLAGS-v60) -I ${HEXAGON_SDK_LIBS}/common/rtld/ship/hexagon_Release_toolv80_v62 ${COMMON_CCFLAGS} -I ${HEXAGON_SDK_INCLUDES} -I ${HEXAGON_SDK_LIBS}/common/qurt/ADSPv60MP/include +CCFLAGS-v60 := $(CCFLAGS-v60) -I ${HEXAGON_SDK_LIBS}/common/rtld/ship/hexagon_Release_toolv80_v62 ${COMMON_CCFLAGS} -I ${HEXAGON_SDK_INCLUDES} -I ${HEXAGON_SDK_LIBS}/common/qurt/ADSPv60MP/include/qurt -mhvx -mhvx-length=128B CCFLAGS-arm-64-android := $(CCFLAGS-arm-64-android) ${COMMON_CCFLAGS} -llog -fPIE -pie CCFLAGS-arm-32-android := $(CCFLAGS-arm-32-android) ${COMMON_CCFLAGS} -llog -fPIE -pie