Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion apps/random_pipeline/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ include ../support/autoscheduler.inc
all: $(BIN)/test

PIPELINE_SEED ?= 0
PIPELINE_STAGES ?= 20
PIPELINE_STAGES ?= 5
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess making a change to this default is not needed, as we don't really use it.

HL_RANDOM_DROPOUT ?= 100
HL_SEED ?= 0
HL_BEAM_SIZE ?= 1
Expand Down
21 changes: 15 additions & 6 deletions apps/random_pipeline/autotune_loop.sh
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# set -x
set -x

# Install a watchdog to kill benchmarking processes that take too long
bash ./watchdog_bench.sh &
Expand All @@ -8,6 +8,15 @@ function finish {
}
trap finish EXIT

SDK_VER="${SDK_VER:-3.4.1}"
HEXAGON_TOOLS_VER="${HEXAGON_TOOLS_VER:-8.2.07}"
DEBUG_LEVEL="${HL_DEBUG_CODEGEN:-0}"
BATCH_SIZE=1
MAX_STAGES=12
# BATCH_SIZE=32
# MAX_STAGES=5
OUTPUT_FILES="cpp,static_library,h,stmt,assembly,registration"

# Build the generator to autotune.
GENERATOR=./bin/random_pipeline.generator
PIPELINE=random_pipeline
Expand All @@ -27,13 +36,12 @@ mkdir -p weights

# A batch of this many samples is built in parallel, and then
# benchmarked serially. Set to number of cores.
BATCH_SIZE=32

HL_TARGET=x86-64-avx2-disable_llvm_loop_vectorize-disable_llvm_loop_unroll-hvx_128

HEXAGON_SDK_PATH=${HOME}/Qualcomm/Hexagon_SDK
HEXAGON_SDK_ROOT="${HEXAGON_SDK_ROOT:-/local/mnt/workspace/Hexagon_SDK}"
HEXAGON_REMOTE_LIB_PATH=`pwd`/../../src/runtime/hexagon_remote/bin/host/
HEXAGON_LIBS="-L${HEXAGON_SDK_PATH}/3.3.3/tools/HEXAGON_Tools/8.1.05/Tools/lib/iss/ -L${HEXAGON_REMOTE_LIB_PATH} -lhalide_hexagon_host -lwrapper -Wl,-rpath,${HEXAGON_SDK_PATH}/3.3.3/tools/HEXAGON_Tools/8.1.05/Tools/lib/iss/ -Wl,-rpath,${HEXAGON_REMOTE_LIB_PATH}"
# Linker flags for the Hexagon simulator libraries and the remote-host runtime.
# Fix: removed a stray "i" that had crept into the -rpath tools path
# (".../HEXAGON_Tools/i${HEXAGON_TOOLS_VER}/..."), which made the embedded
# rpath point at a nonexistent directory even though the -L search path was
# correct, so the benchmarks would fail to find libwrapper at run time.
HEXAGON_LIBS="-L${HEXAGON_SDK_ROOT}/${SDK_VER}/tools/HEXAGON_Tools/${HEXAGON_TOOLS_VER}/Tools/lib/iss/ -L${HEXAGON_REMOTE_LIB_PATH} -lhalide_hexagon_host -lwrapper -Wl,-rpath,${HEXAGON_SDK_ROOT}/${SDK_VER}/tools/HEXAGON_Tools/${HEXAGON_TOOLS_VER}/Tools/lib/iss/ -Wl,-rpath,${HEXAGON_REMOTE_LIB_PATH}"

# Build a single sample of the pipeline with a random schedule
make_sample() {
Expand All @@ -42,10 +50,10 @@ make_sample() {
rm -f "${D}/sample.sample"
if [[ $D == */0 ]]; then
# Sample 0 in each batch is best effort beam search, with no randomness
HL_MACHINE_PARAMS=32,1,1 HL_PERMIT_FAILED_UNROLL=1 HL_SEED=${2} HL_FEATURE_FILE=${D}/sample.sample HL_WEIGHTS_DIR=${PWD}/weights HL_RANDOM_DROPOUT=100 HL_BEAM_SIZE=20 ${GENERATOR} -g ${PIPELINE} -o ${D} -e static_library,h,stmt,assembly,registration target=${HL_TARGET} auto_schedule=true max_stages=12 seed=${3} -p ${PWD}/bin/libauto_schedule.so 2> ${D}/compile_log_stderr.txt > ${D}/compile_log_stdout.txt
HL_DEBUG_CODEGEN=${DEBUG_LEVEL} HL_MACHINE_PARAMS=32,1,1 HL_PERMIT_FAILED_UNROLL=1 HL_SEED=${2} HL_FEATURE_FILE=${D}/sample.sample HL_WEIGHTS_DIR=${PWD}/weights HL_RANDOM_DROPOUT=100 HL_BEAM_SIZE=20 ${GENERATOR} -g ${PIPELINE} -o ${D} -e ${OUTPUT_FILES} target=${HL_TARGET} auto_schedule=true max_stages=${MAX_STAGES} seed=${3} -p ${PWD}/bin/libauto_schedule.so 2> ${D}/compile_log_stderr.txt > ${D}/compile_log_stdout.txt
else
# The other samples are random probes biased by the cost model
HL_MACHINE_PARAMS=32,1,1 HL_PERMIT_FAILED_UNROLL=1 HL_SEED=${2} HL_FEATURE_FILE=${D}/sample.sample HL_WEIGHTS_DIR=${PWD}/weights HL_RANDOM_DROPOUT=80 HL_BEAM_SIZE=1 ${GENERATOR} -g ${PIPELINE} -o ${D} -e static_library,h,stmt,assembly,registration target=${HL_TARGET} auto_schedule=true max_stages=12 seed=${3} -p ${PWD}/bin/libauto_schedule.so 2> ${D}/compile_log_stderr.txt > ${D}/compile_log_stdout.txt
HL_DEBUG_CODEGEN=${DEBUG_LEVEL} HL_MACHINE_PARAMS=32,1,1 HL_PERMIT_FAILED_UNROLL=1 HL_SEED=${2} HL_FEATURE_FILE=${D}/sample.sample HL_WEIGHTS_DIR=${PWD}/weights HL_RANDOM_DROPOUT=80 HL_BEAM_SIZE=1 ${GENERATOR} -g ${PIPELINE} -o ${D} -e ${OUTPUT_FILES} target=${HL_TARGET} auto_schedule=true max_stages=${MAX_STAGES} seed=${3} -p ${PWD}/bin/libauto_schedule.so 2> ${D}/compile_log_stderr.txt > ${D}/compile_log_stdout.txt
fi

c++ -std=c++11 -I ../../include ../../tools/RunGenMain.cpp ${D}/*.registration.cpp ${D}/*.a -o ${D}/bench -ljpeg -ldl -lpthread -lz -lpng ${HEXAGON_LIBS}
Expand All @@ -65,6 +73,7 @@ benchmark_sample() {
FIRST=$(ls ${SAMPLES} | cut -d_ -f2 | sort -n | tail -n1)

for ((i=$((FIRST+1));i<1000000;i++)); do
# for ((i=$((FIRST+1));i<$((FIRST+2));i++)); do
# Compile a batch of samples using the generator in parallel
DIR=${SAMPLES}/batch_${i}

Expand Down
74 changes: 49 additions & 25 deletions apps/random_pipeline/random_pipeline_generator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ std::mt19937 rng;
// Helpers to generate random values.
// Pseudo-random integer drawn uniformly-ish from [min, max], inclusive, using
// the file-global `rng` engine. (Modulo reduction carries a slight bias; that
// is acceptable here — callers only need variety, not statistical rigor.)
int rand_int(int min, int max) {
    const int span = max - min + 1;
    return min + static_cast<int>(rng() % span);
}
// Coin flip derived from the low bit of the file-global `rng` engine's output.
bool rand_bool() {
    return (rng() & 1u) == 0u;
}
float rand_float() { return rand_int(0, 1 << 30) / (float)(1 << 30); }
// float rand_float() { return rand_int(0, 1 << 30) / (float)(1 << 30); }

// Generate random expressions. Given a vector of expresions and a
// tree depth, recursively generates an expression by combining
Expand Down Expand Up @@ -183,9 +183,11 @@ Expr rand_value(Type t) {
return cast(t, rand_int(0,1));
} else if (t.is_int() || t.is_uint()) {
return cast(t, rand_int(1, 127));
#if 0
} else if (t.is_float()) {
assert(false);
return cast(t, rand_float());
#endif
} else {
// Shouldn't get here.
assert(false);
Expand All @@ -202,13 +204,13 @@ Expr random_expr(vector<Expr> inputs, int depth, int func_size) {
Expr result =
Internal::simplify(Internal::common_subexpression_elimination(random_expr_inner(inputs, depth, func_size)));

class Checker : public Internal::IRMutator2 {
class Checker : public Internal::IRMutator {
public:
Expr mutate(const Expr &e) override {
exprs_to_find.erase(e);
return IRMutator2::mutate(e);
return IRMutator::mutate(e);
}
using Internal::IRMutator2::mutate;
using Internal::IRMutator::mutate;
std::set<Expr, Internal::IRDeepCompare> exprs_to_find;
Checker(const vector<Expr> &inputs) {
for (const auto &e : inputs) {
Expand Down Expand Up @@ -463,7 +465,7 @@ class RandomPipeline : public Halide::Generator<RandomPipeline> {
activation(f.func.args()) = max(cast(output_type, 0), cast(output_type,f.func(coords)));
return {activation, f.w, f.h, f.c};
}

#if 0
Stage tanh_layer(Stage f) {
assert(false);
std::cout << "Tanh\n";
Expand All @@ -478,7 +480,7 @@ class RandomPipeline : public Halide::Generator<RandomPipeline> {
activation(f.func.args()) = (exp_pos - 1) / (exp_pos + 1);
return {activation, f.w, f.h, f.c};
}

#endif
Stage pool2D_unrolled(Stage f, int kernel_min, int kernel_max) {
vector<Var> args = f.func.args();
Func pooled2D("pooled2D" + args[0].name() + args[1].name());
Expand Down Expand Up @@ -514,7 +516,8 @@ class RandomPipeline : public Halide::Generator<RandomPipeline> {
}
}

if (!def.type().is_bool()) {
if (!def.type().is_bool() && def.type().bits() < 32) {
// TODO(aankit): Change scaling
def /= scale;
}

Expand Down Expand Up @@ -548,7 +551,11 @@ class RandomPipeline : public Halide::Generator<RandomPipeline> {
pooled2D_r(args) = const_true();
pooled2D_r(args) = pooled2D_r(args) && f.func(coords);
} else {
pooled2D_r(args) += f.func(coords) / scale;
// TODO(aankit): Change scaling
if (ty.bits() < 32)
pooled2D_r(args) += f.func(coords) / scale;
else
pooled2D_r(args) += f.func(coords);
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In case of >= 32 bit, should we cast it down to 16 bit and retain the division by a constant for now? We would risk truncation, but would not lose the scaling completely.

}

return {pooled2D_r, (f.w + stride - 1) / stride, (f.h + stride - 1) / stride, f.c};
Expand Down Expand Up @@ -577,7 +584,12 @@ class RandomPipeline : public Halide::Generator<RandomPipeline> {
#if 0
pooled2D_w(args) = sum(cast<float>(f.func(coords))) / scale;
#else
pooled2D_w(args) = sum(cast<int32_t>(f.func(coords))) / scale;
// TODO(aankit): Change scaling
Expr val = sum(cast<int32_t>(f.func(coords)));
if (val.type().bits() < 32)
pooled2D_w(args) = val / scale;
else
pooled2D_w(args) = val;
#endif
return {pooled2D_w, (f.w + stride - 1) / stride, (f.h + stride - 1) / stride, f.c};
}
Expand Down Expand Up @@ -730,7 +742,11 @@ class RandomPipeline : public Halide::Generator<RandomPipeline> {
s1 = cast(sum_type, s1);
s2 = cast(sum_type, s2);

resampled(f.func.args()) = cast(input_type, ((factor - x) * s1 + x * s2) / (2*factor));
// TODO(aankit): Change scaling
if (input_type.bits() < 32)
resampled(f.func.args()) = cast(input_type, ((factor - x) * s1 + x * s2) / (2*factor));
else
resampled(f.func.args()) = cast(input_type, ((factor - x) * s1 + x * s2));
}

Stage s {resampled, f.w, f.h, f.c};
Expand Down Expand Up @@ -803,7 +819,7 @@ class RandomPipeline : public Halide::Generator<RandomPipeline> {
binary(f.func.args()) = def;
return {binary, f.w, f.h, std::min(f.c, g.c)};
}

#if 0
Stage unary_op(Stage f) {
std::cout << "Unary op\n";
Func unary("unary_op");
Expand All @@ -822,7 +838,7 @@ class RandomPipeline : public Halide::Generator<RandomPipeline> {
}
return {unary, f.w, f.h, f.c};
}

#endif
// Generate an all-to-all communication in dimension dim,
// statically unrolled. Currently only every applied over the
// channels dimension.
Expand All @@ -832,7 +848,8 @@ class RandomPipeline : public Halide::Generator<RandomPipeline> {
if (f.c > 16) return all_to_all_r(f, dim);

vector<Expr> reduction_coords = make_arguments(f.func.args());
Expr e = 0.f;
// Expr e = 0.f;
Expr e = 0;
for (int i = 0; i < f.c; i++) {
reduction_coords[dim] = i;
e += f.func(reduction_coords) * ((i + 1) * f.c + (f.func.args()[dim] + 1));
Expand Down Expand Up @@ -1037,9 +1054,9 @@ class RandomPipeline : public Halide::Generator<RandomPipeline> {
} else if (stage_type == 12) {
int dim = rand_int(0, 2);
return scan(f, dim);
#if 0 // Uses types not available on HVX
} else if (stage_type == 13 && f.size() < 10000) {
return unary_op(f);
#if 0 // Uses types not available on HVX
} else if (stage_type == 14 && f.w > 32 && f.h > 32) {
return tiled_histogram(f);
#endif
Expand All @@ -1061,33 +1078,40 @@ class RandomPipeline : public Halide::Generator<RandomPipeline> {
Func first;
first(x, y, c) = input(x, y, c);

int W=300;
int H=300;
vector<Stage> stages;
// Assume input starts at ~2000x2000
stages.emplace_back(Stage{first, 2000, 2000, 3});
// stages.emplace_back(Stage{first, 2000, 2000, 3});
stages.emplace_back(Stage{first, W, H, 3});

for (int i = 0; i < max_stages - 2; i++) {
std::cout << "Approx size: " << stages.back().w << ", " << stages.back().h << ", " << stages.back().c << "\n";
Stage next = random_stage(stages);
stages.push_back(next);
if (!auto_schedule) {
stages.back().func.hexagon().compute_root().reorder(x, c, y).vectorize(x, 8).parallel(y, 8);
stages.back().func.compute_root().reorder(x, c, y).vectorize(x, 8).parallel(y, 8);
if (get_target().features_any_of({Target::HVX_64, Target::HVX_128}))
stages.back().func.hexagon();
}
}

Stage tail = stages.back();

// Resample back to the correct resolution
tail = resample_to(tail, 2000, 2000, 3);
// tail = resample_to(tail, 2000, 2000, 3);
tail = resample_to(tail, W, H, 3);
Stage casted = cast_stage(output.type(), tail);
output = casted.func;

if (!auto_schedule) {
output.hexagon().compute_root().reorder(x, c, y).vectorize(x, 8).parallel(y);
output.compute_root().reorder(x, c, y).vectorize(x, 8).parallel(y);
if (get_target().features_any_of({Target::HVX_64, Target::HVX_128}))
output.hexagon();
}

if (auto_schedule) {
input.dim(0).set_bounds_estimate(0, 2000)
.dim(1).set_bounds_estimate(0, 2000)
input.dim(0).set_bounds_estimate(0, W)
.dim(1).set_bounds_estimate(0, H)
.dim(2).set_bounds_estimate(0, 3);
uint8_weights.dim(0).set_bounds_estimate(0, 512)
.dim(1).set_bounds_estimate(-5, 5)
Expand Down Expand Up @@ -1120,12 +1144,12 @@ class RandomPipeline : public Halide::Generator<RandomPipeline> {
.dim(3).set_bounds_estimate(0, 512);
#endif

output.estimate(output.args()[0], 0, 2000);
output.estimate(output.args()[1], 0, 2000);
output.estimate(output.args()[0], 0, W);
output.estimate(output.args()[1], 0, H);
output.estimate(output.args()[2], 0, 3);

output.dim(0).set_bounds_estimate(0, 2000);
output.dim(1).set_bounds_estimate(0, 2000);
output.dim(0).set_bounds_estimate(0, W);
output.dim(1).set_bounds_estimate(0, H);
output.dim(2).set_bounds_estimate(0, 3);
}
}
Expand Down
12 changes: 3 additions & 9 deletions apps/random_pipeline/test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ void *my_malloc(void *ucon, size_t sz) {
}

int main(int argc, char **argv) {
Buffer<float> output(2000, 2000, 3);
Buffer<int32_t> output(300, 300, 3);

for (int y = 0; y < output.height(); y++) {
for (int x = 0; x < output.width(); x++) {
Expand All @@ -37,14 +37,13 @@ int main(int argc, char **argv) {
}
}

Buffer<float> input;
Buffer<int32_t> input;
Buffer<uint8_t> uint8_weights;
Buffer<uint16_t> uint16_weights;
Buffer<uint32_t> uint32_weights;
Buffer<int8_t> int8_weights;
Buffer<int16_t> int16_weights;
Buffer<int32_t> int32_weights;
Buffer<float> float32_weights;

assert(input.is_bounds_query());
assert(uint8_weights.is_bounds_query());
Expand All @@ -53,7 +52,6 @@ int main(int argc, char **argv) {
assert(int8_weights.is_bounds_query());
assert(int16_weights.is_bounds_query());
assert(int32_weights.is_bounds_query());
assert(float32_weights.is_bounds_query());

random_pipeline(input,
uint8_weights,
Expand All @@ -62,11 +60,10 @@ int main(int argc, char **argv) {
int8_weights,
int16_weights,
int32_weights,
float32_weights,
output);

input.allocate();
input.fill(0.0f);
input.fill(0);
uint8_weights.allocate();
uint8_weights.fill(0);
uint16_weights.allocate();
Expand All @@ -79,8 +76,6 @@ int main(int argc, char **argv) {
int16_weights.fill(0);
int32_weights.allocate();
int32_weights.fill(0);
float32_weights.allocate();
float32_weights.fill(0);

printf("Input size: %d %d %d\n", input.width(), input.height(), input.channels());

Expand All @@ -95,7 +90,6 @@ int main(int argc, char **argv) {
int8_weights,
int16_weights,
int32_weights,
float32_weights,
output);
}, config);
printf("Time: %g\n", best * 1e3);
Expand Down
3 changes: 3 additions & 0 deletions src/IRVisitor.cpp
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
#include "IRVisitor.h"
#include "IRPrinter.h"

namespace Halide {
namespace Internal {

// NOTE(review): file-scope IRPrinter bound to std::cerr. Nothing visible in
// this change references `irp`, so it looks like leftover debug scaffolding
// from development — confirm it is still needed before landing (it is also
// the only reason for the newly added IRPrinter.h include).
Halide::Internal::IRPrinter irp(std::cerr);

// Out-of-line (empty) destructor for the visitor base class.
// NOTE(review): presumably kept out-of-line to anchor the class's vtable
// emission in this translation unit — confirm against IRVisitor.h before
// ever inlining or defaulting it in the header.
IRVisitor::~IRVisitor() {
}

Expand Down
3 changes: 2 additions & 1 deletion src/Pipeline.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,8 @@ string Pipeline::auto_schedule(const Target &target, const MachineParams &arch_p
}

user_assert(target.arch == Target::X86 || target.arch == Target::ARM ||
target.arch == Target::POWERPC || target.arch == Target::MIPS)
target.arch == Target::POWERPC || target.arch == Target::MIPS ||
target.arch == Target::Hexagon)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since we are proceeding with offload mode first, Target::Hexagon is not supported here yet.

<< "Automatic scheduling is currently supported only on these architectures.";
return generate_schedules(contents->outputs, target, arch_params);
}
Expand Down
2 changes: 1 addition & 1 deletion src/runtime/hexagon_remote/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ CCFLAGS-arm-32-android = --sysroot ${ANDROID_NDK_ROOT}/platforms/android-21/arch
CCFLAGS-host := ${CCFLAGS} -I../ -I ${HEXAGON_TOOLS_ROOT}/Tools/include/iss/ -fPIC \
-L${HEXAGON_TOOLS_ROOT}/Tools/lib/iss/ -lwrapper

CCFLAGS-v60 := $(CCFLAGS-v60) -I ${HEXAGON_SDK_LIBS}/common/rtld/ship/hexagon_Release_toolv80_v62 ${COMMON_CCFLAGS} -I ${HEXAGON_SDK_INCLUDES} -I ${HEXAGON_SDK_LIBS}/common/qurt/ADSPv60MP/include
CCFLAGS-v60 := $(CCFLAGS-v60) -I ${HEXAGON_SDK_LIBS}/common/rtld/ship/hexagon_Release_toolv80_v62 ${COMMON_CCFLAGS} -I ${HEXAGON_SDK_INCLUDES} -I ${HEXAGON_SDK_LIBS}/common/qurt/ADSPv60MP/include/qurt -mhvx -mhvx-length=128B

CCFLAGS-arm-64-android := $(CCFLAGS-arm-64-android) ${COMMON_CCFLAGS} -llog -fPIE -pie
CCFLAGS-arm-32-android := $(CCFLAGS-arm-32-android) ${COMMON_CCFLAGS} -llog -fPIE -pie
Expand Down