#include <stdio.h>
#include "Halide.h"
#include "clock.h"
#include "halide_image_io.h"
using namespace Halide::Tools;
// Halide variables shared by the pipeline definition (x, y, c) and by the
// scheduling calls that split or tile dimensions (i, xo, yo, xi, yi).
Var x, y, c, i, ii, xo, yo, xi, yi;
// Groups the Funcs of one image pipeline with the methods that schedule,
// benchmark and verify it.
class MyPipeline {
public:
// Pipeline stages. Only padded, sharpen and curved receive a definition in the
// constructor body visible below.
Func lut, padded, padded16, sharpen, curved;
// NOTE(review): the constructor's signature line and the declaration of the
// `input` member are not visible here; only the initializer list remains.
// Presumably the constructor takes the source image as `in` — confirm.
: input(in) {
// Clamp the coordinates to the image extents so that out-of-range reads stay
// inside the input.
padded(x, y, c) = input(
clamp(x, 0, input.width() - 1),
clamp(y, 0, input.height() - 1), c);
// Twice the centre sample minus the average of its four axis-aligned
// neighbours.
// NOTE(review): padded16 is read here but no definition of it is visible in
// this block — confirm it is defined before the pipeline is realized.
sharpen(x, y, c) = (padded16(x, y, c) * 2 -
(padded16(x - 1, y, c) +
padded16(x, y - 1, c) +
padded16(x + 1, y, c) +
padded16(x, y + 1, c)) /
4);
// Look each sharpened value up in lut.
// NOTE(review): lut has no visible definition either — confirm.
curved(x, y, c) = lut(sharpen(x, y, c));
}
// Applies the CPU schedule to the pipeline.
//
// Splits curved's y dimension into an outer loop yo and an inner loop yi with
// a factor of 16.
// NOTE(review): the original call chain had no terminating `;`, so further
// scheduling directives may have been lost — confirm against the intended
// schedule.
void schedule_for_cpu() {
    curved.split(y, yo, yi, 16);
}
bool schedule_for_gpu() {
Target target = find_gpu_target();
return false;
}
Var block, thread;
lut.
split(i, block, thread, 16);
curved.
gpu_tile(x, y, xo, yo, xi, yi, 8, 8);
printf(
"Target: %s\n", target.
to_string().c_str());
return true;
}
void test_performance() {
Buffer<uint8_t> output(input.width(), input.height(), input.channels());
double best_time = 0.0;
for (int i = 0; i < 3; i++) {
double t1 = current_time();
for (int j = 0; j < 100; j++) {
}
output.copy_to_host();
double t2 = current_time();
double elapsed = (t2 - t1) / 100;
if (i == 0 || elapsed < best_time) {
best_time = elapsed;
}
}
printf("%1.4f milliseconds\n", best_time);
}
// Realizes curved over the full extent of the input and compares every sample
// with reference_output. On the first difference (scanning x fastest, then y,
// then c) it prints both values with their coordinates and exits with
// status 1. Returns normally when all samples match.
void test_correctness(Buffer<uint8_t> reference_output) {
    const int width = input.width();
    const int height = input.height();
    const int channels = input.channels();

    Buffer<uint8_t> output = curved.realize({width, height, channels});

    for (int ch = 0; ch < channels; ch++) {
        for (int row = 0; row < height; row++) {
            for (int col = 0; col < width; col++) {
                const int actual = output(col, row, ch);
                const int expected = reference_output(col, row, ch);
                if (actual == expected) {
                    continue;
                }
                printf("Mismatch between output (%d) and "
                       "reference output (%d) at %d, %d, %d\n",
                       actual, expected, col, row, ch);
                exit(1);
            }
        }
    }
}
};
int main(int argc, char **argv) {
Buffer<uint8_t> reference_output(input.width(), input.height(), input.channels());
printf("Running pipeline on CPU:\n");
MyPipeline p1(input);
p1.schedule_for_cpu();
p1.curved.realize(reference_output);
printf("Running pipeline on GPU:\n");
MyPipeline p2(input);
bool has_gpu_target = p2.schedule_for_gpu();
if (has_gpu_target) {
printf("Testing GPU correctness:\n");
p2.test_correctness(reference_output);
} else {
printf("No GPU target available on the host\n");
}
printf("Testing performance on CPU:\n");
p1.test_performance();
if (has_gpu_target) {
printf("Testing performance on GPU:\n");
p2.test_performance();
}
return 0;
}
// NOTE(review): orphaned fragment — the enclosing function's signature and most
// of its body are not visible here. It is presumably the tail of
// find_gpu_target(), which schedule_for_gpu() calls and which has no other
// visible definition — confirm against the complete file.
std::vector<Target::Feature> features_to_try;
// Branches on pointer size (8 bytes vs. anything else); the bodies of both
// branches are empty in the visible code.
if (sizeof(void*) == 8) {
}
} else {
}
return new_target;
}
}
// Reached when no earlier `return new_target` was taken: print a diagnostic
// and return `target`.
printf("Requested GPU(s) are not supported. (Do you have the proper hardware and/or driver installed?)\n");
return target;
}
A Halide::Buffer is a named shared reference to a Halide::Runtime::Buffer.
Func & gpu_blocks(const VarOrRVar &block_x, DeviceAPI device_api=DeviceAPI::Default_GPU)
Tell Halide that the following dimensions correspond to GPU block indices.
void compile_jit(const Target &target=get_jit_target_from_environment())
Eagerly jit compile the function to machine code.
Func & reorder(const std::vector< VarOrRVar > &vars)
Reorder variables to have the given nesting order, from innermost out.
Func & split(const VarOrRVar &old, const VarOrRVar &outer, const VarOrRVar &inner, const Expr &factor, TailStrategy tail=TailStrategy::Auto)
Split a dimension into inner and outer subdimensions with the given names, where the inner dimension ...
Func & compute_root()
Compute all of this function once ahead of time.
Realization realize(std::vector< int32_t > sizes={}, const Target &target=Target(), const ParamMap &param_map=ParamMap::empty_map())
Evaluate this function over some rectangular domain and return the resulting buffer or buffers.
Func & unroll(const VarOrRVar &var)
Mark a dimension to be completely unrolled.
Func & store_at(const Func &f, const Var &var)
Allocate storage for this function within f's loop over var.
Func & gpu_threads(const VarOrRVar &thread_x, DeviceAPI device_api=DeviceAPI::Default_GPU)
Tell Halide that the following dimensions correspond to GPU thread indices.
Func & parallel(const VarOrRVar &var)
Mark a dimension to be traversed in parallel.
Func & vectorize(const VarOrRVar &var)
Mark a dimension to be computed all-at-once as a single vector.
Func & bound(const Var &var, Expr min, Expr extent)
Statically declare that the range over which a function should be evaluated is given by the second an...
Func & compute_at(const Func &f, const Var &var)
Compute this function as needed for each unique value of the given var for the given calling function...
Func & gpu_tile(const VarOrRVar &x, const VarOrRVar &bx, const VarOrRVar &tx, const Expr &x_size, TailStrategy tail=TailStrategy::Auto, DeviceAPI device_api=DeviceAPI::Default_GPU)
Short-hand for tiling a domain and mapping the tile indices to GPU block indices and the coordinates ...
A Halide variable, to be used when defining functions.
This file defines the class FunctionDAG, which is our representation of a Halide pipeline,...
Target get_host_target()
Return the target corresponding to the host machine.
bool host_supports_target_device(const Target &t)
This attempts to sniff whether a given Target (and its implied DeviceAPI) is usable on the current ho...
Expr clamp(Expr a, const Expr &min_val, const Expr &max_val)
Clamps an expression to lie within the given bounds.
Expr pow(Expr x, Expr y)
Return one floating point expression raised to the power of another.
Expr cast(Expr a)
Cast an expression to the halide type corresponding to the C++ type T.
A struct representing a target machine and os to generate code for.
bool has_gpu_feature() const
Is a fully featured GPU compute runtime enabled?
enum Halide::Target::OS os
std::string to_string() const
Convert the Target into a string form that can be reconstituted by merge_string(),...
Feature
Optional features a target can have.
Target with_feature(Feature f) const
Return a copy of the target with the given feature set.