diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 00000000..265a485f --- /dev/null +++ b/.gitattributes @@ -0,0 +1,2 @@ + +intro.mp4 filter=lfs diff=lfs merge=lfs -text diff --git a/ACKNOWLEDGEMENT.md b/ACKNOWLEDGEMENT.md index bb3f002e..f9626ce6 100644 --- a/ACKNOWLEDGEMENT.md +++ b/ACKNOWLEDGEMENT.md @@ -17,14 +17,14 @@ Many academic researchers have contributed to the project: Many Intel people have helped enabling the technology: -+ System Software Group: Geoff Lowney, John C. Kreatsoulas, Nithin George, Paul Petersen, Gorge Powley, Daya Khudia, Charlotte Dryden, Adam Herr ++ System Software Group: Geoff Lowney, John C. Kreatsoulas, Nithin George, Paul Petersen, Gorge Powley, Carmen Badea, Daya Khudia, Adam Herr, Charlotte Dryden, Pablo Reble, Vishakha Agrawal, Mike Voss, Vasanth Tovinkere + Intel Labs: Christopher J. Hughes, Pradeep Dubey, Jim Held, Timothy Mattson, Sanket Tavarageri, Kunal Banerjee, Bharat Kaul, Justin Gottschlich, Todd A. Anderson, Michael Beale -+ Accelerated Computing Systems and Graphics Group: Hong Jiang, Kari Pulli, Lidong Xu, Fangwen Fu, Hongzheng Li, Sabareesh Ganapathy ++ Accelerated Computing Systems and Graphics Group: Hong Jiang, Lidong Xu, Kari Pulli, Fangwen Fu, Hongzheng Li, Sabareesh Ganapathy + GPU Software Engineering Group: Kai Yu Chen, Guei-Yuan Lueh, Yuting Yang -+ Programmable Solution Group: Jose Alvarez, Bernhard Friebe, Mohamed Issa, Aravind Dasu, John Freeman, Davor Capalija, Tomasz Czajkowski ++ Programmable Solution Group: Jose Alvarez, Bernhard Friebe, Dan Prikster, Mohamed Issa, Aravind Dasu, John Freeman, Gordon Chiu, Davor Capalija, Tomasz Czajkowski, Andrei Hagiescu + Intel FPGA DevCloud: Lawrence Landis, Jimmy Tran \ No newline at end of file diff --git a/Halide/Makefile b/Halide/Makefile index 945d601a..daf66b2c 100644 --- a/Halide/Makefile +++ b/Halide/Makefile @@ -1,3 +1,4 @@ + # 'make' builds libHalide.a, the internal test suite, and runs the internal test suite # 'make run_tests' builds and runs all the end-to-end tests in the test subdirectory # 'make {error,performance}_foo' builds and runs test/{...}/foo.cpp for any @@ -136,7 +137,7 @@ ifeq ($(OS), Windows_NT) else WITH_INTROSPECTION ?= not-empty endif -WITH_EXCEPTIONS ?= +WITH_EXCEPTIONS ?= 1 WITH_LLVM_INSIDE_SHARED_LIBHALIDE ?= not-empty WITH_V8 ?= @@ -796,6 +797,7 @@ T2S_DIR = $(ROOT_DIR)/../t2s T2S_SOURCE_FILES = \ AutorunKernels.cpp \ BuildCallRelation.cpp \ + ChannelPromotion.cpp \ CheckFuncConstraints.cpp \ CheckRecursiveCalls.cpp \ CombineChannels.cpp \ @@ -814,14 +816,15 @@ T2S_SOURCE_FILES = \ MinimizeShregs.cpp \ NoIfSimplify.cpp \ Overlay.cpp \ + PatternMatcher.cpp \ Place.cpp \ PreprocessBeforeLower.cpp \ ScatterAndBuffer.cpp \ SliceExprTree.cpp \ SpaceTimeTransform.cpp \ + Stensor.cpp \ StructType.cpp \ - Utilities.cpp \ - roofline.cpp + Utilities.cpp T2S_HEADER_FILES = \ AutorunKernels.h \ @@ -841,14 +844,15 @@ T2S_HEADER_FILES = \ MinimizeShregs.h \ NoIfSimplify.h \ Overlay.h \ + PatternMatcher.h \ Place.h \ PreprocessBeforeLower.h \ ScatterAndBuffer.h \ SliceExprTree.h \ SpaceTimeTransform.h \ + Stensor.h \ StructType.h \ - Utilities.h \ - roofline.h + Utilities.h OBJECTS += $(T2S_SOURCE_FILES:%.cpp=$(BUILD_DIR)/t2s/%.o) HEADERS += $(T2S_HEADER_FILES:%.h=$(T2S_DIR)/src/%.h) diff --git a/Halide/apps/fft/complex.h b/Halide/apps/fft/complex.h index 124adbb0..013f0057 100644 --- a/Halide/apps/fft/complex.h +++ b/Halide/apps/fft/complex.h @@ -68,6 +68,9 @@ inline Halide::Expr im(Halide::Expr x) { inline ComplexExpr conj(ComplexExpr z) { 
return ComplexExpr(re(z), -im(z)); } +inline Halide::Expr conj(Halide::Expr x) { + return x; +} // Unary negation. inline ComplexExpr operator-(ComplexExpr z) { @@ -106,6 +109,10 @@ inline ComplexExpr operator/(ComplexExpr a, Halide::Expr b) { return ComplexExpr(re(a) / b, im(a) / b); } +inline ComplexExpr operator/(ComplexExpr a, ComplexExpr b) { + return a * conj(b) / (re(b)*re(b) + im(b)*im(b)); +} + // Compute exp(j*x) inline ComplexExpr expj(Halide::Expr x) { return ComplexExpr(Halide::cos(x), Halide::sin(x)); @@ -116,9 +123,14 @@ inline ComplexExpr sum(ComplexExpr z, const std::string &s = "sum") { return ComplexExpr(Halide::sum(re(z), s + "_re"), Halide::sum(im(z), s + "_im")); } + inline ComplexExpr select(Halide::Expr c, ComplexExpr t, ComplexExpr f) { - return ComplexExpr(Halide::select(c, re(t), re(f)), - Halide::select(c, im(t), im(f))); + Halide::Expr re_part=Halide::select(c, re(t), re(f)); + Halide::Expr im_part=Halide::select(c, im(t), im(f)); + return ComplexExpr(re_part, im_part); +} +inline ComplexExpr select(Halide::Expr c, ComplexExpr t) { + return ComplexExpr(Halide::select(c, re(t)), Halide::select(c, im(t))); } inline ComplexExpr select(Halide::Expr c1, ComplexExpr t1, Halide::Expr c2, ComplexExpr t2, @@ -136,5 +148,18 @@ inline ComplexExpr cast(Halide::Type type, ComplexExpr z) { inline ComplexExpr likely(ComplexExpr z) { return ComplexExpr(Halide::likely(re(z)), Halide::likely(im(z))); } +template +inline T select(Halide::Expr c, ComplexExpr t, FuncRefT f){ + return select(c,t,f.toT()); +} +template +inline T select(Halide::Expr c, FuncRefT t, ComplexExpr f){ + return select(c,t.toT(),f); +} +template +inline T select(Halide::Expr c, FuncRefT t, FuncRefT f){ + return select(c,t.toT(),f.toT()); +} + #endif diff --git a/Halide/apps/fft/funct.h b/Halide/apps/fft/funct.h index 79d558d4..2d06e6a8 100644 --- a/Halide/apps/fft/funct.h +++ b/Halide/apps/fft/funct.h @@ -5,7 +5,9 @@ #include #include "Halide.h" +#include "../../../../Halide/src/Func.h" +struct ComplexExpr; template class FuncRefT : public T { Halide::FuncRef untyped; @@ -34,14 +36,23 @@ class FuncRefT : public T { Stage operator/=(T x) { return untyped = T(Tuple(untyped)) / x; } + T toT(){ + if(typeid(ComplexExpr)==typeid(T)){ + return ComplexExpr(Halide::Call::make(untyped.function(), untyped.arguments(), 0), Halide::Call::make(untyped.function(), untyped.arguments(), 1)); + } + } }; + template class FuncT : public Halide::Func { public: typedef Halide::Var Var; typedef Halide::Expr Expr; typedef Halide::Func Func; + typedef Halide::Place Place; + typedef Halide::Type Type; + typedef Halide::FuncRef FuncRef; explicit FuncT(const std::string &name) : Func(name) { @@ -54,10 +65,26 @@ class FuncT : public Halide::Func { explicit FuncT(Func f) : Func(f) { } + + explicit FuncT(Place place) + : Func(place) { + } + explicit FuncT(const std::string &name, Place place) + : Func(name, place) { + } + explicit FuncT(Halide::Internal::Function f) : Func(f) { } + explicit FuncT(const std::vector &return_types, const std::vector &args, Place place) + : Func(return_types, args, place) { + } + + explicit FuncT(const std::string &name, const std::vector &return_types, const std::vector &args, Place place) + : Func(name, return_types, args, place) { + } + template FuncRefT operator()(Args &&... args) const { return Func::operator()(std::forward(args)...); @@ -78,7 +105,7 @@ class FuncT : public Halide::Func { // there is one. 
template T operator-(FuncRefT x) { - return -static_cast(x); + return -x.toT(); } template T operator~(FuncRefT x) { @@ -87,43 +114,64 @@ T operator~(FuncRefT x) { template T operator+(FuncRefT a, T b) { - return static_cast(a) + b; + return a.toT()+b; } template T operator-(FuncRefT a, T b) { - return static_cast(a) - b; + return a.toT()-b; } template T operator*(FuncRefT a, T b) { - return static_cast(a) * b; + return a.toT()*b; } template T operator/(FuncRefT a, T b) { - return static_cast(a) / b; + return a.toT()/b; } template T operator%(FuncRefT a, T b) { - return static_cast(a) % b; + return a.toT()%b; } template T operator+(T a, FuncRefT b) { - return a + static_cast(b); + return a+b.toT(); } template T operator-(T a, FuncRefT b) { - return a - static_cast(b); + return a-b.toT(); } template T operator*(T a, FuncRefT b) { - return a * static_cast(b); + return a*b.toT(); } template T operator/(T a, FuncRefT b) { - return a / static_cast(b); + return a/b.toT(); } template T operator%(T a, FuncRefT b) { - return a % static_cast(b); + return a%b.toT(); +} + +template +T operator+(FuncRefT a, FuncRefT b) { + return a.toT()+b.toT(); +} +template +T operator-(FuncRefT a, FuncRefT b) { + return a.toT()-b.toT(); +} +template +T operator*(FuncRefT a, FuncRefT b) { + return a.toT()*b.toT(); +} +template +T operator/(FuncRefT a, FuncRefT b) { + return a.toT()/b.toT(); +} +template +T operator%(FuncRefT a, FuncRefT b) { + return a.toT()%b.toT(); } template @@ -175,4 +223,4 @@ Halide::Expr operator>(T a, FuncRefT b) { return a > static_cast(b); } -#endif \ No newline at end of file +#endif diff --git a/Halide/src/CodeGen_C.cpp b/Halide/src/CodeGen_C.cpp index 9c123725..23326e23 100644 --- a/Halide/src/CodeGen_C.cpp +++ b/Halide/src/CodeGen_C.cpp @@ -1597,7 +1597,7 @@ class GatherKernelInfo : public IRVisitor { CodeGen_C* parent; public: vector kernel_names; - + GatherKernelInfo() {} void visit(const For *op) override { @@ -1715,7 +1715,7 @@ void CodeGen_C::compile(const Module &input) { f.body.accept(&g); } - stream << "int MAX_DEVICES = 4;\n" + stream << "int MAX_DEVICES = 4;\n" << "int NUM_QUEUES_TO_CREATE = " << g.kernel_names.size() << ";\n" << "int NUM_KERNELS_TO_CREATE = " << g.kernel_names.size() << ";\n" << "cl_int status;\n" @@ -1724,13 +1724,13 @@ void CodeGen_C::compile(const Module &input) { << "cl_device_id devices[4];\n" << "int current_kernel = 0;\n" << "cl_kernel kernel[" << g.kernel_names.size() << "];\n\n"; - + stream << "const char *kernel_name[] = {\n"; for (auto name : g.kernel_names) { stream << " \"" << name << "\",\n"; } stream << "};\n"; - + } for (const auto &b : input.buffers()) { @@ -2803,7 +2803,7 @@ void CodeGen_C::visit(const For *op) { stream << "sizeof(cl_mem), " << "(void *)&((device_handle *)_halide_buffer_get_device(" << print_name(arg.name + ".buffer") << "))->mem"; } else { - stream << "sizeof(" << print_type(arg.type) << "), " + stream << "sizeof(" << print_type(arg.type) << "), " << "(void *)&" << arg.name; } stream << ");\n" diff --git a/Halide/src/CodeGen_OpenCL_Dev.cpp b/Halide/src/CodeGen_OpenCL_Dev.cpp index dc75f7b9..5f7c41d6 100644 --- a/Halide/src/CodeGen_OpenCL_Dev.cpp +++ b/Halide/src/CodeGen_OpenCL_Dev.cpp @@ -266,27 +266,40 @@ void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const For *loop) { stream << get_indent() << "#pragma unroll\n"; CodeGen_C::visit(loop); } else { - /* - If not explicitly define a env variable(DELAYUNROLL or PRAGMAUNROLL), the unrolling strategy will be automatically + /* + If not explicitly define a env 
variable(DELAYUNROLL or PRAGMAUNROLL), the unrolling strategy will be automatically decided by the compiler. - To physically unroll a loop, we require that: - 1. there is a conditional execution of channel read/write inside the current loop, e.g. + To physically unroll a loop, we require that: + 1. there is a conditional execution of channel read/write inside the current loop, e.g. unrolled k = 0 to K if (cond) read/write channel - 2. there is a irregular loop inside the current loop and the irregular bound depends on current loop var, e.g. + 2. there is a irregular loop inside the current loop and the irregular bound depends on current loop var, e.g. unrolled k = 0 to K unrolled j = k to J + 3. there is a shift register whose bounds depends on current loop var, e.g., + float _Z_shreg_0, _Z_shreg_1, _Z_shreg_2, _Z_shreg_3; + unrolled j = 0 to J + access _Z_shreg_j // j needs to be replaced with 0, 1, 2, 3 - For other cases, we simply insert the #pragma unroll directive before a loop. The offline compiler attempts to fully + For other cases, we simply insert the #pragma unroll directive before a loop. The offline compiler attempts to fully unroll the loop. */ user_assert(loop->for_type != ForType::Parallel) << "Cannot use parallel loops inside OpenCL kernel\n"; if (loop->for_type == ForType::Unrolled) { CheckConditionalChannelAccess checker(this, loop->name); loop->body.accept(&checker); - if (checker.conditional_access || checker.irregular_loop_dep || - !is_const(loop->min) || !is_const(loop->extent)) { + + // Check the condition 1 and 2 + bool needs_unrolling = checker.conditional_access || checker.irregular_loop_dep; + // Check the condition 3 + for (auto &kv : space_vars) { + for (auto &v : kv.second) { + if (v.as()->name == loop->name) + needs_unrolling |= true; + } + } + if (needs_unrolling) { Expr extent = simplify(loop->extent); Stmt body = loop->body; const IntImm *e = extent.as(); @@ -337,19 +350,18 @@ void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::CheckConditionalChannelAccess::visit( if (!is_const(op->min)) { irregular_loop_dep = true; debug(4) << "Loop min: " << op->min << "\n"; - debug(4) << "Physically unroll loop " << current_loop_name << " because of the irregular loop " - << op->name << " inside. \n"; + debug(4) << "Physically unroll loop " << current_loop_name << " because of the irregular loop " + << op->name << " inside. \n"; } else if (!is_const(op->extent)) { irregular_loop_dep = true; debug(4) << "Loop extent: " << op->extent << "\n"; - debug(4) << "Physically unroll loop " << current_loop_name << " because of the irregular loop " - << op->name << " inside. \n"; + debug(4) << "Physically unroll loop " << current_loop_name << " because of the irregular loop " + << op->name << " inside. 
\n"; } } IRVisitor::visit(op); } - void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const Ramp *op) { string id_base = print_expr(op->base); string id_stride = print_expr(op->stride); @@ -483,14 +495,12 @@ void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const Call *op) { } } debug(4) << "modified channel name: " << channel_name << "\n"; - // if ((int)suffix.size() > 0 && suffix[0] >= '0' && suffix[0] <= '9') { - // channel_name = (v->value).substr(0, size); - // } - stream << get_indent() << print_type(op->type) << " " << id << " = read_channel_intel(" << print_name(channel_name) << string_channel_index << ");\n"; - // if(starts_with(channel_name,"A_feeder")) - // { - // // stream<<"printf(\"read success\\n\");\n"; - // } + string type = print_type(op->type); + if (op->type.is_handle() && !op->type.is_generated_struct()) { + type = print_name(channel_name + ".array.t"); + } + string read_call = "read_channel_intel(" + print_name(channel_name) + string_channel_index + ")"; + stream << get_indent() << type << " " << id << " = " << read_call << ";\n"; } else if (op->is_intrinsic(Call::read_channel_nb)) { std::string string_channel_index; const StringImm *v = op->args[0].as(); @@ -601,6 +611,32 @@ void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const Call *op) { std::string write_data = print_expr(op->args[1]); rhs << write_data; stream << get_indent() << print_name(write_success->value) << " = write_channel_nb_intel(" << rhs.str() << ");\n"; + } else if (op->is_intrinsic(Call::read_array)) { + std::string arr_name = op->args[0].as()->value; + // read the entire array as a whole + if (op->args.size() == 1) { + id = print_name(arr_name); + } else { + std::string string_index = ".s"; + for (size_t i = 1; i < op->args.size(); i++) + string_index += "[" + print_expr(op->args[i]) + "]"; + id = '_' + unique_name('_'); + stream << get_indent() << print_type(op->type) << " " << id + <<" = " << print_name(arr_name) << string_index << ";\n"; + } + } else if (op->is_intrinsic(Call::write_array)) { + std::string arr_name = op->args[0].as()->value; + // write the entire array as a whole + if (op->args.size() == 2) { + std::string write_data = print_expr(op->args[1]); + stream << get_indent() << print_name(arr_name) << " = " << write_data << ";\n"; + } else { + std::string write_data = print_expr(op->args[1]); + std::string string_index = ".s"; + for (size_t i = 2; i < op->args.size(); i++) + string_index += "[" + print_expr(op->args[i]) + "]"; + stream << get_indent() << print_name(arr_name) << string_index << " = " << write_data << ";\n"; + } } else if (op->is_intrinsic(Call::read_shift_reg)) { std::string string_index; const StringImm *v = op->args[0].as(); @@ -682,7 +718,6 @@ void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const Call *op) { } } ostringstream rhs; - int size = (v->value).rfind("."); std::string shreg_name = v->value; debug(4) << "shreg name: " << shreg_name << "\n"; @@ -707,14 +742,12 @@ void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const Call *op) { rhs << print_name(shreg_name) << suffix_index << string_index; print_assignment(op->type, rhs.str()); } - } else if (op->is_intrinsic(Call::write_shift_reg)) { const StringImm *v = op->args[0].as(); std::string reg_name = extract_first_token(v->value); // shift reg has regular bounds if (space_vars.find(reg_name) == space_vars.end()) { ostringstream rhs; - int size = (v->value).rfind("."); std::string shreg_name = v->value; debug(4) << "shreg name: " << shreg_name << "\n"; @@ -759,7 +792,7 @@ void 
CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const Call *op) { } std::string write_data = print_expr(op->args[op->args.size()-1]); - // After writing to a shift register, the original cached value is no longer valid. + // After writing to a shift register, the original cached value is no longer valid. auto cached = cache.find(rhs.str()); if (cached != cache.end()) { cache.erase(cached); @@ -832,7 +865,7 @@ void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const Call *op) { } print_assignment(op->type, rhs.str()); } else if (ends_with(op->name, ".temp")) { - std::string name = op->name; + std::string name = op->name; // Do not directly print to stream: there might have been a cached value useable. ostringstream rhs; rhs << print_name(name); @@ -870,14 +903,18 @@ void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const Call *op) { } } stream << get_indent() << print_expr(op->args[0]) << ";\n"; - stream << get_indent() << addr_temp << " = " << addr_temp << " + " << offset << ";\n"; - } else if (op->is_intrinsic(Call::overlay_switch)) { + stream << get_indent() << addr_temp << " = " << addr_temp << " + " << offset << ";\n"; + } else if (op->is_intrinsic(Call::fpga_reg)) { + ostringstream rhs; + rhs << "__fpga_reg(__fpga_reg(" << print_expr(op->args[0]) << "))"; + print_assignment(op->type, rhs.str()); + } else if (op->is_intrinsic(Call::overlay_switch)) { internal_assert(op->args.size() > 0); // Prepare task object to be dispatched internal_assert(op->args[0].as()); - std::string type = op->args[0].as()->value; + std::string type = op->args[0].as()->value; if (type == "before_switch") { ostringstream rhs; @@ -917,14 +954,14 @@ void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const Call *op) { internal_assert(op->args.size() > 1); internal_assert(op->args[1].as()); int queue_index = op->args[1].as()->value; - // stream << get_indent() + // stream << get_indent() // << "mem_fence(CLK_CHANNEL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);\n"; - // stream << get_indent() << "index_t f" << queue_index + // stream << get_indent() << "index_t f" << queue_index // << " = read_channel_intel(q" << queue_index << "_end);\n"; stream << get_indent() << "mem_fence(CLK_CHANNEL_MEM_FENCE);\n"; stream << get_indent() << "write_channel_intel(q_ret[" << queue_index << "], inputs.finish);\n"; - // Print data from task channel for autorun kernels + // Print data from task channel for autorun kernels } else if (type == "data") { ostringstream rhs; internal_assert(op->args[1].as()); @@ -982,7 +1019,7 @@ void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const Call *op) { }; int dep_queue_index = -1; - std::map info; + std::map info; std::vector inputs; auto task_id = op->args[0].as()->value; @@ -1030,11 +1067,11 @@ void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const Call *op) { } } - // Print task dispatching logic + // Print task dispatching logic int dep_num = 0; - stream << get_indent() << "// Sending task" << task_id << " to the scheduler...\n"; - stream << get_indent() << "task.queue = " << queueNo << ";\n"; - stream << get_indent() << "task.index.task_id = " << task_id << ";\n\n"; + stream << get_indent() << "// Sending task" << task_id << " to the scheduler...\n"; + stream << get_indent() << "task.queue = " << queueNo << ";\n"; + stream << get_indent() << "task.index.task_id = " << task_id << ";\n\n"; int cond_num = 0; ostringstream temp; @@ -1050,7 +1087,7 @@ void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const Call *op) { cond += " && "; } } - + temp << get_indent() << "index_t dep_" << task_id << "_" << dep_num << " = 
{" << kv.first << ", {"; @@ -1071,7 +1108,7 @@ void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const Call *op) { } temp << "}};\n"; - temp << get_indent() << "task.deps[" << dep_num << "]" + temp << get_indent() << "task.deps[" << dep_num << "]" << " = dep_" << task_id << "_" << dep_num << ";\n"; dep_num ++; } @@ -1079,26 +1116,27 @@ void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const Call *op) { stream << temp.str(); stream << get_indent() << "task.num_of_deps = " << dep_num << ";\n\n"; // if (cond_num == 0) { - // stream << get_indent() << "task.num_of_deps = " + // stream << get_indent() << "task.num_of_deps = " // << dep_num << ";\n\n"; // } else { - // stream << get_indent() << "task.num_of_deps = (" - // << cond << " ? " << dep_num << " : " + // stream << get_indent() << "task.num_of_deps = (" + // << cond << " ? " << dep_num << " : " // << dep_num - cond_num << ");\n\n"; // } - // Prepare input information - stream << get_indent() << "inputs.finish = task.index;\n"; + // Prepare input information + stream << get_indent() << "inputs.finish = task.index;\n"; for (auto& input : inputs) { string name = input.name; string prefix = (starts_with(name, "inputs.args")) ? "_" : ""; - stream << get_indent() << input.name << " = " + stream << get_indent() << input.name << " = " << prefix << input.value << ";\n"; } - stream << get_indent() << "task.inputs = inputs;\n"; + stream << get_indent() << "task.inputs = inputs;\n"; stream << get_indent() << "write_channel_intel(qt, task);\n"; stream << get_indent() << "mem_fence(CLK_CHANNEL_MEM_FENCE);\n\n"; } else { + // Other intrinsics CodeGen_C::visit(op); } } @@ -1311,7 +1349,7 @@ void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const Store *op) { << " + " << id_ramp_base << ")) = " << id_value << ";\n"; } else { stream << get_indent() << "vstore" << t.lanes() << "(" - << id_value << "," + << id_value << ", " << 0 << ", (" << get_memory_space(op->name) << " " << print_type(t.element_of()) << "*)" << print_name(op->name) << " + " << id_ramp_base @@ -1771,8 +1809,8 @@ void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::add_kernel(Stmt s, // Emit the function prototype. 
IsAutorun checker; s.accept(&checker); - char *overlay_kenrel_name = getenv("HL_OVERLAY_KERNEL"); - if (checker.is_autorun || (overlay_kenrel_name != NULL && args.size() == 0)) { + char *overlay_kernel_name = getenv("HL_OVERLAY_KERNEL"); + if (checker.is_autorun || (overlay_kernel_name != NULL && args.size() == 0)) { stream << "__attribute__((max_global_work_dim(0)))\n"; stream << "__attribute__((autorun))\n"; } @@ -1781,7 +1819,7 @@ void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::add_kernel(Stmt s, for (size_t i = 0; i < args.size(); i++) { if (args[i].is_buffer) { stream << " " << get_memory_space(args[i].name) << " "; - // TODO: Update buffer attributes if written in kernel ip + // TODO: Update buffer attributes if written in kernel ip char *overlay_num = getenv("HL_OVERLAY_NUM"); if (!args[i].write && overlay_num == NULL) stream << "const "; stream << print_type(args[i].type) << " *" @@ -1828,6 +1866,10 @@ void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::add_kernel(Stmt s, } } + DeclareArrays da(this); + s.accept(&da); + stream << da.arrays.str(); + print(s); close_scope("kernel " + name); @@ -1855,8 +1897,8 @@ void CodeGen_OpenCL_Dev::init_module() { const Target &target = clc.get_target(); - // Check whether it's compiled for ip kernel - char *overlay_kenrel = getenv("HL_OVERLAY_KERNEL"); + // Check whether it's compiled for ip kernel + char *overlay_kenrel = getenv("HL_OVERLAY_KERNEL"); if (overlay_kenrel != NULL) return; // This identifies the program as OpenCL C (as opposed to SPIR). @@ -1988,7 +2030,7 @@ void CodeGen_OpenCL_Dev::init_module() { src_stream << "#include \"ihc_apint.h\"\n"; char *space_dim = getenv("HL_SPACE_DIM"); char *overlay_dtype = getenv("HL_OVERLAY_DTYPE"); - src_stream << "#define DTYPE " << string(overlay_dtype) + src_stream << "#define DTYPE " << string(overlay_dtype) << " // default data type\n"; src_stream << "#define K " << ip_num << " // number of IPs\n"; @@ -2026,7 +2068,7 @@ typedef struct task { bool last; } task_t; -// Task graph +// Task graph typedef struct graph { uint16_t slots; // valid bit used for allocation uint16_t issue; // valid bit used for task issuance @@ -2039,14 +2081,14 @@ channel task_t qt __attribute__((depth(64))); channel int qf __attribute__((depth(64))); )"; - // Print req and ack channels + // Print req and ack channels src_stream << "channel arg_t q[" << ip_num << "] __attribute__((depth(64)));\n"; src_stream << "channel index_t q_ret[" << ip_num << "] __attribute__((depth(64)));\n"; src_stream << R"( // Map from task to array index in the graph int map( graph_t* graph, index_t key) { for (int index = 0; index < SIZE; index++) { - index_t k = graph->tasks[index].index; + index_t k = graph->tasks[index].index; if (k.task_id == key.task_id) { bool match = true; for (int i = 0; i < M; i++) { @@ -2086,19 +2128,19 @@ void allocate(graph_t* graph, task_t task) { // Find out tasks with all deps satisified bool dispatch(graph_t* graph, task_t* task) { for (int i = 0; i < SIZE; i++) { - // valid task slot with no dependency + // valid task slot with no dependency if ((graph->slots & (1 << i)) && graph->deps[i] == 0x0000) { if (!(graph->issue & (1 << i))) { graph->issue |= (1 << i); - *task = graph->tasks[i]; + *task = graph->tasks[i]; return true; } } } return false; -} +} -// Update the graph +// Update the graph void update(graph_t* graph, index_t key) { int index = map(graph, key); // invalidate the slot @@ -2118,21 +2160,21 @@ void update(graph_t* graph, index_t key) { __attribute__((max_global_work_dim(0))) __attribute__((autorun)) 
__kernel void scheduler() { - + // create task graph graph_t graph; graph.slots = 0x0000; graph.issue = 0x0000; int task_count = 0; - bool task_not_end = true; + bool task_not_end = true; // perform scheduling while(1) { while (task_not_end) { if (graph.slots == 0xFFFF) break; // stop allocating when graph is full - task_t task = read_channel_intel(qt); // read task generated by the application + task_t task = read_channel_intel(qt); // read task generated by the application if (task.last) { // printf("Received the last task....\n"); @@ -2154,7 +2196,7 @@ __kernel void scheduler() { )"; for (int t = 0; t < ip_num; t++) { src_stream << string(16, ' ') << "case " << t << ": {\n"; - src_stream << string(20, ' ') << "write_channel_intel(q[" + src_stream << string(20, ' ') << "write_channel_intel(q[" << t << "], task.inputs);\n"; src_stream << string(20, ' ') << "break;\n"; src_stream << string(16, ' ') << "}\n"; @@ -2163,7 +2205,7 @@ __kernel void scheduler() { } } - // update the graph with ack information + // update the graph with ack information mem_fence(CLK_CHANNEL_MEM_FENCE); for (int i = 0; i < K; i++) { bool ret_valid = true; @@ -2177,7 +2219,7 @@ __kernel void scheduler() { src_stream << string(20, ' ') << "while (ret_valid) {\n"; src_stream << string(24, ' ') << "update(&graph, ret);\n" << string(24, ' ') << "ret = read_channel_nb_intel(q_ret["<< t <<"], &ret_valid); \n"; - src_stream << string(20, ' ') << "}\n" + src_stream << string(20, ' ') << "}\n" << string(20, ' ') << "break;\n"; src_stream << string(16, ' ') << "}\n"; } @@ -2194,10 +2236,10 @@ __kernel void scheduler() { } )"; - // Include the kernel ip functions - char *overlay_kenrel_files = getenv("HL_OVERLAY_FILES"); + // Include the kernel ip functions + char *overlay_kenrel_files = getenv("HL_OVERLAY_FILES"); user_assert(overlay_kenrel_files != NULL) << "HL_OVERLAY_FILES empty...\n"; - + std::string text(overlay_kenrel_files); std::size_t start = 0, end = 0; while ((end = text.find(" ", start)) != std::string::npos) { @@ -2243,15 +2285,15 @@ void CodeGen_OpenCL_Dev::compile_to_aocx(std::ostringstream &src_stream) { // Otherwise, dump the source code to ~/tmp/a.cl and compile it to ~/tmp/a.aocx. char *aocx_name = getenv("BITSTREAM"); std::string bitstream_file = (aocx_name != NULL) ? std::string(aocx_name) : (std::string(getenv("HOME")) + "/tmp/a.aocx"); - user_assert(ends_with(bitstream_file, ".aocx")) << " Bitstream file name expected to end with \".aocx\"\n"; + user_assert(ends_with(bitstream_file, ".aocx")) << " Bitstream file name expected to end with \".aocx\"\n"; user_assert(bitstream_file.size() < 300) << "The full name of the bitstream file is too long. " << "Consider to define a environment variable BITSTREAM within 300 characters instead. 
Current file name:\n" << bitstream_file << "\n"; - // If HL_OVERLAY_KERNEL is set, only compile the CL files in local - char *overlay_kenrel_name = getenv("HL_OVERLAY_KERNEL"); - if (overlay_kenrel_name != NULL) { - auto cl_name = std::string(overlay_kenrel_name) + ".cl"; + // If HL_OVERLAY_KERNEL is set, only compile the CL files in local + char *overlay_kernel_name = getenv("HL_OVERLAY_KERNEL"); + if (overlay_kernel_name != NULL) { + auto cl_name = std::string(overlay_kernel_name) + ".cl"; std::ofstream fp(cl_name.c_str(), std::ios::out); internal_assert(fp) << "Error: failed to open file " << cl_name << " for output.\n"; fp << src_stream.str() << "\n"; @@ -2316,10 +2358,10 @@ void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::print_global_data_structures_before_k } void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::DeclareChannels::visit(const Realize *op) { - if (ends_with(op->name, ".channel")) { + if (ends_with(op->name, ".channel") || ends_with(op->name, ".channel.array")) { // Get the bounds in which all bounds are for the dimensions of the channel array, except the last one is for the min depth. Region bounds = op->bounds; - std::string bounds_str = "" ; + std::string bounds_str = ""; std::string attributes = ""; for (size_t i = 0; i < bounds.size(); i++) { Range b = bounds[i]; @@ -2335,21 +2377,44 @@ void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::DeclareChannels::visit(const Realize } internal_assert(op->types.size() == 1) << "In generating Intel OpenCL for FPGAs, a single type is expected for a channel.\n"; std::string type = parent->print_type(op->types[0]); + std::ostringstream oss; - oss << "channel " << type << " " << parent->print_name(op->name) << bounds_str << attributes << ";\n"; - channels += oss.str(); + if (ends_with(op->name, ".channel")) { + oss << "channel " << type << " " << parent->print_name(op->name) << bounds_str << attributes << ";\n"; + channels += oss.str(); + } else { + string printed_name = parent->print_name(op->name); + string type_name = printed_name + "_t"; + size_t pos_last_token = printed_name.rfind('_'); + string channel_name = printed_name.substr(0, pos_last_token); + oss << "typedef struct { " << type << " s" << bounds_str << "; } " << type_name << ";\n"; + oss << "channel " << type_name << " " << channel_name << attributes << ";\n"; + channels += oss.str(); + } } IRVisitor::visit(op); } -/* +void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::DeclareArrays::visit(const Call *op) { + if (op->is_intrinsic(Call::write_array)) { + string printed_name = parent->print_name(op->args[0].as()->value); + string type_name = printed_name + "_t"; + if (array_set.find(printed_name) == array_set.end()) { + array_set.insert(printed_name); + arrays << parent->get_indent() << type_name << " " << printed_name << ";\n"; + } + } + return IRVisitor::visit(op); +} + +/* Check if shift reg has irregular bounds. e.g. realize shift regs [k, J-k] [0, K] [0, T] ... unrolled k = 0 to K unrolled j = k to J ... -The corresponding systolic array of above case is triangular in shape. +The corresponding systolic array of above case is triangular in shape. 
*/ bool CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::is_irregular(Region &bounds) { bool irregular_bounds = false; @@ -2410,7 +2475,7 @@ void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::GatherShiftRegsAllocates::print_irreg rhs << type << " " << new_name << bounds_str << ";\n"; shift_regs_allocates[reg_name].push_back(rhs.str()); } - + } } @@ -2426,7 +2491,7 @@ void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::GatherShiftRegsAllocates::visit(const } void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::GatherShiftRegsAllocates::visit(const Realize *op) { - if (ends_with(op->name, ".channel")) { + if (ends_with(op->name, ".channel") || ends_with(op->name, ".channel.array")) { } else if (ends_with(op->name, ".shreg")) { ostringstream rhs; Region bounds = op->bounds; @@ -2436,13 +2501,13 @@ void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::GatherShiftRegsAllocates::visit(const if (parent->is_irregular(bounds)) { debug(3) << op->name << " has irregular bounds. \n"; /* - For irregular case, we physically unroll the space dimension of shift registers. - e.g. + For irregular case, we physically unroll the space dimension of shift registers. + e.g. realize shreg[k, 3-k][0, 3][0, 100] of type float32 - [k, 3-k] and [0, 3] are space dimensions. - The generated code should be like this: - float shreg_0_0[100]; float shreg_0_1[100]; float shreg_0_2[100]; - float shreg_1_1[100]; float shreg_1_2[100]; + [k, 3-k] and [0, 3] are space dimensions. + The generated code should be like this: + float shreg_0_0[100]; float shreg_0_1[100]; float shreg_0_2[100]; + float shreg_1_1[100]; float shreg_1_2[100]; float shreg_2_2[100]; */ @@ -2455,7 +2520,7 @@ void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::GatherShiftRegsAllocates::visit(const // debug(2) << "space var " << var << "\n"; // Divide reg bounds into space bounds and time bounds. - // We only unroll the space part. + // We only unroll the space part. Region space_bounds, time_bounds; for (size_t i = 0; i < bounds.size(); i++) { if (i < space_var_num) @@ -2515,7 +2580,7 @@ void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::GatherShiftRegsAllocates::visit(const } void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const Realize *op) { - if (ends_with(op->name, ".channel")) { + if (ends_with(op->name, ".channel") || ends_with(op->name, ".channel")) { // We have already declared the channel before the kernel with print_global_data_structures_before_kernel(). // Just skip it and get into the body. print_stmt(op->body); @@ -2574,9 +2639,9 @@ void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const Provide *op){ std::string name = op->name.substr(0, op->name.length()-std::string(".ibuffer").size()); std::vector access_exprs; for(size_t i = 0; i < op->args.size(); i++) { - access_exprs.push_back(print_expr(op->args[i])); + access_exprs.push_back(print_expr(op->args[i])); } - string buffer_name = name + '.' + std::to_string(0) + ".ibuffer"; + string buffer_name = name + '.' 
+ std::to_string(0) + ".ibuffer"; stream << get_indent() << print_name(buffer_name); for(size_t i = 0; i < op->args.size(); i++) { stream << "["; @@ -2586,14 +2651,14 @@ void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const Provide *op){ stream << " = " << id_value << ";\n"; cache.clear(); } else if (ends_with(op->name, ".temp")) { - internal_assert(op->values.size() == 1); - string id_value = print_expr(op->values[0]); - std::string name = op->name; + internal_assert(op->values.size() == 1); + string id_value = print_expr(op->values[0]); + std::string name = op->name; std::vector access_exprs; for(size_t i = 0; i < op->args.size(); i++) { - access_exprs.push_back(print_expr(op->args[i])); + access_exprs.push_back(print_expr(op->args[i])); } - stream << get_indent() << print_name(name); + stream << get_indent() << print_name(name); // do_indent(); for(size_t i = 0; i < op->args.size(); i++) { stream << "["; diff --git a/Halide/src/CodeGen_OpenCL_Dev.h b/Halide/src/CodeGen_OpenCL_Dev.h index 0ab5dc2f..1e023e8d 100644 --- a/Halide/src/CodeGen_OpenCL_Dev.h +++ b/Halide/src/CodeGen_OpenCL_Dev.h @@ -107,6 +107,17 @@ class CodeGen_OpenCL_Dev : public CodeGen_GPU_Dev { void visit(const Realize *op) override; }; + // For declaring temporary array variable + class DeclareArrays : public IRVisitor { + using IRVisitor::visit; + CodeGen_OpenCL_C* parent; + std::set array_set; + public: + std::ostringstream arrays; + DeclareArrays(CodeGen_OpenCL_C* parent) : parent(parent) {} + void visit(const Call *op) override; + }; + // For unrolling loop with different strategies class CheckConditionalChannelAccess : public IRVisitor { using IRVisitor::visit; diff --git a/Halide/src/Func.cpp b/Halide/src/Func.cpp index f82e8db1..8dafbd23 100644 --- a/Halide/src/Func.cpp +++ b/Halide/src/Func.cpp @@ -1916,9 +1916,6 @@ void Func::apply_same_loop_transform_to_merged_ures() { if (var_name_match(merged_dims[i].var, func_dims[k].var)) { auto for_type = func_dims[k].for_type; auto device_api = func_dims[k].device_api; - debug(1) << "Not check race condition for merged URE " << merged_f.name() - << " with dimension " << merged_dims[i].var - << ". If you encountered an issue, please check set_dim_type.\n"; internal_assert(supported_types.count(for_type) > 0) << "Not implmented the loop transformation (with ForType = " << for_type << ") " << "for a merged URE " << merged_f.name() << "\n"; diff --git a/Halide/src/Func.h b/Halide/src/Func.h index c0181b02..5968497b 100644 --- a/Halide/src/Func.h +++ b/Halide/src/Func.h @@ -735,10 +735,6 @@ class Func { /** The imaging pipeline that outputs this Func alone. */ Pipeline pipeline_; - /** Get the imaging pipeline that outputs this Func alone, - * creating it (and freezing the Func) if necessary. */ - Pipeline pipeline(); - // Helper function for recursive reordering support Func &reorder_storage(const std::vector &dims, size_t start); @@ -748,6 +744,10 @@ class Func { friend class Overlay; public: + /** Get the imaging pipeline that outputs this Func alone, + * creating it (and freezing the Func) if necessary. */ + Pipeline pipeline(); + /** After we transform (i.e. unroll, vectorize, remove, etc.) a loop, call this function * to apply the same transform to the same loop in all the merged UREs of this Func, because * the UREs and this Func will be merged later and have to share the same loop structure. 
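For context on the `Func::apply_same_loop_transform_to_merged_ures()` comment above, a minimal hypothetical sketch (illustrative names; the `merge_ures` call is assumed to follow the tutorial-style T2SP API and is not part of this patch):

// Hypothetical sketch: X and Z are UREs that share one loop nest after merge_ures(),
// so a loop transform applied to Z's loops must also be applied to the merged URE X;
// apply_same_loop_transform_to_merged_ures() automates that propagation.
Var i("i"), j("j");
Func X("X"), Z("Z");
X(i, j) = cast<float>(i + j);   // a URE
Z(i, j) = X(i, j) * 2.0f;       // a URE consuming X
Z.merge_ures(X);                // merge X into Z's loop nest
Z.unroll(j);                    // this unroll is mirrored onto X's definition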
diff --git a/Halide/src/IR.cpp b/Halide/src/IR.cpp index 110a5562..c3ba613f 100644 --- a/Halide/src/IR.cpp +++ b/Halide/src/IR.cpp @@ -597,6 +597,7 @@ const char *const intrinsic_op_names[] = { "div_round_to_zero", "dynamic_shuffle", "extract_mask_element", + "fpga_reg", "gather", "glsl_texture_load", "glsl_texture_store", @@ -622,6 +623,7 @@ const char *const intrinsic_op_names[] = { "quiet_div", "quiet_mod", "random", + "read_array", "read_channel", "read_channel_nb", "read_field", @@ -646,6 +648,7 @@ const char *const intrinsic_op_names[] = { "stringify", "undef", "unsafe_promise_clamped", + "write_array", "write_channel", "write_channel_nb", "write_mem_channel", diff --git a/Halide/src/IR.h b/Halide/src/IR.h index aab21b59..74e11d11 100644 --- a/Halide/src/IR.h +++ b/Halide/src/IR.h @@ -508,6 +508,7 @@ struct Call : public ExprNode { div_round_to_zero, dynamic_shuffle, extract_mask_element, + fpga_reg, gather, glsl_texture_load, glsl_texture_store, @@ -533,6 +534,7 @@ struct Call : public ExprNode { quiet_div, quiet_mod, random, + read_array, read_channel, read_channel_nb, read_field, @@ -557,6 +559,7 @@ struct Call : public ExprNode { stringify, undef, unsafe_promise_clamped, + write_array, write_channel, write_channel_nb, write_mem_channel, diff --git a/Halide/src/Lower.cpp b/Halide/src/Lower.cpp index c435fcb5..0e159817 100644 --- a/Halide/src/Lower.cpp +++ b/Halide/src/Lower.cpp @@ -73,6 +73,7 @@ // T2S related #include "../../t2s/src/AutorunKernels.h" +#include "../../t2s/src/ChannelPromotion.h" #include "../../t2s/src/CheckRecursiveCalls.h" #include "../../t2s/src/ComputeLoopBounds.h" #include "../../t2s/src/CombineChannels.h" @@ -85,6 +86,7 @@ #include "../../t2s/src/MinimizeShregs.h" #include "../../t2s/src/NoIfSimplify.h" #include "../../t2s/src/Overlay.h" +#include "../../t2s/src/PatternMatcher.h" #include "../../t2s/src/Place.h" #include "../../t2s/src/ScatterAndBuffer.h" #include "../../t2s/src/SpaceTimeTransform.h" @@ -457,11 +459,11 @@ Module lower(const vector &output_funcs, << s << "\n\n"; } - debug(1) << "Detecting vector interleavings...\n"; + // debug(1) << "Detecting vector interleavings...\n"; // s = rewrite_interleavings(s); - s = simplify(s); - debug(2) << "Lowering after rewriting vector interleavings:\n" - << s << "\n\n"; + // s = simplify(s); + // debug(2) << "Lowering after rewriting vector interleavings:\n" + // << s << "\n\n"; debug(1) << "Partitioning loops to simplify boundary conditions...\n"; @@ -512,6 +514,13 @@ Module lower(const vector &output_funcs, } debug(1) << "CSE...\n"; s = common_subexpression_elimination(s); + debug(2) << "Lowering after CSE:\n" + << s << "\n\n"; + + debug(1) << "Matching compute patterns...\n"; + s = match_patterns(s); + debug(2) << "Lowering after matching patterns:\n" + << s <<"\n\n"; if (t.has_feature(Target::OpenGL)) { debug(1) << "Detecting varying attributes...\n"; @@ -525,6 +534,13 @@ Module lower(const vector &output_funcs, << s << "\n\n"; } + if (t.has_feature(Target::IntelFPGA)) { + debug(1) << "Inserting FPGA register calls\n"; + s = insert_fpga_reg(s, env); + debug(2) << "Lowering after inserting FPGA register calls:\n" + << s << "\n\n"; + } + debug(1) << "Lowering unsafe promises...\n"; s = lower_unsafe_promises(s, t); debug(2) << "Lowering after lowering unsafe promises:\n" @@ -537,11 +553,21 @@ Module lower(const vector &output_funcs, debug(1) << "Lowering after final simplification:\n" << s << "\n\n"; - // For overlay, we don't need to flatten task loops. 
+ debug(1) << "Replace memory channel with references...\n"; + s = replace_mem_channels(s, env, funcs_using_mem_channels); + debug(2) << "Lowering after replacing memory channels:\n" + << s << "\n\n"; + + debug(1) << "Promoting channels...\n"; + s = channel_promotion(s); + debug(2) << "Lowering after channel promotion:\n" + << s << "\n\n"; + + // For overlay, we don't need to flatten task loops. char *overlay_num = getenv("HL_OVERLAY_NUM"); if (overlay_num == NULL) { debug(1) << "Flatten the loops...\n"; - s = simplify(flatten_loops(s, env, funcs_using_mem_channels)); + s = simplify(flatten_loops(s, env)); debug(2) << "Lowering after loop flattening:\n" << s << "\n\n"; } diff --git a/Halide/src/Schedule.h b/Halide/src/Schedule.h index 8fa9f93e..0b1ce246 100644 --- a/Halide/src/Schedule.h +++ b/Halide/src/Schedule.h @@ -465,15 +465,15 @@ struct PrefetchDirective { struct SpaceTimeTransformParams { std::vector sch_vector; SpaceTimeTransform check_time; + std::vector src_vars; std::vector dst_vars; std::vector> proj_matrix; std::map reverse; - // The following fields record the original specification, without any processing (In comparison, the above fields + // The following field records the original specification, without any processing (In comparison, the above fields // like sch_vector, proj_matrix, etc. could have been processed to be different from the original specification. See // PreprocessBeforeLower.cpp). bool sch_vector_specified; // If false, a scheduling vector was not actually specified, so this is an "unscheduled" // stt, and the above sch_vector is what compiler automatically makes. - size_t num_src_vars_specified; // Number of the original src_vars specified. }; /** Record arguments for each invocation. */ diff --git a/Halide/src/SharedUtilsInC.cpp b/Halide/src/SharedUtilsInC.cpp new file mode 120000 index 00000000..8d0cb909 --- /dev/null +++ b/Halide/src/SharedUtilsInC.cpp @@ -0,0 +1 @@ +../../t2s/src/SharedUtilsInC.cpp \ No newline at end of file diff --git a/Halide/src/SharedUtilsInC.h b/Halide/src/SharedUtilsInC.h new file mode 120000 index 00000000..87c5b50b --- /dev/null +++ b/Halide/src/SharedUtilsInC.h @@ -0,0 +1 @@ +../../t2s/src/SharedUtilsInC.h \ No newline at end of file diff --git a/Halide/src/runtime/nvidia_libdevice_bitcode/libdevice.compute_20.10.bc b/Halide/src/runtime/nvidia_libdevice_bitcode/libdevice.compute_20.10.bc new file mode 100644 index 00000000..455b676b Binary files /dev/null and b/Halide/src/runtime/nvidia_libdevice_bitcode/libdevice.compute_20.10.bc differ diff --git a/Halide/src/runtime/nvidia_libdevice_bitcode/libdevice.compute_30.10.bc b/Halide/src/runtime/nvidia_libdevice_bitcode/libdevice.compute_30.10.bc new file mode 100644 index 00000000..031525e9 Binary files /dev/null and b/Halide/src/runtime/nvidia_libdevice_bitcode/libdevice.compute_30.10.bc differ diff --git a/Halide/src/runtime/nvidia_libdevice_bitcode/libdevice.compute_35.10.bc b/Halide/src/runtime/nvidia_libdevice_bitcode/libdevice.compute_35.10.bc new file mode 100644 index 00000000..2634126e Binary files /dev/null and b/Halide/src/runtime/nvidia_libdevice_bitcode/libdevice.compute_35.10.bc differ diff --git a/Halide/src/runtime/opencl.cpp b/Halide/src/runtime/opencl.cpp index e9dd4ffb..ca075f97 100644 --- a/Halide/src/runtime/opencl.cpp +++ b/Halide/src/runtime/opencl.cpp @@ -609,14 +609,15 @@ WEAK int create_opencl_context(void *user_context, cl_context *ctx, cl_command_q }}}} // namespace Halide::Runtime::Internal::OpenCL extern "C" { -double 
exec_times[MAX_COMMAND_QUEUES][3]={{0,0,0}}; +int64_t exec_times[MAX_COMMAND_QUEUES][3]={{0,0,0}}; const char *entry_names[MAX_COMMAND_QUEUES]={NULL}; WEAK int halide_opencl_wait_for_kernels_finish(void *user_context) { // When we use multiple command queues, we flushed kernels, and did not wait for // them to finish. Here we wait for all of them to finish. #ifdef OCL_MULTI_CMD_Q - void *fp = fopen("profile_info.txt", "w"); + int64_t k_earliest_start_time; + int64_t k_latest_end_time; for ( cl_int i = current_command_queue; i >= 0; i-- ) { if (command_queues[i] != NULL) { // TOFIX: overlay does not work well with WAIT_FINISH @@ -627,11 +628,25 @@ WEAK int halide_opencl_wait_for_kernels_finish(void *user_context) { exec_times[i][1]=halide_current_time_ns(user_context); exec_times[i][2]=exec_times[i][1]-exec_times[i][0]; - //fprintf(fp,"Kernel %d(%s): %lf\n",i,entry_names[i],exec_times[i][2]); - fprintf(fp,"%d\n",(int)exec_times[i][2]); + + if (i == (cl_int) current_command_queue) { + k_earliest_start_time = exec_times[i][0]; + k_latest_end_time = exec_times[i][1]; + } else { + if (exec_times[i][0] < k_earliest_start_time) { + k_earliest_start_time = exec_times[i][0]; + } + if (exec_times[i][1] > k_latest_end_time) { + k_latest_end_time = exec_times[i][1]; + } + } } } } + int64_t k_overall_exec_time = k_latest_end_time - k_earliest_start_time; + + void *fp = fopen("exec_time.txt", "w"); + fprintf(fp,"%f\n", (double)k_overall_exec_time); fclose(fp); debug(user_context) << "CLFinish: All command queues finished\n"; #endif diff --git a/README.md b/README.md index 5a7fb361..587ec348 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ -T2SP (Temporal To Spatial Programming, previously called T2S) enables software programmers to build a high-performance design on a spatial architecture (like FPGAs) in a constructive, incremental, and productive way. Particularly, programmers can quickly build sophisticated systolic arrays on spatial architectures, where systolic arrays are often the key to achieve high performance. This methodology extends [Halide](https://halide-lang.org/) from temporal architectures (CPUs and GPUs) to spatial architectures. +T2SP (Temporal To Spatial Programming, previously called T2S) enables software programmers to build a high-performance design for a tensor compute on a spatial architecture (like FPGAs) in a constructive, incremental, and productive way. Particularly, programmers can quickly build sophisticated systolic arrays on spatial architectures, where systolic arrays are often the key to achieve high performance. This methodology extends [Halide](https://halide-lang.org/) from temporal architectures (CPUs and GPUs) to spatial architectures. -T2SP is available under a permissive license, the [BSD+Patent license](./LICENSE.md). +T2SP is available under a permissive license, the [BSD+Patent license](./LICENSE.md). In this initial release, we support only Intel FPGAs. We assume your FPGA is local to you, or within Intel FPGA DevCloud, and the operating system is Linux. Other platforms might also work, but are not tested. @@ -86,7 +86,7 @@ This may take ~1 hour on a DevCloud machine. If you have your own gcc, llvm or clang and thus did not use `install-tools.sh` as shown above, in `my-setenv.sh`, modify the following path variables appropriately: ``` - export GCC_PATH=... + GCC_PATH=... export LLVM_CONFIG=... export CLANG=... 
``` @@ -130,9 +130,8 @@ Set up the environment (whenever a new terminal is open) with one of the followi ``` + For debugging the T2SP compiler with source code information, ```make -j OPTIMIZE="-O0 -g"``` instead. -+ For the T2SP compiler to throw exceptions instead of silently exit in case of any error, ```make -j WITH_EXCEPTIONS=1``` instead. + To debug runtime, ```make -j OPTIMIZE_RUNTIME="-O0 -g" ``` instead. -+ To enable all of them, ```make -j OPTIMIZE="-O0 -g" WITH_EXCEPTIONS=1 OPTIMIZE_RUNTIME="-O0 -g" ``` instead. ++ To enable both of them, ```make -j OPTIMIZE="-O0 -g" OPTIMIZE_RUNTIME="-O0 -g" ``` instead. # Regression tests @@ -148,13 +147,49 @@ To remove all the temporary files generated during the regression testing: ./test.sh clean ``` +# Performance tests + +Current release contains only SGEMM on Arria 10 FPGA. Follow the details at `t2s/tests/performance/gemm/README.md`. + +# Features + +The current release contains the following features: + ++ Expressing systolic arrays + + UREs (uniform recurrence equations) and space-time transforms are supported for expressing systolic arrays in general. Currently, a space-time transform must be unimodular. + ++ Defining an abstract, performance portable memory hierarchy + + A memory hierarchy is defined for each tensor by streaming the tensor across DRAM, SRAM, and registers. The memory hierarchy is then specialized by the compiler for specific hardware with portable performance. The current release targets FPGAs only. Next releases will support GPUs as well. + ++ Isolation + + Split a compute into spatial pieces, so that each piece can be optimized individually. + ++ Data optimizations + + Data gathering, scattering, double buffering, serialization + ++ Loop optimizations + + Loop flattening, removal, unrolling, vectorization + # Tutorials +A 10-minute video `intro.mp4`, located at the root of the repository, introduces the basic concept of T2SP. + We have a set of [tutorials](https://github.com/intel/FPGA-Devcloud/tree/master/main/QuickStartGuides/T2S) at Intel FPGA DevCloud. A compiler binary is also there, and all dependencies have been installed, so you may start using the programming environment immediately. -# Next release +# Next releases + ++ SGEMM performance test that works across Arria 10 FPGA, GEN 9.5 GPU, and GEN 12 GPU with high and portable performance. Aim to open by the end of October, 2021. + + This will be our first test that builds a hardware systolic array on an FPGA and a software systolic array on a GPU with high performance in a single specification. + ++ Other portable performance tests, including 2-D convolution, Capsule convolution, and PairHMM, are aimed to open in November, 2021. -+ A new language interface for productive and portable performance across FPGAs and GPUs, and a performance test suite. This work enables programmers to build a hardware systolic array on an FPGA and a software systolic array on a GPU with portably high performance in a single specification. Aim to open in November, 2021. ++ Support for Stratix 10 FPGA will be released afterwards (The current release works for S10 as well, but lacks some critical optimizations for it). 
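As a rough illustration of the "Expressing systolic arrays" feature listed above, a hypothetical sketch of a matrix-multiply systolic array in the tutorial style (the declarations, the recursive URE form, and the `merge_ures`/`space_time_transform` signatures are simplified assumptions; see `t2s/tests/performance/gemm/README.md` and the tutorials for the authoritative form):
```
// Hypothetical sketch, not a verbatim excerpt from this repository.
ImageParam a(Float(32), 2, "a"), b(Float(32), 2, "b");
const int K = 256;                      // reduction extent (illustrative)
Var i("i"), j("j"), k("k");
Func A("A"), B("B"), C("C"), c("c");
// UREs: values of a travel along j, values of b along i, C accumulates along k.
A(k, j, i) = select(j == 0, a(k, i), A(k, j - 1, i));
B(k, j, i) = select(i == 0, b(k, j), B(k, j, i - 1));
C(k, j, i) = select(k == 0, 0.0f, C(k - 1, j, i)) + A(k, j, i) * B(k, j, i);
c(j, i)    = select(k == K - 1, C(k, j, i));
// Merge the UREs into a single loop nest and map i and j to space (a 2-D PE array).
A.merge_ures(B, C, c)
 .space_time_transform(j, i);
```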
# Citation diff --git a/install-tools.sh b/install-tools.sh old mode 100644 new mode 100755 diff --git a/intro.mp4 b/intro.mp4 new file mode 100644 index 00000000..e60ea7a6 --- /dev/null +++ b/intro.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:052ffe5cfd57c42be930140649ee1080f2d2ab1d261262726b365646485c106e +size 119514644 diff --git a/setenv.sh b/setenv.sh old mode 100644 new mode 100755 index 6f83fa95..e6e5c93a --- a/setenv.sh +++ b/setenv.sh @@ -18,10 +18,11 @@ if [ "$1" != "devcloud" -a "$1" != "local" ]; then fi export T2S_PATH="$( cd "$(dirname $(realpath "$BASH_SOURCE") )" >/dev/null 2>&1 ; pwd -P )" # The path to this script -export TOOLS_PATH=$T2S_PATH/install +TOOLS_PATH=$T2S_PATH/install # Modify these 3 paths if you installed your own versions of gcc or llvm-clang -export GCC_PATH=$TOOLS_PATH/gcc-7.5.0 +# gcc should be located at $GCC_PATH/bin +GCC_PATH=$TOOLS_PATH/gcc-7.5.0 export LLVM_CONFIG=$TOOLS_PATH/bin/llvm-config export CLANG=$TOOLS_PATH/bin/clang diff --git a/t2s/src/AOT-OpenCL-Runtime.cpp b/t2s/src/AOT-OpenCL-Runtime.cpp index 18dd6c0b..ae01b36c 100644 --- a/t2s/src/AOT-OpenCL-Runtime.cpp +++ b/t2s/src/AOT-OpenCL-Runtime.cpp @@ -17,6 +17,7 @@ * SPDX-License-Identifier: BSD-2-Clause-Patent *******************************************************************************/ #include "AOT-OpenCL-Runtime.h" +#include "SharedUtilsInC.h" #define WEAK __attribute__((weak)) #define ACL_ALIGNMENT 64 @@ -130,7 +131,6 @@ WEAK int32_t halide_device_and_host_malloc(void *user_context, struct halide_buf const char *name = getenv("INTEL_FPGA_OCL_PLATFORM_NAME"); platform = findPlatform(name); -printf("%s\n", name); if(platform == NULL) { DPRINTF("ERROR: Unable to find Intel(R) FPGA OpenCL platform\n"); return -1; @@ -297,16 +297,16 @@ WEAK int32_t halide_device_and_host_free(void *user_context, void *obj) { WEAK void halide_device_and_host_free_as_destructor(void *user_context, void *obj) { } +// Return execution time in nanoseconds, as well as the start and end time in nanoseconds double compute_kernel_execution_time(cl_event &event, double &start_d, double &end_d) { cl_ulong start, end; clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL); clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL); - start_d = (double)1.0e-9 * start; - end_d = (double)1.0e-9 * end; - //return (double)(end-start); - return (double)1.0e-9 * (end - start); // nanoseconds to seconds + start_d = (double)start; + end_d = (double)end; + return (double)(end-start); } WEAK int32_t halide_opencl_wait_for_kernels_finish(void *user_context) { @@ -360,41 +360,36 @@ WEAK int32_t halide_opencl_wait_for_kernels_finish(void *user_context) { double k_start_time[NUM_KERNELS_TO_CREATE]; double k_end_time[NUM_KERNELS_TO_CREATE]; double k_exec_time[NUM_KERNELS_TO_CREATE]; - double max_time = 0; for (int i = 0; i < NUM_KERNELS_TO_CREATE; i++) { k_exec_time[i] = compute_kernel_execution_time(kernel_exec_event[i], k_start_time[i], k_end_time[i]); - if (k_exec_time[i] > max_time) { - max_time = k_exec_time[i]; - } } - char *output_file = getenv("BITSTREAM"); - int i; - for (i = strlen(output_file) - 1; i >= 0 && output_file[i] != '/'; i--) { ; } - strcpy(output_file + i + 1, "exec_time.txt"); - double k_earliest_start_time = k_start_time[0]; double k_latest_end_time = k_end_time[0]; - for (int i = 1; i < NUM_KERNELS_TO_CREATE; i++) { - if (k_start_time[i] < k_earliest_start_time) + if (k_start_time[i] < k_earliest_start_time) { 
k_earliest_start_time = k_start_time[i]; - - if (k_end_time[i] > k_latest_end_time) + } + if (k_end_time[i] > k_latest_end_time) { k_latest_end_time = k_end_time[i]; + } } - uint64_t k_overall_exec_time = (uint64_t)((k_latest_end_time - k_earliest_start_time) * 1e9); - - FILE *fp = fopen(output_file, "w+"); + double k_overall_exec_time = k_latest_end_time - k_earliest_start_time; + char *bitstream_dir = bitstream_directory(); + char *exec_time_file = concat_directory_and_file(bitstream_dir, "exec_time.txt"); + FILE *fp = fopen(exec_time_file, "w"); if (fp == NULL) { - DPRINTF("Failed to open the AOCX file (fopen).\n"); + DPRINTF("Failed to open %s for writing.\n", exec_time_file); + free(bitstream_dir); + free(exec_time_file); return -1; } - - fprintf(fp, "%ld\n", k_overall_exec_time); - + fprintf(fp, "%f\n", k_overall_exec_time); + fclose(fp); + free(bitstream_dir); + free(exec_time_file); return 0; } @@ -406,25 +401,29 @@ WEAK int halide_opencl_buffer_copy(void *user_context, struct halide_buffer_t *s bool from_host = (src->device == 0) || (src->host_dirty() && src->host != NULL); if (!from_host && to_host) { - std::cout << "Command queue " << current_kernel << ": copying " << src->size_in_bytes() << " bytes data from device to host. \n"; + std::cout << "Command queue " << current_kernel << ": copying " << src->size_in_bytes() << " bytes data from device to host. "; status = clEnqueueReadBuffer(cmdQueue[current_kernel], ((device_handle *)src->device)->mem, CL_TRUE, 0, src->size_in_bytes(), (void *)(dst->host), 0, NULL, NULL); + std::cout << "Done.\n"; } else if (from_host && !to_host) { - std::cout << "Command queue " << current_kernel << ": copying " << src->size_in_bytes() << " bytes data from host to device. \n"; + std::cout << "Command queue " << current_kernel << ": copying " << src->size_in_bytes() << " bytes data from host to device. "; status = clEnqueueWriteBuffer(cmdQueue[current_kernel], ((device_handle *)dst->device)->mem, CL_TRUE, 0, src->size_in_bytes(), (void *)(src->host), 0, NULL, NULL); + std::cout << "Done.\n"; } else if (!from_host && !to_host) { - std::cout << "Command queue " << current_kernel << ": copying " << src->size_in_bytes() << " bytes data from device to device. \n"; + std::cout << "Command queue " << current_kernel << ": copying " << src->size_in_bytes() << " bytes data from device to device. "; status = clEnqueueCopyBuffer(cmdQueue[current_kernel], ((device_handle *)src->device)->mem, ((device_handle *)dst->device)->mem, 0, 0, src->size_in_bytes(), 0, NULL, NULL); + std::cout << "Done.\n"; } else if (dst->host != src->host) { - std::cout << "Copying " << src->size_in_bytes() << " bytes data from host to host. \n"; + std::cout << "Copying " << src->size_in_bytes() << " bytes data from host to host. "; memcpy((void *)(dst->host), (void *)(src->host), src->size_in_bytes()); + std::cout << "Done.\n"; } else { - std::cout << "Do nothing. \n"; + std::cout << "halide_opencl_buffer_copy: host to host copy with source address equal to destination address. Do nothing. \n"; } return 0; } diff --git a/t2s/src/ChannelPromotion.cpp b/t2s/src/ChannelPromotion.cpp new file mode 100644 index 00000000..95b372d7 --- /dev/null +++ b/t2s/src/ChannelPromotion.cpp @@ -0,0 +1,398 @@ +/******************************************************************************* +* Copyright 2021 Intel Corporation +* +* Licensed under the BSD-2-Clause Plus Patent License (the "License"); +* you may not use this file except in compliance with the License. 
+* You may obtain a copy of the License at +* +* https://opensource.org/licenses/BSDplusPatent +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions +* and limitations under the License. +* +* +* SPDX-License-Identifier: BSD-2-Clause-Patent +*******************************************************************************/ +#include "IR.h" +#include "IRMutator.h" +#include "IROperator.h" +#include "Simplify.h" +#include "Substitute.h" +#include "Utilities.h" + +#include "./DebugPrint.h" +#include "./LoopRemoval.h" + +namespace Halide { +namespace Internal { + +struct PromotedChannel { + bool is_write_chn; // True if it is a write_channel intrinsic + bool is_simple_cond; // True if it is guarded by a simple condition that allows promotion + string name; // Channel name after eliminating unnecessary suffix + string promotion_loop; // The loop that the channel should be promoted above + vector args; // The read/write_channel arguments +}; + +std::string get_channel_name(Expr e) { + auto v = e.as(); + internal_assert(v); + + int size = (v->value).rfind("."); + std::string channel_name = v->value; + debug(4) << "channel name: " << channel_name << "\n"; + if (size < (int)(v->value.size())-1) { + std::string suffix = (v->value).substr(size + 1, (int)v->value.size() - size - 1); + // Eliminate useless suffix + while (suffix != "channel") { + channel_name = (channel_name).substr(0, size); + size = (channel_name).rfind("."); + if (size < (int)(channel_name.size())-1) { + suffix = (channel_name).substr(size + 1, (int)channel_name.size() - size - 1); + } else break; + if (suffix != "channel") + channel_name = v->value; + } + } + debug(4) << "modified channel name: " << channel_name << "\n"; + return channel_name; +} + +auto get_promoted_channel(vector &channels, string name, bool is_write_chn) +-> vector::iterator { + auto it = channels.begin(); + for (; it != channels.end(); ++it) { + if (it->name == name && it->is_write_chn == is_write_chn) break; + } + return it; +} + +class ChannelVisitor : public IRVisitor { + public: + ChannelVisitor() {} + + vector channels; + vector unrolled_loops; + vector loop_vars; + + // Path condition to a channel write/read + Expr condition = const_true(); + + private: + using IRVisitor::visit; + + void visit(const For* op) override { + if (op->for_type == ForType::Unrolled) { + unrolled_loops.push_back(op->name); + } + loop_vars.push_back(op->name); + + op->body.accept(this); + + if (op->for_type == ForType::Unrolled) { + unrolled_loops.pop_back(); + } + loop_vars.pop_back(); + } + + void visit(const IfThenElse* op) override { + if (loop_vars.size() > 0) { + Expr temp_cond = condition; + condition = simplify(temp_cond && (op->condition)); + op->then_case.accept(this); + + if (op->else_case.defined()) { + condition = simplify(temp_cond && !(op->condition)); + op->else_case.accept(this); + } + condition = temp_cond; + return; + } + IRVisitor::visit(op); + } + + void visit(const Select *op) override { + if (loop_vars.size() > 0) { + Expr temp_cond = condition; + condition = simplify(temp_cond && (op->condition)); + op->true_value.accept(this); + + if (op->false_value.defined()) { + condition = simplify(temp_cond && !(op->condition)); + op->false_value.accept(this); + } + condition = temp_cond; + return; + } + IRVisitor::visit(op); + } + + bool 
find_loop_in_args(vector args, string loop) { + for (auto v : args) { + if (v.as() && v.as()->name == loop) + return true; + } + return false; + } + + bool check_cond(Expr condition, vector args, string &promotion_loop) { + // Collect occured loop variables in the condition + vector conjunction = break_logic_into_conjunction(condition); + std::set occured_loops; + bool is_simple_cond = true; + for (auto &c : conjunction) { + auto eq = c.as(); + if (!eq) { + if (!is_one(c)) { + is_simple_cond = false; + } + continue; + } + // Two cases of simple condition: + // 1. var_0 == var_1 + // for (gather_iii, 0, III) + // unrolled for (iii, 0, III) + // if (gather_iii == iii) + // read/write_channel(name, iii) + // The channel operation can be promoted out of gather_iii (occured_loop) + // since its loop body accesses only a part of the channel array + // 2. var_0 == 0 (const) + // unrolled for (iii, 0, III) + // read/write_channel(name, 0) + // The channel operation can be promoted out of iii (boarder) + auto lhs = eq->a.as(); + auto rhs = eq->b.as(); + if ((lhs && rhs) + || (lhs && is_const(eq->b)) + || (rhs && is_const(eq->a))) { + if (lhs && !find_loop_in_args(args, lhs->name)) { + occured_loops.insert(lhs->name); + } + if (rhs && !find_loop_in_args(args, rhs->name)) { + occured_loops.insert(rhs->name); + } + } else { + is_simple_cond = false; + } + } + // Find the promotion loop outward + for (auto it = loop_vars.rbegin(); it != loop_vars.rend(); ++it) { + promotion_loop = *it; + // Promote channels outside the argument loops (always) + // or the occured loops (if simple condition) + if (!find_loop_in_args(args, *it)) { + auto o = occured_loops.find(*it); + if (o != occured_loops.end()) { + occured_loops.erase(o); + } else { + break; + } + } + } + // If occured_loop is not empty, there exists condition like this: + // for (gather_iii, 0, III) + // for (dummy, 0, D) + // for (iii, 0, III) + // if (gather_iii == iii) ... + // In such case, we cannot guarantee the correctness of channel promotion + return is_simple_cond && occured_loops.empty(); + } + + class VarsFinder : public IRVisitor + { + vector &vars; + public: + VarsFinder(vector &_v): vars(_v) {} + using IRVisitor::visit; + + // Return true if a variable in vars is encountered. + bool found = false; + + void visit(const Variable* op) override { + for (auto e : vars) { + auto v = e.as(); + if (v && op->name == v->name) + found = true; + } + } + + void visit(const Call *op) override { + // Do not check call arguments + if (op->is_intrinsic(Call::read_shift_reg)) + return; + IRVisitor::visit(op); + } + }; + + void visit(const Call* op) override { + if (op->is_intrinsic(Call::write_channel) || op->is_intrinsic(Call::read_channel)) { + bool is_write_chn = op->is_intrinsic(Call::write_channel) ? 
true : false; + auto args = op->args; + string chn_name = get_channel_name(args[0]); + + bool need_promotion = false; + string promotion_loop = loop_vars.back(); + bool is_simple_cond = check_cond(condition, args, promotion_loop); + if (promotion_loop != loop_vars.back()) { + need_promotion = true; + } + + if (!is_write_chn) { + // Check the producer-consumer relation to determine if the promotion is valid + auto it = get_promoted_channel(channels, chn_name, true); + if (it == channels.end()) { + // The producer is not promoted, so the consumer cannot be promoted + need_promotion = false; + } else { + if (is_simple_cond) { + // The producer is promoted, and the consumer has simple condition + need_promotion = true; + } else { + // The consumer cannot be promoted, so revoke producer's promotion + need_promotion = false; + it = channels.erase(it); + } + } + } else { + // For write channels, if channel arguments occurs in the condition, we cannot ensure the entire array + // is write at once (example is CNN-Kung-Song). So we disable promotion for safe. + VarsFinder vf(args); + condition.accept(&vf); + if (vf.found) { + need_promotion = false; + } + } + if (need_promotion) { + auto it = get_promoted_channel(channels, chn_name, is_write_chn); + if (it != channels.end()) { + // We allow multiple read/write to one channel + // But they must be promoted at the same loop + internal_assert(promotion_loop == it->promotion_loop); + } else { + // Record the promoted channel + PromotedChannel c = { is_write_chn, is_simple_cond, chn_name, promotion_loop, args }; + channels.push_back(std::move(c)); + } + } + } + IRVisitor::visit(op); + } +}; + +class ChannelPromotor : public IRMutator { + public: + ChannelPromotor(ChannelVisitor &_cv) + : channels(_cv.channels) {} + + vector &channels; + + private: + using IRMutator::visit; + + // Add suffix .array to promoted channels + Stmt visit(const Realize* op) override { + Stmt body = mutate(op->body); + string name = op->name; + for (auto &c : channels) { + if (c.name == op->name) { + name += ".array"; + break; + } + } + return Realize::make(name, op->types, op->memory_type, op->bounds, op->condition, body); + } + + Stmt visit(const For* op) override { + Stmt body = mutate(op->body); + for (auto &c : channels) { + if (op->name == c.promotion_loop) { + string chn_array = c.name + ".array"; + if (c.is_write_chn) { + // Read/write the entire array as a whole + // realize bool c.name.temp + // c.name.temp = 0 + // for (name, min, extent) {...} + // if (c.name.temp == 1) + // write_channel(c.name, read_array(chn_array)) + Expr read_array = Call::make(Handle(), "read_array", { chn_array }, Call::PureIntrinsic); + Expr write = Call::make(Handle(), "write_channel", { c.name, read_array }, Call::PureIntrinsic); + Stmt write_stmt = Evaluate::make(write); + if (!c.is_simple_cond) { + Expr get_flag = Call::make(Bool(), c.name+".temp", {}, Call::Intrinsic); + write_stmt = IfThenElse::make(get_flag == 1, write_stmt); + } + body = Block::make(body, write_stmt); + if (!c.is_simple_cond) { + Stmt init_flag = Provide::make(c.name+".temp", {0}, {}); + body = Block::make(init_flag, body); + body = Realize::make(c.name+".temp", { Bool() }, MemoryType::Auto, {}, const_true(), body); + } + } else { + // Read the entire array as a whole + // write_array(chn_array, read_channel(c.name)) + // for (name, min, extent) {...} + Expr read = Call::make(Handle(), "read_channel", { c.name }, Call::PureIntrinsic); + Expr write_array = Call::make(Handle(), "write_array", { chn_array, read }, 
Call::PureIntrinsic); + Stmt read_stmt = Evaluate::make(write_array); + body = Block::make(read_stmt, body); + } + } + } + body = For::make(op->name, op->min, op->extent, op->for_type, op->device_api, body); + return body; + } + + Expr visit(const Call* op) override { + if (op->is_intrinsic(Call::write_channel) || op->is_intrinsic(Call::read_channel)) { + bool is_write_chn = op->is_intrinsic(Call::write_channel) ? true : false; + string chn_name = get_channel_name(op->args[0]); + vector args; + for (size_t i = 0; i < op->args.size(); i++) { + args.push_back(mutate(op->args[i])); + } + + auto it = get_promoted_channel(channels, chn_name, is_write_chn); + // Replace read/write_channel with read/write_array + // The channel operation is recreated when visiting For nodes + if (it != channels.end()) { + string call_name = is_write_chn ? "write_array" : "read_array"; + args[0] = StringImm::make(chn_name + ".array"); + return Call::make(op->type, call_name, args, Call::PureIntrinsic); + } + return Call::make(op->type, op->name, args, op->call_type); + } + return IRMutator::visit(op); + } + + Stmt visit(const Evaluate *op) override { + Expr value = mutate(op->value); + Stmt ret = Evaluate::make(value); + + auto call = value.as(); + if (call && call->is_intrinsic(Call::write_array)) { + string chn_name = get_channel_name(call->args[0].as()->value); + auto it = get_promoted_channel(channels, chn_name, true); + + if (!it->is_simple_cond) { + string name = chn_name + ".temp"; + Stmt set_flag = Provide::make(name, {1}, {}); + ret = Block::make(ret, set_flag); + } + } + return ret; + } +}; + +Stmt channel_promotion(Stmt s) { + ChannelVisitor cv; + ChannelPromotor cp(cv); + s.accept(&cv); + s = cp.mutate(s); + return s; +} + +} +} diff --git a/t2s/src/ChannelPromotion.h b/t2s/src/ChannelPromotion.h new file mode 100644 index 00000000..c3ec0ce6 --- /dev/null +++ b/t2s/src/ChannelPromotion.h @@ -0,0 +1,45 @@ +/******************************************************************************* +* Copyright 2021 Intel Corporation +* +* Licensed under the BSD-2-Clause Plus Patent License (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* https://opensource.org/licenses/BSDplusPatent +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions +* and limitations under the License. +* +* +* SPDX-License-Identifier: BSD-2-Clause-Patent +*******************************************************************************/ +#ifndef T2S_CHANNEL_PROMOTION_H +#define T2S_CHANNEL_PROMOTION_H + +/** \file + * + * Defines a pass to move a channel read/write above a loop to convert "an array of channels" into + * "a channel of array", since too many asynchronous channels may consume resources and lower frequency. 
+ * For example, channel float A[I] is converted into channel float[I] A, and transform code into: + * float[I] arrA = read_channel("A") + * unrolled for (i, 0, I) { + * // original: float a = read_channel("A", i) + * float a = arrA[i] + * } + */ + +#include "../../Halide/src/IR.h" + +namespace Halide { +namespace Internal { + +/* Promote channels */ +extern Stmt channel_promotion(Stmt s); + +} +} + +#endif diff --git a/t2s/src/CombineChannels.cpp b/t2s/src/CombineChannels.cpp index cff5f7f4..ae5849c0 100644 --- a/t2s/src/CombineChannels.cpp +++ b/t2s/src/CombineChannels.cpp @@ -129,6 +129,7 @@ class GatherChannelAccess : public IRVisitor { // TOFIX: Currently do not handle non-blocking channel reads if (op->is_intrinsic(Call::read_channel) || op->is_intrinsic(Call::write_channel)) { const StringImm *v = op->args[0].as(); + internal_assert(v); string channel = v->value; vector args; if (op->is_intrinsic(Call::read_channel)) { @@ -529,6 +530,7 @@ class CombineChannels : public IRMutator { Expr visit(const Call* op) override { if (op->is_intrinsic(Call::read_channel)) { const StringImm *v = op->args[0].as(); + internal_assert(v); string channel = v->value; if (channel_to_combined.find(channel) == channel_to_combined.end()) { return IRMutator::visit(op); @@ -561,6 +563,7 @@ class CombineChannels : public IRMutator { return IRMutator::mutate(stmt); } const StringImm *v = call->args[0].as(); + internal_assert(v); string channel = v->value; if (channel_to_combined.find(channel) == channel_to_combined.end()) { return IRMutator::mutate(stmt); diff --git a/t2s/src/DebugPrint.cpp b/t2s/src/DebugPrint.cpp index 3f2f883e..5b8f053c 100644 --- a/t2s/src/DebugPrint.cpp +++ b/t2s/src/DebugPrint.cpp @@ -176,6 +176,22 @@ string names_to_string(const vector &v) { return s.str(); } +string names_to_string(const vector &v) { + std::ostringstream s; + for (size_t i = 0; i < v.size(); i++) { + s << ((i==0) ? "" : ", ") << v[i].name(); + } + return s.str(); +} + +string names_to_string(const vector &v) { + std::ostringstream s; + for (size_t i = 0; i < v.size(); i++) { + s << ((i==0) ? 
"" : ", ") << v[i].name(); + } + return s.str(); +} + string to_string(const map &boxes) { std::ostringstream s; for (auto b: boxes) { diff --git a/t2s/src/DebugPrint.h b/t2s/src/DebugPrint.h index b9b8d583..9aa7789b 100644 --- a/t2s/src/DebugPrint.h +++ b/t2s/src/DebugPrint.h @@ -84,6 +84,8 @@ string to_string(const set &v, bool separate = true) { } string names_to_string(const vector &v); +string names_to_string(const vector &v); +string names_to_string(const vector &v); string to_string(const map &boxes); } diff --git a/t2s/src/FlattenLoops.cpp b/t2s/src/FlattenLoops.cpp index 892a3371..8a222584 100644 --- a/t2s/src/FlattenLoops.cpp +++ b/t2s/src/FlattenLoops.cpp @@ -1044,12 +1044,12 @@ class ReplaceMemChannel : public IRMutator { } }; -Stmt flatten_loops(Stmt s, const std::map &env, const std::map &funcs_using_mem_channels) { +Stmt replace_mem_channels(Stmt s, const std::map &env, const std::map &funcs_using_mem_channels) { map mem_addr; FindAddressesOfMemChannels finder(mem_addr, env); s.accept(&finder); ReplaceMemChannel replacer(mem_addr); - Stmt stmt0 = replacer.mutate(s); + s = replacer.mutate(s); std::set funcs; for(auto entry : env){ @@ -1057,19 +1057,31 @@ Stmt flatten_loops(Stmt s, const std::map &env, const std funcs.insert(entry.first); } } - Stmt stmt1 = remove_lets(stmt0, false, true, true, true, funcs); - debug(2) << "IR after removing LetStmts in device kernels first try ...\n\n" << stmt1 << "\n"; + s = remove_lets(s, false, true, true, true, funcs); + debug(2) << "IR after removing LetStmts in device kernels ...\n\n" << s << "\n"; + return s; +} + +Stmt flatten_loops(Stmt s, const std::map &env) { ConstLoopFlattening clf; - Stmt stmt2 = clf.mutate(stmt1); - debug(2) << "IR after const loop flattening ...\n\n" << stmt2 << "\n"; - Stmt stmt3 = remove_lets(stmt2, false, true, true, true, funcs); - debug(2) << "IR after removing LetStmts in device kernels second try...\n\n" << stmt3 << "\n"; + s = clf.mutate(s); + debug(2) << "IR after const loop flattening ...\n\n" << s << "\n"; + + std::set funcs; + for(auto entry : env){ + if (entry.second.place() == Place::Device) { + funcs.insert(entry.first); + } + } + s = remove_lets(s, false, true, true, true, funcs); + debug(2) << "IR after removing LetStmts in device kernels ...\n\n" << s << "\n"; + // DynamicLoopFlattening dlf; // Stmt stmt3 = dlf.mutate(stmt2); // debug(2) << "IR after dynamic loop flattening ...\n\n" << stmt3 << "\n"; // LoopMerging mgl; // Stmt stmt4 = mgl.mutate(stmt3); - return stmt3; + return s; } } diff --git a/t2s/src/FlattenLoops.h b/t2s/src/FlattenLoops.h index 4cffd7d7..9a2f6a93 100644 --- a/t2s/src/FlattenLoops.h +++ b/t2s/src/FlattenLoops.h @@ -34,7 +34,8 @@ using std::vector; namespace Halide { namespace Internal { -Stmt flatten_loops (Stmt s, const std::map &env, const std::map &funcs_using_mem_channels); +Stmt replace_mem_channels(Stmt s, const std::map &env, const std::map &funcs_using_mem_channels); +Stmt flatten_loops(Stmt s, const std::map &env); } } diff --git a/t2s/src/Gather.cpp b/t2s/src/Gather.cpp index 02bf5922..fd49de01 100644 --- a/t2s/src/Gather.cpp +++ b/t2s/src/Gather.cpp @@ -180,7 +180,7 @@ class TestGathering : public IRVisitor{ void visit(const Shuffle *op) override{ if (in_gather_func && !op->vectors.empty()){ const Call* arg0 = op->vectors[0].as(); - if(arg0->is_intrinsic(Call::read_channel)){ + if(arg0 && arg0->is_intrinsic(Call::read_channel)){ const StringImm* channel_name = arg0->args[0].as(); assert(channel_name); if(ends_with(channel_name->value, func_name + 
".channel.0")){ diff --git a/t2s/src/MinimizeShregs.cpp b/t2s/src/MinimizeShregs.cpp index f2345ac4..f0426086 100644 --- a/t2s/src/MinimizeShregs.cpp +++ b/t2s/src/MinimizeShregs.cpp @@ -163,12 +163,14 @@ class ShiftRegAccessCollector : public IRVisitor { IRVisitor::visit(op); if (op->is_intrinsic(Call::write_shift_reg)) { const StringImm *v = op->args[0].as(); + internal_assert(v); string func_name = remove_postfix(v->value, ".shreg"); vector args = {op->args.begin() + 1, op->args.end() - 1}; Access access = {true, func_name, args, op->type}; accesses.push_back(access); } else if (op->is_intrinsic(Call::read_shift_reg)) { const StringImm *v = op->args[0].as(); + internal_assert(v); string func_name = remove_postfix(v->value, ".shreg"); vector args = {op->args.begin() + 1, op->args.end()}; Access access = {false, func_name, args, op->type}; @@ -377,6 +379,7 @@ bool vector_is_negative(const vector &v) { // Test if the var is one of the vars. bool var_is_member(const Expr &var, const vector &vars) { + internal_assert(var.as()); for (auto &v : vars) { if (v.as()->name == var.as()->name) { return true; @@ -758,6 +761,7 @@ vector map_args(const string &func_name, for (size_t i = 0; i < alloc.linearized_dims.size(); i++) { auto &z = alloc.linearized_dims[i]; int lin_distance = linearized_distance(alloc.args, distance, z, loop_extents); + internal_assert(alloc.linearized_extents[i].as()); int lin_extent = alloc.linearized_extents[i].as()->value; internal_assert(lin_distance >= 0 && lin_distance <= lin_extent); if (lin_distance == lin_extent) { @@ -840,6 +844,7 @@ void get_new_args_and_extents(const map &loop_extents, } else { for (size_t i = 0; i < alloc.PE_dims.size(); i++) { int index = alloc.PE_dims[i]; + internal_assert(alloc.args[index].as()); const string &original_loop_name = alloc.args[index].as()->name; std::string new_loop_name = unique_name("dummy") + ".s0." 
+ extract_last_token(original_loop_name); new_args.push_back(Variable::make(Int(32), new_loop_name)); @@ -879,6 +884,7 @@ void shift_linearized_dim(const For *op, // T // Get the dimension of the given loop int current_dim = -1; for (size_t i = 0; i < args.size(); i++) { + internal_assert(args[i].as()); if (op->name == args[i].as()->name) { current_dim = i; break; @@ -985,6 +991,7 @@ void shift_linearized_dim(const For *op, // T // Add PE loops size_t start_of_PE_dims = outer_args.size() - alloc.PE_dims.size(); // Start position of PE_dims in outer_args for (size_t i = start_of_PE_dims; i < outer_args.size(); i++) { + internal_assert(outer_args[i].as()); const string &new_loop_name = outer_args[i].as()->name; const Expr &min = loop_min(alloc.args, loop_mins, alloc.PE_dims[i - start_of_PE_dims]); const Expr &extent = loop_extent(alloc.args, loop_extents, alloc.PE_dims[i - start_of_PE_dims]); @@ -1105,6 +1112,7 @@ class MinimizeShiftRegs : public IRMutator { Expr visit(const Call *op) override { if (op->is_intrinsic(Call::write_shift_reg)) { const StringImm *v = op->args[0].as(); + internal_assert(v); string func_name = remove_postfix(v->value, ".shreg"); if (func_to_regalloc.find(func_name) == func_to_regalloc.end()) { return IRMutator::visit(op); @@ -1118,6 +1126,7 @@ class MinimizeShiftRegs : public IRMutator { return new_call; } else if (op->is_intrinsic(Call::read_shift_reg)) { const StringImm *v = op->args[0].as(); + internal_assert(v); string func_name = remove_postfix(v->value, ".shreg"); if (func_to_regalloc.find(func_name) == func_to_regalloc.end()) { return IRMutator::visit(op); @@ -1207,6 +1216,7 @@ class RemoveUnitBoundsOfShiftRegs : public IRMutator { Expr visit(const Call *op) override { if (op->is_intrinsic(Call::write_shift_reg)) { const StringImm *v = op->args[0].as(); + internal_assert(v); string func_name = remove_postfix(v->value, ".shreg"); if (func_to_bounds_to_remove.find(func_name) == func_to_bounds_to_remove.end()) { return IRMutator::visit(op); @@ -1221,6 +1231,7 @@ class RemoveUnitBoundsOfShiftRegs : public IRMutator { return new_call; } else if (op->is_intrinsic(Call::read_shift_reg)) { const StringImm *v = op->args[0].as(); + internal_assert(v); string func_name = remove_postfix(v->value, ".shreg"); if (func_to_bounds_to_remove.find(func_name) == func_to_bounds_to_remove.end()) { return IRMutator::visit(op); diff --git a/t2s/src/Overlay.cpp b/t2s/src/Overlay.cpp index ec9cc2da..0328fb20 100644 --- a/t2s/src/Overlay.cpp +++ b/t2s/src/Overlay.cpp @@ -331,14 +331,17 @@ class OverlayIntrinsicUpdating : public IRMutator { Stmt visit(const Store *op) override { auto in = op->value.as(); - user_assert(in->is_intrinsic(Call::overlay)); + internal_assert(in); + internal_assert(in->is_intrinsic(Call::overlay)); type = in->type; output_task_name = op->name; index = op->index; // Extract expected arguments + internal_assert(in->args[0].as()); + internal_assert(in->args[1].as()); int task_id = in->args[0].as()->value; int queue = in->args[1].as()->value; - user_assert(in->args.size() > 1); + internal_assert(in->args.size() > 1); for (int i = 2; i < (signed)in->args.size(); i++) { expected_args.push_back(in->args[i]); @@ -349,7 +352,7 @@ class OverlayIntrinsicUpdating : public IRMutator { } // Assert the output buffer is passed in - user_assert(output_buffer.defined()); + internal_assert(output_buffer.defined()); // Create insertion points // E.g. 
@@ -385,7 +388,7 @@ class OverlayIntrinsicUpdating : public IRMutator { // Push back distance int index = 0; - user_assert(dep_info.distances.size() == loop_vars.size()); + internal_assert(dep_info.distances.size() == loop_vars.size()); for (auto &v : dep_info.distances) { Expr expr = simplify(v + loop_vars[index]); task_args.push_back(expr); @@ -574,7 +577,7 @@ class OverlayIntrinsicUpdating : public IRMutator { // Analyze the assignment (ordered) map // binding: Map from command queue index --> map of arg assignment // arg_map: vector of enqueued buffers - // user_assert(assignment.size() == arg_map.size()); + // internal_assert(assignment.size() == arg_map.size()); for (unsigned int task_id = 0; task_id < arg_map.size(); task_id++) { debug(4) << "\nTask " << task_id << " Assignment..."; @@ -628,7 +631,7 @@ class OverlayIntrinsicUpdating : public IRMutator { // later be used to create overlay instrinsic args) } else if (key.find(".") != string::npos) { ImageParamOrExpr param = buffer_or_expr[arg_pos[key]]; - user_assert(!param.is_image); + internal_assert(!param.is_image); Expr e = param.expr; debug(4) << " Set Scalar " << key << " as " << e << " labeled as " << kv.second << "\n"; @@ -813,7 +816,7 @@ class SubstituteArgs : public IRMutator { } // The last left buffer is for function output - user_assert((unsigned)buffer_index == buffers.size() - 1); + internal_assert((unsigned)buffer_index == buffers.size() - 1); auto buffer_name = buffers[buffer_index]; arg_map[buffer_name] = "inputs.args" + std::to_string(buffer_index); debug(4) << "Assign buffer index " << arg_map[buffer_name] @@ -856,7 +859,7 @@ class SubstituteArgs : public IRMutator { // Check all the hidden args have been assigned int hidden_args_num = buffers.size() + mins.size() + extents.size() + strides.size(); int assigned_args_num = (buffer_index + 1) + (constant_index - scalar_index); - user_assert(hidden_args_num == assigned_args_num) + internal_assert(hidden_args_num == assigned_args_num) << "Required " << hidden_args_num << " hidden parameters (" << assigned_args_num << " given)\n"; } @@ -903,7 +906,7 @@ Stmt MakeAutorunKernel(Stmt s, vector sym_args, // as parameters passed from scheduler // unsigned int num_of_buffers = buffers.size(); - // user_assert(num_of_buffers == sym_args.size() + 1) + // internal_assert(num_of_buffers == sym_args.size() + 1) // << "Missing arguments at command()... " << (num_of_buffers - 1) // << " arguments required but " << sym_args.size() << " given\n"; @@ -1205,14 +1208,14 @@ Stmt create_overlay_schedule(Stmt s, const std::map &env) // Argument Inference and Body Mutation // 1. Add task receiver at the beginning of the kernel // 2. 
Add ack sender at the end of the kernel - user_assert(sym_args.size() != 0); - user_assert(curr_queue_index != -1); + internal_assert(sym_args.size() != 0); + internal_assert(curr_queue_index != -1); map arg_map; map arg_pos; IpFuncBodyUpdate update(in.inferred_args, overlay_kenrel_name, curr_queue_index, sym_args, arg_map, arg_pos); s = update.mutate(s); - user_assert(env.count(env_key_name)); + internal_assert(env.count(env_key_name)); env.at(env_key_name).overlay().definition().assignMap()[curr_queue_index].arg_map = arg_map; env.at(env_key_name).overlay().definition().assignMap()[curr_queue_index].arg_pos = arg_pos; diff --git a/t2s/src/PatternMatcher.cpp b/t2s/src/PatternMatcher.cpp new file mode 100644 index 00000000..ea275527 --- /dev/null +++ b/t2s/src/PatternMatcher.cpp @@ -0,0 +1,213 @@ +/******************************************************************************* +* Copyright 2021 Intel Corporation +* +* Licensed under the BSD-2-Clause Plus Patent License (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* https://opensource.org/licenses/BSDplusPatent +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions +* and limitations under the License. +* +* +* SPDX-License-Identifier: BSD-2-Clause-Patent +*******************************************************************************/ +#include "../../Halide/src/IRMutator.h" +#include "../../Halide/src/IRVisitor.h" +#include "../../Halide/src/Simplify.h" +#include "../../Halide/src/IREquality.h" +#include "./PatternMatcher.h" +#include "./Utilities.h" + +namespace Halide { +namespace Internal { + +using std::string; +using std::vector; + +/* The original inner product operation is expressed in UREs as follows: Z(k, ...) = select(k == 0, 0, Z(k-1, ...)) + A * B + * The guarding condition (k == 0) is tested at each iteration, which may confuse the backend compilers to find an optimized IP. + * We automatically detect such pattern and eliminate the guarding condition. Specifially, the lowered code seems like: + * Z.temp = 0 // (1) This temporary variable is used for reduction and initialized with 0 (true value in select) + * for (k, 0, K) { + * Z.temp = Z.temp + A * B // (2) The inner product operation. + * } + * Z(0, ...) = Z.temp // Write back. After MinimizeShregs phase, only one register is allocated for reduction. + */ +class InnerProductMatcher : public IRMutator +{ + struct InnerProduct { + string name; // The temporary variable used for reduction + string sink_loop; // Move the initial part outside of this loop + Type type; // Type of the variable + Expr init_value; // Expr to initialize temporary variable (1) + Expr update_value; // Expr to update temporary variable (2) + const Call *ori_call; // The original write_shift_reg call for inner product + }; + vector loops; + vector inner_products; + vector> allocs; + Stmt update; // Stmt to replace write_shift_reg call (passed to enclosing Evaluate node) + + bool find_inner_product(string w_name, Expr w_value, vector w_dims) { + // An inner product usually contains Add node whose lhs is a select + auto add = w_value.as(); + auto sel = add ? 
add->a.as<Select>()) {
        printf("is select\n");
    } else if (op->as<Shuffle>()) {
        printf("is shuffle\n");
    } else if (op->as<Ramp>()) {
        printf("is ramp\n");
    } else if (op->as<Load>()) {
        printf("is load\n");
    } else if (op->as<Store>()) {
        printf("is store\n");
    } else if (op->as<Broadcast>()) {
        printf("is broadcast\n");
    } else if (op->as<Let>()) {
        printf("is let\n");
    } else if (op->as<Variable>()) {
        printf("is variable\n");
    } else {
        printf("don't know this ir\n");
    }
}

#endif
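As context for the new PatternMatcher pass above: the InnerProductMatcher comment describes recognizing a guarded reduction URE of the form select(k == 0, 0, Z(k-1, ...)) + A * B. The following is a minimal, self-contained sketch of that shape test only, written against Halide's public IR nodes. It is not the actual implementation; the helper name looks_like_guarded_reduction and the variable names k, a, b, Z_prev are made up for illustration, and it assumes the is_zero/is_const helpers present in this Halide tree (is_one is already used by ChannelPromotion.cpp). The real pass additionally matches the write_shift_reg intrinsics, records the temporary reduction variable, and chooses the loop above which the initialization is sunk.

    #include <cstdio>
    #include "Halide.h"
    using namespace Halide;
    using namespace Halide::Internal;

    // Returns true if e has the shape select(<var> == 0, <init>, <prev>) + <mul>,
    // i.e. the guarded inner-product update that the pass wants to simplify.
    bool looks_like_guarded_reduction(const Expr &e) {
        const Add *add = e.as<Add>();
        if (!add) return false;
        const Select *sel = add->a.as<Select>();
        if (!sel) return false;
        const EQ *eq = sel->condition.as<EQ>();
        if (!eq) return false;
        // One side of the equality should be the reduction loop variable,
        // the other side the constant 0.
        bool var_eq_zero = (eq->a.as<Variable>() && is_zero(eq->b)) ||
                           (eq->b.as<Variable>() && is_zero(eq->a));
        return var_eq_zero && add->b.as<Mul>() != nullptr;
    }

    int main() {
        Expr k = Variable::make(Int(32), "k");
        Expr a = Variable::make(Float(32), "a");
        Expr b = Variable::make(Float(32), "b");
        Expr prev = Variable::make(Float(32), "Z_prev");
        // Z(k, ...) = select(k == 0, 0, Z(k-1, ...)) + A * B, flattened to an Expr.
        Expr e = Select::make(k == 0, 0.0f, prev) + a * b;
        printf("%d\n", looks_like_guarded_reduction(e)); // expected: 1
        return 0;
    }

Keeping the test purely structural mirrors why the guard can be dropped: once the select's true branch is known to be the reduction's initial value, the update can be rewritten as a plain accumulation into a temporary register, initialized once above the reduction loop.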