From 85e8c692ce688c81283c79d2c668a2b8e09c2e7c Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Mon, 25 Jul 2022 13:14:59 -0400 Subject: [PATCH 01/55] create VectorIntrinsic node --- src/Bounds.cpp | 5 +++++ src/CodeGen_LLVM.cpp | 4 ++++ src/CodeGen_LLVM.h | 1 + src/Expr.h | 1 + src/IR.cpp | 19 +++++++++++++++++++ src/IR.h | 9 +++++++++ src/IREquality.cpp | 8 ++++++++ src/IRMutator.cpp | 8 ++++++++ src/IRMutator.h | 1 + src/IRPrinter.cpp | 10 ++++++++++ src/IRPrinter.h | 1 + src/IRVisitor.cpp | 12 ++++++++++++ src/IRVisitor.h | 5 +++++ src/Simplify_Exprs.cpp | 5 +++++ src/Simplify_Internal.h | 1 + src/StmtToHtml.cpp | 7 +++++++ 16 files changed, 97 insertions(+) diff --git a/src/Bounds.cpp b/src/Bounds.cpp index eb2f9138268b..107bd04185c4 100644 --- a/src/Bounds.cpp +++ b/src/Bounds.cpp @@ -1110,6 +1110,11 @@ class Bounds : public IRVisitor { op->value.accept(this); } + void visit(const VectorIntrinsic *op) override { + // TODO(rootjalex): we may need to implement bounds queries. + internal_error << "Unexpected VectorIntrinsic in bounds query: " << Expr(op) << "\n"; + } + void visit(const Call *op) override { TRACK_BOUNDS_INTERVAL; TRACK_BOUNDS_INFO("name:", op->name); diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp index 938d7a0e23b6..4990ba6e78c1 100644 --- a/src/CodeGen_LLVM.cpp +++ b/src/CodeGen_LLVM.cpp @@ -4000,6 +4000,10 @@ void CodeGen_LLVM::visit(const Shuffle *op) { } } +void CodeGen_LLVM::visit(const VectorIntrinsic *op) { + internal_error << "CodeGen_LLVM received VectorIntrinsic node, should be handled by architecture-specific CodeGen class:\n" << Expr(op) << "\n"; +} + void CodeGen_LLVM::visit(const VectorReduce *op) { codegen_vector_reduce(op, Expr()); } diff --git a/src/CodeGen_LLVM.h b/src/CodeGen_LLVM.h index 5982aa1672fd..7c5078428431 100644 --- a/src/CodeGen_LLVM.h +++ b/src/CodeGen_LLVM.h @@ -362,6 +362,7 @@ class CodeGen_LLVM : public IRVisitor { void visit(const IfThenElse *) override; void visit(const Evaluate *) override; void visit(const Shuffle *) override; + void visit(const VectorIntrinsic *) override; void visit(const VectorReduce *) override; void visit(const Prefetch *) override; void visit(const Atomic *) override; diff --git a/src/Expr.h b/src/Expr.h index ac0ec6521d68..aaab7dac23be 100644 --- a/src/Expr.h +++ b/src/Expr.h @@ -57,6 +57,7 @@ enum class IRNodeType { Call, Let, Shuffle, + VectorIntrinsic, VectorReduce, // Stmts LetStmt, diff --git a/src/IR.cpp b/src/IR.cpp index 740234b8e31f..776a8da806ee 100644 --- a/src/IR.cpp +++ b/src/IR.cpp @@ -901,6 +901,17 @@ Stmt Atomic::make(const std::string &producer_name, return node; } +Expr VectorIntrinsic::make(Type type, const std::string &name, const std::vector<Expr> &args) { + user_assert(!name.empty()) << "VectorIntrinsic without a name\n"; + user_assert(!args.empty()) << "VectorIntrinsic without arguments\n"; + + VectorIntrinsic *node = new VectorIntrinsic; + node->type = type; + node->name = name; + node->args = args; + return node; +} + Expr VectorReduce::make(VectorReduce::Operator op, Expr vec, int lanes) { @@ -1081,6 +1092,10 @@ void ExprNode<Shuffle>::accept(IRVisitor *v) const { v->visit((const Shuffle *)this); } template<> +void ExprNode<VectorIntrinsic>::accept(IRVisitor *v) const { + v->visit((const VectorIntrinsic *)this); +} +template<> void ExprNode<VectorReduce>::accept(IRVisitor *v) const { v->visit((const VectorReduce *)this); } @@ -1270,6 +1285,10 @@ Expr ExprNode<Shuffle>::mutate_expr(IRMutator *v) const { return v->visit((const Shuffle *)this); } template<> +Expr ExprNode<VectorIntrinsic>::mutate_expr(IRMutator *v) const { return 
v->visit((const VectorIntrinsic *)this); +} +template<> Expr ExprNode<VectorReduce>::mutate_expr(IRMutator *v) const { return v->visit((const VectorReduce *)this); } diff --git a/src/IR.h b/src/IR.h index c6085614b59d..d732d1a43ea7 100644 --- a/src/IR.h +++ b/src/IR.h @@ -886,6 +886,15 @@ struct Atomic : public StmtNode<Atomic> { static const IRNodeType _node_type = IRNodeType::Atomic; }; +struct VectorIntrinsic : public ExprNode<VectorIntrinsic> { + std::string name; + std::vector<Expr> args; + + static Expr make(Type type, const std::string &name, const std::vector<Expr> &args); + + static const IRNodeType _node_type = IRNodeType::VectorIntrinsic; +}; + /** Horizontally reduce a vector to a scalar or narrower vector using * the given commutative and associative binary operator. The reduction * factor is dictated by the number of lanes in the input and output diff --git a/src/IREquality.cpp b/src/IREquality.cpp index 20cb616d2c32..15a9bc01cbbb 100644 --- a/src/IREquality.cpp +++ b/src/IREquality.cpp @@ -98,6 +98,7 @@ class IRComparer : public IRVisitor { void visit(const Shuffle *) override; void visit(const Prefetch *) override; void visit(const Atomic *) override; + void visit(const VectorIntrinsic *) override; void visit(const VectorReduce *) override; }; @@ -629,6 +630,13 @@ void IRComparer::visit(const Atomic *op) { compare_stmt(s->body, op->body); } +void IRComparer::visit(const VectorIntrinsic *op) { + const VectorIntrinsic *e = expr.as<VectorIntrinsic>(); + + compare_names(e->name, op->name); + compare_expr_vector(e->args, op->args); +} + void IRComparer::visit(const VectorReduce *op) { const VectorReduce *e = expr.as<VectorReduce>(); diff --git a/src/IRMutator.cpp b/src/IRMutator.cpp index 005937a17008..e075897d6694 100644 --- a/src/IRMutator.cpp +++ b/src/IRMutator.cpp @@ -327,6 +327,14 @@ Expr IRMutator::visit(const Shuffle *op) { return Shuffle::make(new_vectors, op->indices); } +Expr IRMutator::visit(const VectorIntrinsic *op) { + auto [new_args, changed] = mutate_with_changes(op->args); + if (!changed) { + return op; + } + return VectorIntrinsic::make(op->type, op->name, new_args); +} + Expr IRMutator::visit(const VectorReduce *op) { Expr value = mutate(op->value); if (value.same_as(op->value)) { diff --git a/src/IRMutator.h b/src/IRMutator.h index c7a1984269d3..e460b036b80f 100644 --- a/src/IRMutator.h +++ b/src/IRMutator.h @@ -81,6 +81,7 @@ class IRMutator { virtual Expr visit(const Call *); virtual Expr visit(const Let *); virtual Expr visit(const Shuffle *); + virtual Expr visit(const VectorIntrinsic *); virtual Expr visit(const VectorReduce *); virtual Stmt visit(const LetStmt *); diff --git a/src/IRPrinter.cpp b/src/IRPrinter.cpp index 38f57e46649e..f609f28763fd 100644 --- a/src/IRPrinter.cpp +++ b/src/IRPrinter.cpp @@ -1073,6 +1073,16 @@ void IRPrinter::visit(const Shuffle *op) { } } +void IRPrinter::visit(const VectorIntrinsic *op) { + stream << "(" + << op->type + << ")vector_intrinsic(\"" + << op->name + << "\", "; + print_list(op->args); + stream << ")"; +} + void IRPrinter::visit(const VectorReduce *op) { stream << "(" << op->type diff --git a/src/IRPrinter.h b/src/IRPrinter.h index 666235988cd7..1e7cc048b805 100644 --- a/src/IRPrinter.h +++ b/src/IRPrinter.h @@ -194,6 +194,7 @@ class IRPrinter : public IRVisitor { void visit(const IfThenElse *) override; void visit(const Evaluate *) override; void visit(const Shuffle *) override; + void visit(const VectorIntrinsic *) override; void visit(const VectorReduce *) override; void visit(const Prefetch *) override; void visit(const Atomic *) override; diff --git a/src/IRVisitor.cpp 
b/src/IRVisitor.cpp index 7f9993987200..3b1956a51d8a 100644 --- a/src/IRVisitor.cpp +++ b/src/IRVisitor.cpp @@ -257,6 +257,12 @@ void IRVisitor::visit(const Shuffle *op) { } } +void IRVisitor::visit(const VectorIntrinsic *op) { + for (const auto &arg : op->args) { + arg.accept(this); + } +} + void IRVisitor::visit(const VectorReduce *op) { op->value.accept(this); } @@ -515,6 +521,12 @@ void IRGraphVisitor::visit(const Shuffle *op) { } } +void IRGraphVisitor::visit(const VectorIntrinsic *op) { + for (const auto &arg : op->args) { + include(arg); + } +} + void IRGraphVisitor::visit(const VectorReduce *op) { include(op->value); } diff --git a/src/IRVisitor.h b/src/IRVisitor.h index 4e1650ff22be..c9c170dd851d 100644 --- a/src/IRVisitor.h +++ b/src/IRVisitor.h @@ -71,6 +71,7 @@ class IRVisitor { virtual void visit(const IfThenElse *); virtual void visit(const Evaluate *); virtual void visit(const Shuffle *); + virtual void visit(const VectorIntrinsic *); virtual void visit(const VectorReduce *); virtual void visit(const Prefetch *); virtual void visit(const Fork *); @@ -142,6 +143,7 @@ class IRGraphVisitor : public IRVisitor { void visit(const IfThenElse *) override; void visit(const Evaluate *) override; void visit(const Shuffle *) override; + void visit(const VectorIntrinsic *) override; void visit(const VectorReduce *) override; void visit(const Prefetch *) override; void visit(const Acquire *) override; @@ -224,6 +226,8 @@ class VariadicVisitor { return ((T *)this)->visit((const Let *)node, std::forward(args)...); case IRNodeType::Shuffle: return ((T *)this)->visit((const Shuffle *)node, std::forward(args)...); + case IRNodeType::VectorIntrinsic: + return ((T *)this)->visit((const VectorIntrinsic *)node, std::forward(args)...); case IRNodeType::VectorReduce: return ((T *)this)->visit((const VectorReduce *)node, std::forward(args)...); // Explicitly list the Stmt types rather than using a @@ -286,6 +290,7 @@ class VariadicVisitor { case IRNodeType::Call: case IRNodeType::Let: case IRNodeType::Shuffle: + case IRNodeType::VectorIntrinsic: case IRNodeType::VectorReduce: internal_error << "Unreachable"; break; diff --git a/src/Simplify_Exprs.cpp b/src/Simplify_Exprs.cpp index 574754686cc6..5aefebfa611f 100644 --- a/src/Simplify_Exprs.cpp +++ b/src/Simplify_Exprs.cpp @@ -59,6 +59,11 @@ Expr Simplify::visit(const Broadcast *op, ExprInfo *bounds) { } } +Expr Simplify::visit(const VectorIntrinsic *op, ExprInfo *bounds) { + clear_bounds_info(bounds); + return op; +} + Expr Simplify::visit(const VectorReduce *op, ExprInfo *bounds) { Expr value = mutate(op->value, bounds); diff --git a/src/Simplify_Internal.h b/src/Simplify_Internal.h index a510e5c51f64..7dae150042a4 100644 --- a/src/Simplify_Internal.h +++ b/src/Simplify_Internal.h @@ -333,6 +333,7 @@ class Simplify : public VariadicVisitor { Expr visit(const Load *op, ExprInfo *bounds); Expr visit(const Call *op, ExprInfo *bounds); Expr visit(const Shuffle *op, ExprInfo *bounds); + Expr visit(const VectorIntrinsic *op, ExprInfo *bounds); Expr visit(const VectorReduce *op, ExprInfo *bounds); Expr visit(const Let *op, ExprInfo *bounds); Stmt visit(const LetStmt *op); diff --git a/src/StmtToHtml.cpp b/src/StmtToHtml.cpp index 21bc74dd20ac..1f5b1d20ccff 100644 --- a/src/StmtToHtml.cpp +++ b/src/StmtToHtml.cpp @@ -712,6 +712,13 @@ class StmtToHtml : public IRVisitor { stream << close_span(); } + void visit(const VectorIntrinsic *op) override { + stream << open_span("VectoIntrinsic"); + stream << open_span("Type") << op->type << close_span(); + 
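// (Illustrative note, not part of the patch: with the printer visitors
// above, an expression built as, e.g.,
//     Expr e = VectorIntrinsic::make(Int(32, 8), "dot_product", {a, b});
// renders as
//     (int32x8)vector_intrinsic("dot_product", a, b)
// where a and b stand in for arbitrary in-scope vector Exprs.)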
print_list(symbol("vector_intrinsic") + "(\"" + op->name + "\"", op->args, ")"); + stream << close_span(); + } + void visit(const VectorReduce *op) override { stream << open_span("VectorReduce"); stream << open_span("Type") << op->type << close_span(); From f0931c69796c9ebe5745d9105bc6c838a94dc070 Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Mon, 25 Jul 2022 13:15:30 -0400 Subject: [PATCH 02/55] update IRMatch for VectorIntrinsic node --- src/IRMatch.cpp | 19 +++++ src/IRMatch.h | 188 +++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 197 insertions(+), 10 deletions(-) diff --git a/src/IRMatch.cpp b/src/IRMatch.cpp index 6aba3155777f..f7bb7d457ff1 100644 --- a/src/IRMatch.cpp +++ b/src/IRMatch.cpp @@ -296,6 +296,22 @@ class IRMatch : public IRVisitor { } } + void visit(const VectorIntrinsic *op) override { + const VectorIntrinsic *e = expr.as(); + if (result && e && + types_match(op->type, e->type) && + e->name == op->name && + e->args.size() == op->args.size()) { + for (size_t i = 0; result && (i < e->args.size()); i++) { + // FIXME: should we early-out? Here and in Call* + expr = e->args[i]; + op->args[i].accept(this); + } + } else { + result = false; + } + } + void visit(const VectorReduce *op) override { const VectorReduce *e = expr.as(); if (result && e && op->op == e->op && types_match(op->type, e->type)) { @@ -505,6 +521,9 @@ bool equal_helper(const BaseExprNode &a, const BaseExprNode &b) noexcept { case IRNodeType::Shuffle: return (equal_helper(((const Shuffle &)a).vectors, ((const Shuffle &)b).vectors) && equal_helper(((const Shuffle &)a).indices, ((const Shuffle &)b).indices)); + case IRNodeType::VectorIntrinsic: + return (((const VectorIntrinsic &)a).name == ((const VectorIntrinsic &)b).name && + equal_helper(((const VectorIntrinsic &)a).args, ((const VectorIntrinsic &)b).args)); case IRNodeType::VectorReduce: // As with Cast above, we use equal instead of equal_helper // here, because while we know a.type == b.type, we don't know diff --git a/src/IRMatch.h b/src/IRMatch.h index 756b900e1f4d..089521c5bfed 100644 --- a/src/IRMatch.h +++ b/src/IRMatch.h @@ -1445,6 +1445,10 @@ struct Intrin { return rounding_shift_left(arg0, arg1); } else if (intrin == Call::rounding_shift_right) { return rounding_shift_right(arg0, arg1); + } else if (intrin == Call::bitwise_xor) { + return arg0 ^ arg1; + } else if (intrin == Call::bitwise_and) { + return arg0 & arg1; } Expr arg2 = std::get(args).make(state, type_hint); @@ -1521,6 +1525,14 @@ HALIDE_ALWAYS_INLINE auto intrin(Call::IntrinsicOp intrinsic_op, Args... 
args) noexcept -> Intrin<Args...> { return {intrinsic_op, pattern_arg(args)...}; } +template<typename A> +auto abs(A &&a) noexcept -> Intrin<A> { return {Call::abs, pattern_arg(a)}; } +template<typename A, typename B> +auto absd(A &&a, B &&b) noexcept -> Intrin<A, B> { return {Call::absd, pattern_arg(a), pattern_arg(b)}; } template<typename A, typename B> auto widening_add(A &&a, B &&b) noexcept -> Intrin<A, B> { return {Call::widening_add, pattern_arg(a), pattern_arg(b)}; } @@ -1569,6 +1581,36 @@ template<typename A, typename B> auto rounding_shift_right(A &&a, B &&b) noexcept -> Intrin<A, B> { return {Call::rounding_shift_right, pattern_arg(a), pattern_arg(b)}; } +template<typename A, typename B> +auto bitwise_xor(A &&a, B &&b) noexcept -> Intrin<A, B> { + return {Call::bitwise_xor, pattern_arg(a), pattern_arg(b)}; +} +template<typename A, typename B> +HALIDE_ALWAYS_INLINE auto operator^(A &&a, B &&b) noexcept -> auto { + assert_is_lvalue_if_expr<A>(); + assert_is_lvalue_if_expr<B>(); + return bitwise_xor(a, b); +} +template<typename A, typename B> +auto bitwise_and(A &&a, B &&b) noexcept -> Intrin<A, B> { + return {Call::bitwise_and, pattern_arg(a), pattern_arg(b)}; +} +template<typename A, typename B> +HALIDE_ALWAYS_INLINE auto operator&(A &&a, B &&b) noexcept -> auto { + assert_is_lvalue_if_expr<A>(); + assert_is_lvalue_if_expr<B>(); + return bitwise_and(a, b); +} +template<typename A, typename B> +auto bitwise_or(A &&a, B &&b) noexcept -> Intrin<A, B> { + return {Call::bitwise_or, pattern_arg(a), pattern_arg(b)}; +} +template<typename A, typename B> +HALIDE_ALWAYS_INLINE auto operator|(A &&a, B &&b) noexcept -> auto { + assert_is_lvalue_if_expr<A>(); + assert_is_lvalue_if_expr<B>(); + return bitwise_or(a, b); +} template<typename A, typename B, typename C> auto mul_shift_right(A &&a, B &&b, C &&c) noexcept -> Intrin<A, B, C> { return {Call::mul_shift_right, pattern_arg(a), pattern_arg(b), pattern_arg(c)}; } @@ -1839,6 +1881,109 @@ HALIDE_ALWAYS_INLINE auto ramp(A &&a, B &&b, C &&c) noexcept -> RampOp<A, B, C> +template<typename... Args> +struct VectorIntrinOp { + struct pattern_tag {}; + const std::string &intrin_name; + std::tuple<Args...> args; + + static constexpr uint32_t binds = bitwise_or_reduce((bindings<Args>::mask)...); + + constexpr static IRNodeType min_node_type = IRNodeType::VectorIntrinsic; + constexpr static IRNodeType max_node_type = IRNodeType::VectorIntrinsic; + constexpr static bool canonical = and_reduce((Args::canonical)...); + + template<int i, uint32_t bound, typename = typename std::enable_if<(i < sizeof...(Args))>::type> + HALIDE_ALWAYS_INLINE bool match_args(int, const VectorIntrinsic &v, MatcherState &state) const noexcept { + using T = decltype(std::get<i>(args)); + return (std::get<i>(args).template match<bound>(*v.args[i].get(), state) && + match_args<i + 1, bound | bindings<T>::mask>(0, v, state)); + } + + template<int i, uint32_t bound> + HALIDE_ALWAYS_INLINE bool match_args(double, const VectorIntrinsic &v, MatcherState &state) const noexcept { + return true; + } + + template<uint32_t bound> + HALIDE_ALWAYS_INLINE bool match(const BaseExprNode &e, MatcherState &state) const noexcept { + if (e.node_type != IRNodeType::VectorIntrinsic) { + return false; + } + const VectorIntrinsic &v = (const VectorIntrinsic &)e; + return (v.name == intrin_name && match_args<0, bound>(0, v, state)); + } + + template<int i, typename = typename std::enable_if<(i < sizeof...(Args))>::type> + HALIDE_ALWAYS_INLINE void print_args(int, std::ostream &s) const { + s << std::get<i>(args); + if (i + 1 < sizeof...(Args)) { + s << ", "; + } + print_args<i + 1>(0, s); + } + + template<int i> + HALIDE_ALWAYS_INLINE void print_args(double, std::ostream &s) const { + } + + HALIDE_ALWAYS_INLINE + void print_args(std::ostream &s) const { + print_args<0>(0, s); + } + + HALIDE_ALWAYS_INLINE + Expr make(MatcherState &state, halide_type_t type_hint) const { + std::vector<Expr> r_args(sizeof...(Args)); + // TODO(rootjalex): How do we do type hints for the args? + // TODO(rootjalex): Is there a way to do basically an unrolled + // loop of the below? this is ugly. 
+ // Supposedly C++20 will have constexpr std::transform, perhaps + // we can use that when Halide upgrades. + + r_args[0] = std::get<0>(args).make(state, {}); + if constexpr (sizeof...(Args) > 1) { + r_args[1] = std::get<1>(args).make(state, {}); + } + if constexpr (sizeof...(Args) > 2) { + r_args[2] = std::get<2>(args).make(state, {}); + } + + // for (int i = 0; i < sizeof...(Args); i++) { + // // TODO(rootjalex): how do we do type-hints here? + // args[i] = std::get(args).make(state, {}); + // } + return VectorIntrinsic::make(type_hint, intrin_name, r_args); + } + + constexpr static bool foldable = false; + + HALIDE_ALWAYS_INLINE + VectorIntrinOp(const std::string &name, Args... args) noexcept + : intrin_name(name), args(args...) { + static_assert(sizeof...(Args) > 0 && sizeof...(Args) <= 3, + "VectorIntrinsicOp must have non-zero arguments, and update make() if more than 3 arguments."); + } +}; + +template +std::ostream &operator<<(std::ostream &s, const VectorIntrinOp &op) { + // TODO(rootjalex): Should we print the type? + s << "vector_intrin(\""; + s << op.intrin_name << "\", "; + op.print_args(s); + s << ")"; + return s; +} + +template +HALIDE_ALWAYS_INLINE auto v_intrin(const std::string &name, Args... args) noexcept -> VectorIntrinOp { + return {name, pattern_arg(args)...}; +} + template struct VectorReduceOp { struct pattern_tag {}; @@ -1895,6 +2040,12 @@ HALIDE_ALWAYS_INLINE auto h_add(A &&a, B lanes) noexcept -> VectorReduceOp +HALIDE_ALWAYS_INLINE auto h_satadd(A &&a, B lanes) noexcept -> VectorReduceOp { + assert_is_lvalue_if_expr(); + return {pattern_arg(a), pattern_arg(lanes)}; +} + template HALIDE_ALWAYS_INLINE auto h_min(A &&a, B lanes) noexcept -> VectorReduceOp { assert_is_lvalue_if_expr(); @@ -2273,6 +2424,8 @@ template struct IsFloat { struct pattern_tag {}; A a; + int bits; + int lanes; constexpr static uint32_t binds = bindings::mask; @@ -2287,7 +2440,7 @@ struct IsFloat { void make_folded_const(halide_scalar_value_t &val, halide_type_t &ty, MatcherState &state) const { // a is almost certainly a very simple pattern (e.g. a wild), so just inline the make method. Type t = a.make(state, {}).type(); - val.u.u64 = t.is_float(); + val.u.u64 = t.is_float() && (bits == 0 || t.bits() == bits) && (lanes == 0 || t.lanes() == lanes); ty.code = halide_type_uint; ty.bits = 1; ty.lanes = t.lanes(); @@ -2295,14 +2448,21 @@ struct IsFloat { }; template -HALIDE_ALWAYS_INLINE auto is_float(A &&a) noexcept -> IsFloat { +HALIDE_ALWAYS_INLINE auto is_float(A &&a, int bits = 0, int lanes = 0) noexcept -> IsFloat { assert_is_lvalue_if_expr(); - return {pattern_arg(a)}; + return {pattern_arg(a), bits, lanes}; } template std::ostream &operator<<(std::ostream &s, const IsFloat &op) { - s << "is_float(" << op.a << ")"; + s << "is_float(" << op.a; + if (op.bits > 0) { + s << ", " << op.bits; + } + if (op.lanes > 0) { + s << ", " << op.lanes; + } + s << ")"; return s; } @@ -2311,6 +2471,7 @@ struct IsInt { struct pattern_tag {}; A a; int bits; + int lanes; constexpr static uint32_t binds = bindings::mask; @@ -2325,7 +2486,7 @@ struct IsInt { void make_folded_const(halide_scalar_value_t &val, halide_type_t &ty, MatcherState &state) const { // a is almost certainly a very simple pattern (e.g. a wild), so just inline the make method. 
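// (Sketch of how the new lanes parameter is used, mirroring the bits
// check below: a rule such as
//     rewrite(h_add(widening_mul(x, y), lanes),
//             v_intrin("dot_product", x, y),
//             is_int(x, 16, lanes * 2) && is_int(y, 16, lanes * 2))
// now fires only when x and y are int16 vectors with exactly twice the
// output lane count; plain is_int(x, 16) would accept any lane count.)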
Type t = a.make(state, {}).type(); - val.u.u64 = t.is_int() && (bits == 0 || t.bits() == bits); + val.u.u64 = t.is_int() && (bits == 0 || t.bits() == bits) && (lanes == 0 || t.lanes() == lanes); ty.code = halide_type_uint; ty.bits = 1; ty.lanes = t.lanes(); @@ -2333,9 +2494,9 @@ struct IsInt { }; template -HALIDE_ALWAYS_INLINE auto is_int(A &&a, int bits = 0) noexcept -> IsInt { +HALIDE_ALWAYS_INLINE auto is_int(A &&a, int bits = 0, int lanes = 0) noexcept -> IsInt { assert_is_lvalue_if_expr(); - return {pattern_arg(a), bits}; + return {pattern_arg(a), bits, lanes}; } template @@ -2344,6 +2505,9 @@ std::ostream &operator<<(std::ostream &s, const IsInt &op) { if (op.bits > 0) { s << ", " << op.bits; } + if (op.lanes > 0) { + s << ", " << op.lanes; + } s << ")"; return s; } @@ -2353,6 +2517,7 @@ struct IsUInt { struct pattern_tag {}; A a; int bits; + int lanes; constexpr static uint32_t binds = bindings::mask; @@ -2367,7 +2532,7 @@ struct IsUInt { void make_folded_const(halide_scalar_value_t &val, halide_type_t &ty, MatcherState &state) const { // a is almost certainly a very simple pattern (e.g. a wild), so just inline the make method. Type t = a.make(state, {}).type(); - val.u.u64 = t.is_uint() && (bits == 0 || t.bits() == bits); + val.u.u64 = t.is_uint() && (bits == 0 || t.bits() == bits) && (lanes == 0 || t.lanes() == lanes); ty.code = halide_type_uint; ty.bits = 1; ty.lanes = t.lanes(); @@ -2375,9 +2540,9 @@ struct IsUInt { }; template -HALIDE_ALWAYS_INLINE auto is_uint(A &&a, int bits = 0) noexcept -> IsUInt { +HALIDE_ALWAYS_INLINE auto is_uint(A &&a, int bits = 0, int lanes = 0) noexcept -> IsUInt { assert_is_lvalue_if_expr(); - return {pattern_arg(a), bits}; + return {pattern_arg(a), bits, lanes}; } template @@ -2386,6 +2551,9 @@ std::ostream &operator<<(std::ostream &s, const IsUInt &op) { if (op.bits > 0) { s << ", " << op.bits; } + if (op.lanes > 0) { + s << ", " << op.lanes; + } s << ")"; return s; } From ac5b6f2b1032c2e30012236b16b0615fc67626a1 Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Mon, 25 Jul 2022 17:39:13 -0400 Subject: [PATCH 03/55] implement optimize_x86_instructions --- Makefile | 6 +- src/CMakeLists.txt | 2 + src/CodeGen_X86.cpp | 355 +------------------------- src/IRMatch.h | 33 +++ src/Lower.cpp | 8 + src/X86Optimize.cpp | 588 ++++++++++++++++++++++++++++++++++++++++++++ src/X86Optimize.h | 22 ++ 7 files changed, 664 insertions(+), 350 deletions(-) create mode 100644 src/X86Optimize.cpp create mode 100644 src/X86Optimize.h diff --git a/Makefile b/Makefile index 61d78a7cc2dd..dabec3536bab 100644 --- a/Makefile +++ b/Makefile @@ -573,7 +573,8 @@ SOURCE_FILES = \ Var.cpp \ VectorizeLoops.cpp \ WasmExecutor.cpp \ - WrapCalls.cpp + WrapCalls.cpp \ + X86Optimize.cpp # The externally-visible header files that go into making Halide.h. # Don't include anything here that includes llvm headers. 
@@ -738,7 +739,8 @@ HEADER_FILES = \ Util.h \ Var.h \ VectorizeLoops.h \ - WrapCalls.h + WrapCalls.h \ + X86Optimize.h OBJECTS = $(SOURCE_FILES:%.cpp=$(BUILD_DIR)/%.o) HEADERS = $(HEADER_FILES:%.h=$(SRC_DIR)/%.h) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index e0015a65551a..64d1a9f4316e 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -165,6 +165,7 @@ set(HEADER_FILES VectorizeLoops.h WasmExecutor.h WrapCalls.h + X86Optimize.h ) set(SOURCE_FILES @@ -342,6 +343,7 @@ set(SOURCE_FILES VectorizeLoops.cpp WasmExecutor.cpp WrapCalls.cpp + X86Optimize.cpp ) ## diff --git a/src/CodeGen_X86.cpp b/src/CodeGen_X86.cpp index 7529c31688f4..0e99407726b8 100644 --- a/src/CodeGen_X86.cpp +++ b/src/CodeGen_X86.cpp @@ -7,6 +7,7 @@ #include "LLVM_Headers.h" #include "Simplify.h" #include "Util.h" +#include "X86Optimize.h" namespace Halide { namespace Internal { @@ -21,30 +22,6 @@ using namespace llvm; namespace { -// Populate feature flags in a target according to those implied by -// existing flags, so that instruction patterns can just check for the -// oldest feature flag that supports an instruction. -Target complete_x86_target(Target t) { - if (t.has_feature(Target::AVX512_SapphireRapids)) { - t.set_feature(Target::AVX512_Cannonlake); - } - if (t.has_feature(Target::AVX512_Cannonlake)) { - t.set_feature(Target::AVX512_Skylake); - } - if (t.has_feature(Target::AVX512_Cannonlake) || - t.has_feature(Target::AVX512_Skylake) || - t.has_feature(Target::AVX512_KNL)) { - t.set_feature(Target::AVX2); - } - if (t.has_feature(Target::AVX2)) { - t.set_feature(Target::AVX); - } - if (t.has_feature(Target::AVX)) { - t.set_feature(Target::SSE41); - } - return t; -} - /** A code generator that emits x86 code from a given Halide stmt. */ class CodeGen_X86 : public CodeGen_Posix { public: @@ -69,10 +46,7 @@ class CodeGen_X86 : public CodeGen_Posix { /** Nodes for which we want to emit specific sse/avx intrinsics */ // @{ - void visit(const Add *) override; - void visit(const Sub *) override; void visit(const Cast *) override; - void visit(const Call *) override; void visit(const GT *) override; void visit(const LT *) override; void visit(const LE *) override; @@ -83,7 +57,7 @@ class CodeGen_X86 : public CodeGen_Posix { void visit(const Allocate *) override; void visit(const Load *) override; void visit(const Store *) override; - void codegen_vector_reduce(const VectorReduce *, const Expr &init) override; + void visit(const VectorIntrinsic *) override; // @} private: @@ -265,85 +239,6 @@ void CodeGen_X86::init_module() { } } -// i32(i16_a)*i32(i16_b) +/- i32(i16_c)*i32(i16_d) can be done by -// interleaving a, c, and b, d, and then using dot_product. -bool should_use_dot_product(const Expr &a, const Expr &b, vector &result) { - Type t = a.type(); - internal_assert(b.type() == t); - - if (!(t.is_int() && t.bits() == 32 && t.lanes() >= 4)) { - return false; - } - - const Call *ma = Call::as_intrinsic(a, {Call::widening_mul}); - const Call *mb = Call::as_intrinsic(b, {Call::widening_mul}); - // dot_product can't handle mixed type widening muls. - if (ma && ma->args[0].type() != ma->args[1].type()) { - return false; - } - if (mb && mb->args[0].type() != mb->args[1].type()) { - return false; - } - // If the operands are widening shifts, we might be able to treat these as - // multiplies. 
- const Call *sa = Call::as_intrinsic(a, {Call::widening_shift_left}); - const Call *sb = Call::as_intrinsic(b, {Call::widening_shift_left}); - if (sa && !is_const(sa->args[1])) { - sa = nullptr; - } - if (sb && !is_const(sb->args[1])) { - sb = nullptr; - } - if ((ma || sa) && (mb || sb)) { - Expr a0 = ma ? ma->args[0] : sa->args[0]; - Expr a1 = ma ? ma->args[1] : lossless_cast(sa->args[0].type(), simplify(make_const(sa->type, 1) << sa->args[1])); - Expr b0 = mb ? mb->args[0] : sb->args[0]; - Expr b1 = mb ? mb->args[1] : lossless_cast(sb->args[0].type(), simplify(make_const(sb->type, 1) << sb->args[1])); - if (a1.defined() && b1.defined()) { - std::vector args = {a0, a1, b0, b1}; - result.swap(args); - return true; - } - } - return false; -} - -void CodeGen_X86::visit(const Add *op) { - vector matches; - if (should_use_dot_product(op->a, op->b, matches)) { - Expr ac = Shuffle::make_interleave({matches[0], matches[2]}); - Expr bd = Shuffle::make_interleave({matches[1], matches[3]}); - value = call_overloaded_intrin(op->type, "dot_product", {ac, bd}); - if (value) { - return; - } - } - CodeGen_Posix::visit(op); -} - -void CodeGen_X86::visit(const Sub *op) { - vector matches; - if (should_use_dot_product(op->a, op->b, matches)) { - // Negate one of the factors in the second expression - Expr negative_2 = lossless_negate(matches[2]); - Expr negative_3 = lossless_negate(matches[3]); - if (negative_2.defined() || negative_3.defined()) { - if (negative_2.defined()) { - matches[2] = negative_2; - } else { - matches[3] = negative_3; - } - Expr ac = Shuffle::make_interleave({matches[0], matches[2]}); - Expr bd = Shuffle::make_interleave({matches[1], matches[3]}); - value = call_overloaded_intrin(op->type, "dot_product", {ac, bd}); - if (value) { - return; - } - } - } - CodeGen_Posix::visit(op); -} - void CodeGen_X86::visit(const GT *op) { Type t = op->a.type(); @@ -450,43 +345,12 @@ void CodeGen_X86::visit(const Select *op) { } void CodeGen_X86::visit(const Cast *op) { - if (!op->type.is_vector()) { // We only have peephole optimizations for vectors in here. CodeGen_Posix::visit(op); return; } - struct Pattern { - string intrin; - Expr pattern; - }; - - // clang-format off - static Pattern patterns[] = { - // This isn't rounding_multiply_quantzied(i16, i16, 15) because it doesn't - // saturate the result. - {"pmulhrs", i16(rounding_shift_right(widening_mul(wild_i16x_, wild_i16x_), 15))}, - - {"saturating_narrow", i16_sat(wild_i32x_)}, - {"saturating_narrow", u16_sat(wild_i32x_)}, - {"saturating_narrow", i8_sat(wild_i16x_)}, - {"saturating_narrow", u8_sat(wild_i16x_)}, - - {"f32_to_bf16", bf16(wild_f32x_)}, - }; - // clang-format on - - vector matches; - for (const Pattern &p : patterns) { - if (expr_match(p.pattern, op, matches)) { - value = call_overloaded_intrin(op->type, p.intrin, matches); - if (value) { - return; - } - } - } - if (const Call *mul = Call::as_intrinsic(op->value, {Call::widening_mul})) { if (op->value.type().bits() < op->type.bits() && op->type.bits() <= 32) { // LLVM/x86 really doesn't like 8 -> 16 bit multiplication. If we're @@ -501,216 +365,6 @@ void CodeGen_X86::visit(const Cast *op) { CodeGen_Posix::visit(op); } -void CodeGen_X86::visit(const Call *op) { - if (!op->type.is_vector()) { - // We only have peephole optimizations for vectors in here. - CodeGen_Posix::visit(op); - return; - } - - // A 16-bit mul-shift-right of less than 16 can sometimes be rounded up to a - // full 16 to use pmulh(u)w by left-shifting one of the operands. 
This is - // handled here instead of in the lowering of mul_shift_right because it's - // unlikely to be a good idea on platforms other than x86, as it adds an - // extra shift in the fully-lowered case. - if ((op->type.element_of() == UInt(16) || - op->type.element_of() == Int(16)) && - op->is_intrinsic(Call::mul_shift_right)) { - internal_assert(op->args.size() == 3); - const uint64_t *shift = as_const_uint(op->args[2]); - if (shift && *shift < 16 && *shift >= 8) { - Type narrow = op->type.with_bits(8); - Expr narrow_a = lossless_cast(narrow, op->args[0]); - Expr narrow_b = narrow_a.defined() ? Expr() : lossless_cast(narrow, op->args[1]); - int shift_left = 16 - (int)(*shift); - if (narrow_a.defined()) { - codegen(mul_shift_right(op->args[0] << shift_left, op->args[1], 16)); - return; - } else if (narrow_b.defined()) { - codegen(mul_shift_right(op->args[0], op->args[1] << shift_left, 16)); - return; - } - } - } else if (op->type.is_int() && - op->type.bits() <= 16 && - op->is_intrinsic(Call::rounding_halving_add)) { - // We can redirect signed rounding halving add to unsigned rounding - // halving add by adding 128 / 32768 to the result if the sign of the - // args differs. - internal_assert(op->args.size() == 2); - Type t = op->type.with_code(halide_type_uint); - Expr a = cast(t, op->args[0]); - Expr b = cast(t, op->args[1]); - codegen(cast(op->type, rounding_halving_add(a, b) + ((a ^ b) & (1 << (t.bits() - 1))))); - return; - } else if (op->is_intrinsic(Call::absd)) { - internal_assert(op->args.size() == 2); - if (op->args[0].type().is_uint()) { - // On x86, there are many 3-instruction sequences to compute absd of - // unsigned integers. This one consists solely of instructions with - // throughput of 3 ops per cycle on Cannon Lake. - // - // Solution due to Wojciech Mula: - // http://0x80.pl/notesen/2018-03-11-sse-abs-unsigned.html - codegen(saturating_sub(op->args[0], op->args[1]) | saturating_sub(op->args[1], op->args[0])); - return; - } else if (op->args[0].type().is_int()) { - codegen(Max::make(op->args[0], op->args[1]) - Min::make(op->args[0], op->args[1])); - return; - } - } - - struct Pattern { - string intrin; - Expr pattern; - }; - - // clang-format off - static Pattern patterns[] = { - {"pmulh", mul_shift_right(wild_i16x_, wild_i16x_, 16)}, - {"pmulh", mul_shift_right(wild_u16x_, wild_u16x_, 16)}, - {"saturating_pmulhrs", rounding_mul_shift_right(wild_i16x_, wild_i16x_, 15)}, - }; - // clang-format on - - vector matches; - for (const auto &pattern : patterns) { - if (expr_match(pattern.pattern, op, matches)) { - value = call_overloaded_intrin(op->type, pattern.intrin, matches); - if (value) { - return; - } - } - } - - CodeGen_Posix::visit(op); -} - -void CodeGen_X86::codegen_vector_reduce(const VectorReduce *op, const Expr &init) { - if (op->op != VectorReduce::Add && op->op != VectorReduce::SaturatingAdd) { - CodeGen_Posix::codegen_vector_reduce(op, init); - return; - } - const int factor = op->value.type().lanes() / op->type.lanes(); - - struct Pattern { - VectorReduce::Operator reduce_op; - int factor; - Expr pattern; - const char *intrin; - Type narrow_type; - uint32_t flags = 0; - enum { - CombineInit = 1 << 0, - SwapOperands = 1 << 1, - SingleArg = 1 << 2, - }; - }; - // clang-format off - // These patterns are roughly sorted "best to worst", in case there are two - // patterns that match the expression. 
- static const Pattern patterns[] = { - // 4-way dot products - {VectorReduce::Add, 4, i32(widening_mul(wild_u8x_, wild_i8x_)), "dot_product", {}, Pattern::CombineInit}, - {VectorReduce::Add, 4, i32(widening_mul(wild_i8x_, wild_u8x_)), "dot_product", {}, Pattern::CombineInit | Pattern::SwapOperands}, - {VectorReduce::SaturatingAdd, 4, i32(widening_mul(wild_u8x_, wild_i8x_)), "saturating_dot_product", {}, Pattern::CombineInit}, - {VectorReduce::SaturatingAdd, 4, i32(widening_mul(wild_i8x_, wild_u8x_)), "saturating_dot_product", {}, Pattern::CombineInit | Pattern::SwapOperands}, - - // 2-way dot products - {VectorReduce::Add, 2, i32(widening_mul(wild_i8x_, wild_i8x_)), "dot_product", Int(16)}, - {VectorReduce::Add, 2, i32(widening_mul(wild_i8x_, wild_u8x_)), "dot_product", Int(16)}, - {VectorReduce::Add, 2, i32(widening_mul(wild_u8x_, wild_i8x_)), "dot_product", Int(16)}, - {VectorReduce::Add, 2, i32(widening_mul(wild_u8x_, wild_u8x_)), "dot_product", Int(16)}, - {VectorReduce::SaturatingAdd, 2, i32(widening_mul(wild_u8x_, wild_i8x_)), "saturating_dot_product", {}, Pattern::CombineInit}, - {VectorReduce::SaturatingAdd, 2, i32(widening_mul(wild_i8x_, wild_u8x_)), "saturating_dot_product", {}, Pattern::CombineInit | Pattern::SwapOperands}, - {VectorReduce::SaturatingAdd, 2, widening_mul(wild_u8x_, wild_i8x_), "saturating_dot_product"}, - {VectorReduce::SaturatingAdd, 2, widening_mul(wild_i8x_, wild_u8x_), "saturating_dot_product", {}, Pattern::SwapOperands}, - - {VectorReduce::Add, 2, i32(widening_mul(wild_i16x_, wild_i16x_)), "dot_product", {}, Pattern::CombineInit}, - {VectorReduce::Add, 2, i32(widening_mul(wild_i16x_, wild_i16x_)), "dot_product", Int(16)}, - {VectorReduce::SaturatingAdd, 2, i32(widening_mul(wild_i16x_, wild_i16x_)), "saturating_dot_product", {}, Pattern::CombineInit}, - - {VectorReduce::Add, 2, wild_f32x_ * wild_f32x_, "dot_product", BFloat(16), Pattern::CombineInit}, - - // One could do a horizontal widening addition with - // other dot_products against a vector of ones. Currently disabled - // because I haven't found other cases where it's clearly better. 
- - {VectorReduce::Add, 2, u16(wild_u8x_), "horizontal_widening_add", {}, Pattern::SingleArg}, - {VectorReduce::Add, 2, i16(wild_u8x_), "horizontal_widening_add", {}, Pattern::SingleArg}, - {VectorReduce::Add, 2, i16(wild_i8x_), "horizontal_widening_add", {}, Pattern::SingleArg}, - }; - // clang-format on - - std::vector matches; - for (const Pattern &p : patterns) { - if (op->op != p.reduce_op || p.factor != factor) { - continue; - } - if (expr_match(p.pattern, op->value, matches)) { - if (p.flags & Pattern::SingleArg) { - Expr a = matches[0]; - - if (p.narrow_type.bits() > 0) { - a = lossless_cast(p.narrow_type.with_lanes(a.type().lanes()), a); - } - if (!a.defined()) { - continue; - } - - if (init.defined() && (p.flags & Pattern::CombineInit)) { - value = call_overloaded_intrin(op->type, p.intrin, {init, a}); - if (value) { - return; - } - } else { - value = call_overloaded_intrin(op->type, p.intrin, {a}); - if (value) { - if (init.defined()) { - Value *x = value; - Value *y = codegen(init); - value = builder->CreateAdd(x, y); - } - return; - } - } - } else { - Expr a = matches[0]; - Expr b = matches[1]; - if (p.flags & Pattern::SwapOperands) { - std::swap(a, b); - } - if (p.narrow_type.bits() > 0) { - a = lossless_cast(p.narrow_type.with_lanes(a.type().lanes()), a); - b = lossless_cast(p.narrow_type.with_lanes(b.type().lanes()), b); - } - if (!a.defined() || !b.defined()) { - continue; - } - - if (init.defined() && (p.flags & Pattern::CombineInit)) { - value = call_overloaded_intrin(op->type, p.intrin, {init, a, b}); - if (value) { - return; - } - } else { - value = call_overloaded_intrin(op->type, p.intrin, {a, b}); - if (value) { - if (init.defined()) { - Value *x = value; - Value *y = codegen(init); - value = builder->CreateAdd(x, y); - } - return; - } - } - } - } - } - - CodeGen_Posix::codegen_vector_reduce(op, init); -} - void CodeGen_X86::visit(const Allocate *op) { ScopedBinding bind(mem_type, op->name, op->memory_type); CodeGen_Posix::visit(op); @@ -743,6 +397,11 @@ void CodeGen_X86::visit(const Store *op) { CodeGen_Posix::visit(op); } +void CodeGen_X86::visit(const VectorIntrinsic *op) { + value = call_overloaded_intrin(op->type, op->name, op->args); + internal_assert(value) << "CodeGen_X86 failed on " << Expr(op) << "\n"; +} + string CodeGen_X86::mcpu_target() const { // Perform an ad-hoc guess for the -mcpu given features. // WARNING: this is used to drive -mcpu, *NOT* -mtune! 
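// (Worked example, illustrative only: on an SSE2-capable target the new
// lowering pass rewrites
//     h_add(widening_mul(a_i16x16, b_i16x16), 8)        // i32x8 result
// into
//     (int32x8)vector_intrinsic("dot_product", a, b)
// and the CodeGen_X86::visit(const VectorIntrinsic *) override above then
// resolves "dot_product" through call_overloaded_intrin, presumably
// against the pmaddwd-family entries registered in init_module().)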
diff --git a/src/IRMatch.h b/src/IRMatch.h index 089521c5bfed..cc4a4b490664 100644 --- a/src/IRMatch.h +++ b/src/IRMatch.h @@ -2198,6 +2198,39 @@ HALIDE_ALWAYS_INLINE auto cast(halide_type_t t, A &&a) noexcept -> CastOp +struct TypeHint { + struct pattern_tag {}; + Type type; + A a; + + constexpr static uint32_t binds = bindings::mask; + + constexpr static IRNodeType min_node_type = IRNodeType::Cast; + constexpr static IRNodeType max_node_type = IRNodeType::Cast; + constexpr static bool canonical = A::canonical; + + HALIDE_ALWAYS_INLINE + Expr make(MatcherState &state, halide_type_t type_hint) const { + return a.make(state, type); + } + + constexpr static bool foldable = false; +}; + +template +std::ostream &operator<<(std::ostream &s, const TypeHint &op) { + s << "typed(" << op.type << ", " << op.a << ")"; + return s; +} + +template +HALIDE_ALWAYS_INLINE auto typed(halide_type_t t, A &&a) noexcept -> TypeHint { + assert_is_lvalue_if_expr(); + return {t, pattern_arg(a)}; +} + template struct Fold { struct pattern_tag {}; diff --git a/src/Lower.cpp b/src/Lower.cpp index 38ad867686e6..20d98a20562a 100644 --- a/src/Lower.cpp +++ b/src/Lower.cpp @@ -75,6 +75,7 @@ #include "UnsafePromises.h" #include "VectorizeLoops.h" #include "WrapCalls.h" +#include "X86Optimize.h" namespace Halide { namespace Internal { @@ -443,6 +444,13 @@ void lower_impl(const vector &output_funcs, debug(1) << "Skipping GPU offload...\n"; } + if (t.arch == Target::X86) { + debug(1) << "Performing x86-specific vector instruction selection...\n"; + s = optimize_x86_instructions(s, t); + debug(2) << "Lowering after performing x86-specific vector instruction selection:\n" + << s << "\n\n"; + } + // TODO: This needs to happen before lowering parallel tasks, because global // images used inside parallel loops are rewritten from loads from images to // loads from closure parameters. Closure parameters are missing the Buffer<> diff --git a/src/X86Optimize.cpp b/src/X86Optimize.cpp new file mode 100644 index 000000000000..3a1d3df3b6f8 --- /dev/null +++ b/src/X86Optimize.cpp @@ -0,0 +1,588 @@ +#include "X86Optimize.h" + +#include "CSE.h" +// FIXME: move lower_int_uint_div out of CodeGen_Internal to remove this dependency. +#include "CodeGen_Internal.h" +#include "FindIntrinsics.h" +#include "IR.h" +#include "IRMatch.h" +#include "IRMutator.h" +#include "IROperator.h" +#include "Simplify.h" + +namespace Halide { +namespace Internal { + +// Populate feature flags in a target according to those implied by +// existing flags, so that instruction patterns can just check for the +// oldest feature flag that supports an instruction. +Target complete_x86_target(Target t) { + if (t.has_feature(Target::AVX512_SapphireRapids)) { + t.set_feature(Target::AVX512_Cannonlake); + } + if (t.has_feature(Target::AVX512_Cannonlake)) { + t.set_feature(Target::AVX512_Skylake); + } + if (t.has_feature(Target::AVX512_Cannonlake) || + t.has_feature(Target::AVX512_Skylake) || + t.has_feature(Target::AVX512_KNL)) { + t.set_feature(Target::AVX2); + } + if (t.has_feature(Target::AVX2)) { + t.set_feature(Target::AVX); + } + if (t.has_feature(Target::AVX)) { + t.set_feature(Target::SSE41); + } + return t; +} + +#if defined(WITH_X86) + +namespace { + +// i32(i16_a)*i32(i16_b) +/- i32(i16_c)*i32(i16_d) can be done by +// interleaving a, c, and b, d, and then using dot_product. 
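// For instance (illustrative): with int16 vectors a, b, c, d,
//     i32(a) * i32(b) + i32(c) * i32(d)
// can be computed as
//     dot_product(interleave(a, c), interleave(b, d))
// because pmaddwd multiplies adjacent lane pairs and sums each pair into
// a single i32 lane. The subtract case is handled by negating one factor
// first (see the Sub visitor below).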
+bool should_use_dot_product(const Expr &a, const Expr &b, std::vector &result) { + Type t = a.type(); + internal_assert(b.type() == t) << a << " and " << b << " don't match types\n"; + + if (!(t.is_int() && t.bits() == 32 && t.lanes() >= 4)) { + return false; + } + + const Call *ma = Call::as_intrinsic(a, {Call::widening_mul}); + const Call *mb = Call::as_intrinsic(b, {Call::widening_mul}); + // dot_product can't handle mixed type widening muls. + if (ma && ma->args[0].type() != ma->args[1].type()) { + return false; + } + if (mb && mb->args[0].type() != mb->args[1].type()) { + return false; + } + // If the operands are widening shifts, we might be able to treat these as + // multiplies. + const Call *sa = Call::as_intrinsic(a, {Call::widening_shift_left}); + const Call *sb = Call::as_intrinsic(b, {Call::widening_shift_left}); + if (sa && !is_const(sa->args[1])) { + sa = nullptr; + } + if (sb && !is_const(sb->args[1])) { + sb = nullptr; + } + if ((ma || sa) && (mb || sb)) { + Expr a0 = ma ? ma->args[0] : sa->args[0]; + Expr a1 = ma ? ma->args[1] : lossless_cast(sa->args[0].type(), simplify(make_const(sa->type, 1) << sa->args[1])); + Expr b0 = mb ? mb->args[0] : sb->args[0]; + Expr b1 = mb ? mb->args[1] : lossless_cast(sb->args[0].type(), simplify(make_const(sb->type, 1) << sb->args[1])); + if (a1.defined() && b1.defined()) { + std::vector args = {a0, a1, b0, b1}; + result.swap(args); + return true; + } + } + return false; +} + +// // Templated saturating casts for use in rewrite rules. +// template +// auto saturating_cast() + +/** A code generator that replaces Halide IR with VectorIntrinsics specific to x86. */ +class Optimize_X86 : public IRMutator { +public: + /** Create an x86 code generator. Processor features can be + * enabled using the appropriate flags in the target struct. */ + Optimize_X86(const Target &t) : target(t) { + } + +protected: + + bool should_peephole_optimize(const Type &type) { + // We only have peephole optimizations for vectors here. + // FIXME: should we only optimize vectors that are multiples of the native vector width? + // when we do, we fail simd_op_check tests on weird vector sizes. + return type.is_vector(); + } + + Expr visit(const Div *op) override { + if (!should_peephole_optimize(op->type) || !op->type.is_int_or_uint()) { + return IRMutator::visit(op); + } + // Lower division here in order to do pattern-matching on intrinsics. + return mutate(lower_int_uint_div(op->a, op->b)); + } + + /** Nodes for which we want to emit specific sse/avx intrinsics */ + Expr visit(const Add *op) override { + if (!should_peephole_optimize(op->type)) { + return IRMutator::visit(op); + } + + std::vector matches; + // TODO(rootjalex): is it possible to rewrite should_use_dot_product + // as a series of rewrite-rules? lossless_cast is the hardest part. + const int lanes = op->type.lanes(); + + // FIXME: should we check for accumulating dot_products first? + // can there even be overlap between these? + auto rewrite = IRMatcher::rewriter(IRMatcher::add(op->a, op->b), op->type); + if ( + // Only AVX512_SapphireRapids has accumulating dot products. + target.has_feature(Target::AVX512_SapphireRapids) && + // FIXME: add the float16 -> float32 versions as well. 
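// (Context, hedged: the accumulating "dot_product" forms below appear
// intended to map to the AVX512-VNNI-style instructions, e.g. vpdpbusd
// for u8*i8 products accumulated into i32 lanes -- hence the
// SapphireRapids gate. A matching input looks roughly like
//     acc_i32x16 + h_add(i32x64(widening_mul(a_u8x64, b_i8x64)), 16)
// with placeholder variable names.)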
+ (op->type.element_of() == Int(32)) && + + // Accumulating pmaddubsw + (rewrite( + x + h_add(cast(Int(32, lanes * 4), widening_mul(y, z)), lanes), + v_intrin("dot_product", x, y, z), + is_uint(y, 8) && is_int(z, 8)) || + + rewrite( + x + h_add(cast(Int(32, lanes * 4), widening_mul(y, z)), lanes), + v_intrin("dot_product", x, z, y), + is_int(y, 8) && is_uint(z, 8)) || + + rewrite( + h_add(cast(Int(32, lanes * 4), widening_mul(x, y)), lanes) + z, + v_intrin("dot_product", z, x, y), + is_uint(x, 8) && is_int(y, 8)) || + + rewrite( + h_add(cast(Int(32, lanes * 4), widening_mul(x, y)), lanes) + z, + v_intrin("dot_product", z, y, x), + is_int(x, 8) && is_uint(y, 8)) || + + // Accumulating pmaddwd. + rewrite( + x + h_add(widening_mul(y, z), lanes), + v_intrin("dot_product", x, y, z), + is_int(y, 16, lanes * 2) && is_int(z, 16, lanes * 2)) || + + rewrite( + h_add(widening_mul(x, y), lanes) + z, + v_intrin("dot_product", z, x, y), + is_int(x, 16, lanes * 2) && is_int(y, 16, lanes * 2)) || + + false)) { + return mutate(rewrite.result); + } + + if ((op->type.lanes() % 4 == 0) && should_use_dot_product(op->a, op->b, matches)) { + Expr ac = Shuffle::make_interleave({matches[0], matches[2]}); + Expr bd = Shuffle::make_interleave({matches[1], matches[3]}); + // We have dot_products for every x86 arch (because SSE2 has it), + // so this is `always` safe (as long as the output type lanes has + // a factor of 4). + return mutate(VectorIntrinsic::make(op->type, "dot_product", {ac, bd})); + } + + return IRMutator::visit(op); + } + + Expr visit(const Sub *op) override { + if (!should_peephole_optimize(op->type)) { + return IRMutator::visit(op); + } + + std::vector matches; + // TODO(rootjalex): same issue as the Add case, lossless_cast and + // lossless_negate are hard to use in rewrite rules. + + if ((op->type.lanes() % 4 == 0) && should_use_dot_product(op->a, op->b, matches)) { + // Negate one of the factors in the second expression + Expr negative_2 = lossless_negate(matches[2]); + Expr negative_3 = lossless_negate(matches[3]); + if (negative_2.defined() || negative_3.defined()) { + if (negative_2.defined()) { + matches[2] = negative_2; + } else { + matches[3] = negative_3; + } + Expr ac = Shuffle::make_interleave({matches[0], matches[2]}); + Expr bd = Shuffle::make_interleave({matches[1], matches[3]}); + // Always safe, see comment in Add case above. + return mutate(VectorIntrinsic::make(op->type, "dot_product", {ac, bd})); + } + } + + return IRMutator::visit(op); + } + + Expr visit(const Cast *op) override { + if (!should_peephole_optimize(op->type)) { + return IRMutator::visit(op); + } + + const int lanes = op->type.lanes(); + + auto rewrite = IRMatcher::rewriter(IRMatcher::cast(op->type, op->value), op->type); + + // TODO: saturating casts should be intrinsics, and supported in IRMatch.h. + const Expr i32_i16max = cast(Int(32, lanes), Int(16).max()); + const Expr i32_i16min = cast(Int(32, lanes), Int(16).min()); + const Expr i16_i8max = cast(Int(16, lanes), Int(8).max()); + const Expr i16_i8min = cast(Int(16, lanes), Int(8).min()); + const Expr i16_u8max = cast(Int(16, lanes), UInt(8).max()); + const Expr i16_u8min = cast(Int(16, lanes), UInt(8).min()); + const Expr i32_u16max = cast(Int(32, lanes), UInt(16).max()); + const Expr i32_u16min = cast(Int(32, lanes), UInt(16).min()); + + if ( + // pmulhrs is supported via AVX2 and SSE41, so SSE41 is the LCD. 
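// (In scalar terms the pmulhrs pattern below is, roughly:
//     int16_t pmulhrs(int16_t a, int16_t b) {
//         return (int16_t)((((int32_t)a * b) + (1 << 14)) >> 15);
//     }
// i.e. a Q15 multiply with round-to-nearest; as noted in the old
// CodeGen_X86 pattern list, it does not saturate the result.)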
(target.has_feature(Target::SSE41) && + rewrite( + cast(Int(16, lanes), rounding_shift_right(widening_mul(x, y), 15)), + v_intrin("pmulhrs", x, y), + is_int(x, 16) && is_int(y, 16))) || + + // saturating_narrow is always supported (via SSE2) for: + // int32 -> int16, int16 -> int8, int16 -> uint8 + rewrite( + cast(Int(16, lanes), max(min(x, i32_i16max), i32_i16min)), + v_intrin("saturating_narrow", x), + is_int(x, 32)) || + + rewrite( + cast(Int(8, lanes), max(min(x, i16_i8max), i16_i8min)), + v_intrin("saturating_narrow", x), + is_int(x, 16)) || + + rewrite( + cast(UInt(8, lanes), max(min(x, i16_u8max), i16_u8min)), + v_intrin("saturating_narrow", x), + is_int(x, 16)) || + + // int32 -> uint16 is supported via SSE41 + (target.has_feature(Target::SSE41) && + rewrite( + cast(UInt(16, lanes), max(min(x, i32_u16max), i32_u16min)), + v_intrin("saturating_narrow", x), + is_int(x, 32))) || + + // f32_to_bf16 is supported only via Target::AVX512_SapphireRapids + (target.has_feature(Target::AVX512_SapphireRapids) && + rewrite( + cast(BFloat(16, lanes), x), + v_intrin("f32_to_bf16", x), + is_float(x, 32))) || + + false) { + return mutate(rewrite.result); + } + + // TODO: should we handle CodeGen_X86's weird 8 -> 16 bit issue here? + + return IRMutator::visit(op); + } + + Expr visit(const Call *op) override { + if (!should_peephole_optimize(op->type)) { + return IRMutator::visit(op); + } + + // TODO: This optimization is hard to do via a rewrite-rule because of lossless_cast. + + // A 16-bit mul-shift-right of less than 16 can sometimes be rounded up to a + // full 16 to use pmulh(u)w by left-shifting one of the operands. This is + // handled here instead of in the lowering of mul_shift_right because it's + // unlikely to be a good idea on platforms other than x86, as it adds an + // extra shift in the fully-lowered case. + if ((op->type.element_of() == UInt(16) || + op->type.element_of() == Int(16)) && + op->is_intrinsic(Call::mul_shift_right)) { + internal_assert(op->args.size() == 3); + const uint64_t *shift = as_const_uint(op->args[2]); + if (shift && *shift < 16 && *shift >= 8) { + Type narrow = op->type.with_bits(8); + Expr narrow_a = lossless_cast(narrow, op->args[0]); + Expr narrow_b = narrow_a.defined() ? Expr() : lossless_cast(narrow, op->args[1]); + int shift_left = 16 - (int)(*shift); + if (narrow_a.defined()) { + return mutate(mul_shift_right(op->args[0] << shift_left, op->args[1], 16)); + } else if (narrow_b.defined()) { + return mutate(mul_shift_right(op->args[0], op->args[1] << shift_left, 16)); + } + } + } + + const int lanes = op->type.lanes(); + const int bits = op->type.bits(); + + auto rewrite = IRMatcher::rewriter(op, op->type); + + Type unsigned_type = op->type.with_code(halide_type_uint); + auto x_uint = cast(unsigned_type, x); + auto y_uint = cast(unsigned_type, y); + + if ( + // We can redirect signed rounding halving add to unsigned rounding + // halving add by adding 128 / 32768 to the result if the sign of the + // args differs. + ((op->type.is_int() && bits <= 16) && + rewrite( + rounding_halving_add(x, y), + cast(op->type, rounding_halving_add(x_uint, y_uint) + ((x_uint ^ y_uint) & (1 << (bits - 1)))))) || + + // On x86, there are many 3-instruction sequences to compute absd of + // unsigned integers. This one consists solely of instructions with + // throughput of 3 ops per cycle on Cannon Lake. 
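// Why this works (sketch): for unsigned x and y, saturating_sub(x, y) is
// max(x - y, 0), so one of the two subtractions is zero and the other is
// |x - y|; OR-ing them therefore yields absd. E.g. x = 3, y = 10:
//     saturating_sub(3, 10) = 0, saturating_sub(10, 3) = 7, 0 | 7 = 7.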
+ // + // Solution due to Wojciech Mula: + // http://0x80.pl/notesen/2018-03-11-sse-abs-unsigned.html + (op->type.is_uint() && + rewrite( + absd(x, y), + saturating_sub(x, y) | saturating_sub(y, x))) || + + // Current best way to lower absd on x86. + (op->type.is_int() && + rewrite( + absd(x, y), + max(x, y) - min(x, y))) || + + // pmulh is always supported (via SSE2). + ((op->type.is_int_or_uint() && bits == 16) && + rewrite( + mul_shift_right(x, y, 16), + v_intrin("pmulh", x, y))) || + + // saturating_pmulhrs is supported via SSE41 + ((target.has_feature(Target::SSE41) && + op->type.is_int() && bits == 16) && + rewrite( + rounding_mul_shift_right(x, y, 15), + v_intrin("saturating_pmulhrs", x, y))) || + + // TODO(rootjalex): The following intrinsics are + // simply one-to-one mappings, should they even + // be handled here? + + // int(8 | 16 | 32) -> uint is supported via SSE41 + // float is always supported (via SSE2). + (((target.has_feature(Target::SSE41) && bits <= 32) || + op->type.is_float()) && + rewrite( + abs(x), + v_intrin("abs", x))) || + + // saturating ops for 8 and 16 bits are always supported (via SSE2). + ((bits == 8 || bits == 16) && + (rewrite( + saturating_add(x, y), + v_intrin("saturating_add", x, y)) || + rewrite( + saturating_sub(x, y), + v_intrin("saturating_sub", x, y)))) || + + // pavg ops for 8 and 16 bits are always supported (via SSE2). + ((op->type.is_uint() && (bits == 8 || bits == 16)) && + rewrite( + rounding_halving_add(x, y), + v_intrin("rounding_halving_add", x, y))) || + + // int16 -> int32 widening_mul has a (v)pmaddwd implementation. + // always supported (via SSE2). + ((op->type.is_int() && (bits == 32)) && + rewrite( + widening_mul(x, y), + v_intrin("widening_mul", x, y), + is_int(x, 16) && is_int(y, 16))) || + + (target.has_feature(Target::AVX512_SapphireRapids) && + (op->type.is_int() && (bits == 32)) && + // SapphireRapids accumulating dot products. + (rewrite( + saturating_add(x, h_satadd(cast(Int(32, lanes * 4), widening_mul(y, z)) , lanes)), + v_intrin("saturating_dot_product", x, y, z), + is_uint(y, 8) && is_int(z, 8)) || + + rewrite( + saturating_add(x, h_satadd(cast(Int(32, lanes * 4), widening_mul(y, z)) , lanes)), + v_intrin("saturating_dot_product", x, z, y), + is_int(y, 8) && is_uint(z, 8)) || + + rewrite( + saturating_add(x, h_satadd(cast(Int(32, lanes * 2), widening_mul(y, z)) , lanes)), + v_intrin("saturating_dot_product", x, y, z), + is_uint(y, 8) && is_int(z, 8)) || + + rewrite( + saturating_add(x, h_satadd(cast(Int(32, lanes * 2), widening_mul(y, z)) , lanes)), + v_intrin("saturating_dot_product", x, z, y), + is_int(y, 8) && is_uint(z, 8)) || + + rewrite( + saturating_add(x, h_satadd(widening_mul(y, z) , lanes)), + v_intrin("saturating_dot_product", x, z, y), + is_int(y, 16, lanes * 2) && is_int(z, 16, lanes * 2)) || + + false)) || + + false) { + return mutate(rewrite.result); + } + + // Fixed-point intrinsics should be lowered here. + // This is safe because this mutator is top-down. + if (op->is_intrinsic({ + Call::halving_add, + Call::halving_sub, + Call::mul_shift_right, + Call::rounding_halving_add, + Call::rounding_mul_shift_right, + Call::rounding_shift_left, + Call::rounding_shift_right, + Call::saturating_add, + Call::saturating_sub, + Call::sorted_avg, + Call::widening_add, + Call::widening_mul, + Call::widening_shift_left, + Call::widening_shift_right, + Call::widening_sub, + })) { + // TODO: Should we have a base-class that does this + the VectorReduce lowering needed below? 
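// (Sketch of the effect: a fixed-point intrinsic that no rule above
// claimed is expanded into plain IR here, so later mutations and codegen
// see ordinary arithmetic. For example, on a path with no pavgw mapping,
//     rounding_halving_add(x, y)
// lowers to something equivalent to (x >> 1) + (y >> 1) + ((x | y) & 1).)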
+ return mutate(lower_intrinsic(op)); + } + + return IRMutator::visit(op); + } + + Expr visit(const VectorReduce *op) override { + // FIXME: We need to split up VectorReduce nodes in the same way that + // CodeGen_LLVM::codegen_vector_reduce does, in order to do all + // matching here. + if ((op->op != VectorReduce::Add && op->op != VectorReduce::SaturatingAdd) || + !should_peephole_optimize(op->type)) { + return IRMutator::visit(op); + } + + const int lanes = op->type.lanes(); + const int value_lanes = op->value.type().lanes(); + const int factor = value_lanes / lanes; + Expr value = op->value; + + switch (op->op) { + case VectorReduce::Add: { + auto rewrite = IRMatcher::rewriter(IRMatcher::h_add(value, lanes), op->type); + auto x_is_int_or_uint = is_int(x) || is_uint(x); + auto y_is_int_or_uint = is_int(y) || is_uint(y); + if ( + // 2-way dot-products, int16 -> int32 is always supported (via SSE2). + ((factor == 2) && + (rewrite( + h_add(cast(Int(32, value_lanes), widening_mul(x, y)), lanes), + v_intrin("dot_product", cast(Int(16, value_lanes), x), cast(Int(16, value_lanes), y)), + x_is_int_or_uint && y_is_int_or_uint) || + + // Horizontal widening add via pmaddwd + rewrite( + h_add(cast(Int(32, value_lanes), x), lanes), + v_intrin("dot_product", x, make_const(Int(16, value_lanes), 1)), + is_int(x, 16)) || + + (rewrite( + h_add(widening_mul(x, y), lanes), + v_intrin("dot_product", x, y), + is_int(x, 16) && is_int(y, 16))) || + + // pmaddub supported via SSE41 + (target.has_feature(Target::SSE41) && + // Horizontal widening adds using 2-way saturating dot products. + (rewrite( + h_add(cast(UInt(16, value_lanes), x), lanes), + cast(UInt(16, lanes), typed(Int(16, lanes), v_intrin("saturating_dot_product", x, make_const(Int(8, value_lanes), 1)))), + is_uint(x, 8)) || + + rewrite( + h_add(cast(Int(16, value_lanes), x), lanes), + v_intrin("saturating_dot_product", x, make_const(Int(8, value_lanes), 1)), + is_uint(x, 8)) || + + rewrite( + h_add(cast(Int(16, value_lanes), x), lanes), + v_intrin("saturating_dot_product", make_const(UInt(8, value_lanes), 1), x), + is_int(x, 8)) || + + // SSE41 and AVX2 support horizontal_add via phadd intrinsics. + rewrite( + h_add(x, lanes), + v_intrin("horizontal_add", x), + is_int(x, 16, lanes * 2) || is_uint(x, 16, lanes * 2) || + is_int(x, 32, lanes * 2) || is_uint(x, 32, lanes * 2)) || + + // TODO: add in Andrew's psadbw pattern. + + false)) || + + false))) { + return mutate(rewrite.result); + } + break; + } + case VectorReduce::SaturatingAdd: { + auto rewrite = IRMatcher::rewriter(IRMatcher::h_satadd(value, lanes), op->type); + if ( + // Saturating dot products are supported via SSE41 and AVX2. + ((factor == 2) && target.has_feature(Target::SSE41) && + (rewrite( + h_satadd(widening_mul(x, y), lanes), + v_intrin("saturating_dot_product", x, y), + is_uint(x, 8) && is_int(y, 8)) || + + rewrite( + h_satadd(widening_mul(x, y), lanes), + v_intrin("saturating_dot_product", y, x), + is_int(x, 8) && is_uint(y, 8)) || + + false))) { + return mutate(rewrite.result); + } + break; + } + default: + break; + } + + // FIXME: We need to split up VectorReduce nodes in the same way that + // CodeGen_LLVM::codegen_vector_reduce does, in order to do all + // matching here. 
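// (Illustration of the gap: CodeGen_LLVM::codegen_vector_reduce splits,
// say, a factor-8 reduction h_add(v_i32x64, 8) into a tree of 2-way
// stages, each of which could match the factor == 2 patterns above;
// until the same splitting happens here, only reductions that already
// arrive with factor 2 (or 4 for the SapphireRapids rules) are caught.)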
+ + return IRMutator::visit(op); + } + +private: + const Target ⌖ + + IRMatcher::Wild<0> x; + IRMatcher::Wild<1> y; + IRMatcher::Wild<2> z; +}; + +} + + + +Stmt optimize_x86_instructions(Stmt s, const Target &t) { + s = Optimize_X86(complete_x86_target(t)).mutate(s); + // Some of the rules above can introduce repeated sub-terms, so run CSE again. + s = common_subexpression_elimination(s); + return s; +} + +#else // WITH_X86 + +Stmt optimize_x86_instructions(Stmt s, const Target &t) { + user_error << "x86 not enabled for this build of Halide.\n"; + return Stmt(); +} + + +#endif // WITH_X86 + +} // namespace Internal +} // namespace Halide + + diff --git a/src/X86Optimize.h b/src/X86Optimize.h new file mode 100644 index 000000000000..bcf375cae1d5 --- /dev/null +++ b/src/X86Optimize.h @@ -0,0 +1,22 @@ +#ifndef HALIDE_IR_X86_OPTIMIZE_H +#define HALIDE_IR_X86_OPTIMIZE_H + +/** \file + * Tools for optimizing IR for x86. + */ + +#include "Expr.h" +#include "Target.h" + +namespace Halide { +namespace Internal { + +/** Perform vector instruction selection, inserting VectorIntrinsic nodes. */ +Stmt optimize_x86_instructions(Stmt s, const Target &t); + +Target complete_x86_target(Target t); + +} // namespace Internal +} // namespace Halide + +#endif From 09193f493395bdca9263f59e4ee9f5093dfbd238 Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Mon, 25 Jul 2022 17:43:09 -0400 Subject: [PATCH 04/55] fix typo --- src/StmtToHtml.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/StmtToHtml.cpp b/src/StmtToHtml.cpp index 1f5b1d20ccff..ceddcabec83d 100644 --- a/src/StmtToHtml.cpp +++ b/src/StmtToHtml.cpp @@ -713,7 +713,7 @@ class StmtToHtml : public IRVisitor { } void visit(const VectorIntrinsic *op) override { - stream << open_span("VectoIntrinsic"); + stream << open_span("VectorIntrinsic"); stream << open_span("Type") << op->type << close_span(); print_list(symbol("vector_intrinsic") + "(\"" + op->name + "\"", op->args, ")"); stream << close_span(); From 24f74a9a2732b80c44c52e323c69a795aabcaf87 Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Mon, 25 Jul 2022 17:49:22 -0400 Subject: [PATCH 05/55] clang-format --- src/CodeGen_LLVM.cpp | 3 +- src/IRMatch.h | 6 +- src/X86Optimize.cpp | 284 +++++++++++++++++++++---------------------- 3 files changed, 143 insertions(+), 150 deletions(-) diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp index 4990ba6e78c1..14db16dba353 100644 --- a/src/CodeGen_LLVM.cpp +++ b/src/CodeGen_LLVM.cpp @@ -4001,7 +4001,8 @@ void CodeGen_LLVM::visit(const Shuffle *op) { } void CodeGen_LLVM::visit(const VectorIntrinsic *op) { - internal_error << "CodeGen_LLVM received VectorIntrinsic node, should be handled by architecture-specific CodeGen class:\n" << Expr(op) << "\n"; + internal_error << "CodeGen_LLVM received VectorIntrinsic node, should be handled by architecture-specific CodeGen class:\n" + << Expr(op) << "\n"; } void CodeGen_LLVM::visit(const VectorReduce *op) { diff --git a/src/IRMatch.h b/src/IRMatch.h index cc4a4b490664..457744f87a21 100644 --- a/src/IRMatch.h +++ b/src/IRMatch.h @@ -1586,7 +1586,7 @@ auto bitwise_xor(A &&a, B &&b) noexcept -> Intrin -HALIDE_ALWAYS_INLINE auto operator^(A &&a, B &&b) noexcept -> auto { +HALIDE_ALWAYS_INLINE auto operator^(A &&a, B &&b) noexcept -> auto{ assert_is_lvalue_if_expr(); assert_is_lvalue_if_expr(); return bitwise_xor(a, b); @@ -1596,7 +1596,7 @@ auto bitwise_and(A &&a, B &&b) noexcept -> Intrin -HALIDE_ALWAYS_INLINE auto operator&(A &&a, B &&b) noexcept -> auto { +HALIDE_ALWAYS_INLINE auto 
operator&(A &&a, B &&b) noexcept -> auto{ assert_is_lvalue_if_expr(); assert_is_lvalue_if_expr(); return bitwise_and(a, b); @@ -1606,7 +1606,7 @@ auto bitwise_or(A &&a, B &&b) noexcept -> Intrin -HALIDE_ALWAYS_INLINE auto operator|(A &&a, B &&b) noexcept -> auto { +HALIDE_ALWAYS_INLINE auto operator|(A &&a, B &&b) noexcept -> auto{ assert_is_lvalue_if_expr(); assert_is_lvalue_if_expr(); return bitwise_or(a, b); diff --git a/src/X86Optimize.cpp b/src/X86Optimize.cpp index 3a1d3df3b6f8..16bad9be4413 100644 --- a/src/X86Optimize.cpp +++ b/src/X86Optimize.cpp @@ -84,20 +84,16 @@ bool should_use_dot_product(const Expr &a, const Expr &b, std::vector &res return false; } -// // Templated saturating casts for use in rewrite rules. -// template -// auto saturating_cast() - -/** A code generator that replaces Halide IR with VectorIntrinsics specific to x86. */ +/** A top-down code optimizer that replaces Halide IR with VectorIntrinsics specific to x86. */ class Optimize_X86 : public IRMutator { public: - /** Create an x86 code generator. Processor features can be + /** Create an x86 code optimizer. Processor features can be * enabled using the appropriate flags in the target struct. */ - Optimize_X86(const Target &t) : target(t) { + Optimize_X86(const Target &t) + : target(t) { } protected: - bool should_peephole_optimize(const Type &type) { // We only have peephole optimizations for vectors here. // FIXME: should we only optimize vectors that are multiples of the native vector width? @@ -135,35 +131,35 @@ class Optimize_X86 : public IRMutator { // Accumulating pmaddubsw (rewrite( - x + h_add(cast(Int(32, lanes * 4), widening_mul(y, z)), lanes), - v_intrin("dot_product", x, y, z), - is_uint(y, 8) && is_int(z, 8)) || + x + h_add(cast(Int(32, lanes * 4), widening_mul(y, z)), lanes), + v_intrin("dot_product", x, y, z), + is_uint(y, 8) && is_int(z, 8)) || rewrite( - x + h_add(cast(Int(32, lanes * 4), widening_mul(y, z)), lanes), - v_intrin("dot_product", x, z, y), - is_int(y, 8) && is_uint(z, 8)) || + x + h_add(cast(Int(32, lanes * 4), widening_mul(y, z)), lanes), + v_intrin("dot_product", x, z, y), + is_int(y, 8) && is_uint(z, 8)) || rewrite( - h_add(cast(Int(32, lanes * 4), widening_mul(x, y)), lanes) + z, - v_intrin("dot_product", z, x, y), - is_uint(x, 8) && is_int(y, 8)) || + h_add(cast(Int(32, lanes * 4), widening_mul(x, y)), lanes) + z, + v_intrin("dot_product", z, x, y), + is_uint(x, 8) && is_int(y, 8)) || rewrite( - h_add(cast(Int(32, lanes * 4), widening_mul(x, y)), lanes) + z, - v_intrin("dot_product", z, y, x), - is_int(x, 8) && is_uint(y, 8)) || + h_add(cast(Int(32, lanes * 4), widening_mul(x, y)), lanes) + z, + v_intrin("dot_product", z, y, x), + is_int(x, 8) && is_uint(y, 8)) || // Accumulating pmaddwd. rewrite( - x + h_add(widening_mul(y, z), lanes), - v_intrin("dot_product", x, y, z), - is_int(y, 16, lanes * 2) && is_int(z, 16, lanes * 2)) || - + x + h_add(widening_mul(y, z), lanes), + v_intrin("dot_product", x, y, z), + is_int(y, 16, lanes * 2) && is_int(z, 16, lanes * 2)) || + rewrite( - h_add(widening_mul(x, y), lanes) + z, - v_intrin("dot_product", z, x, y), - is_int(x, 16, lanes * 2) && is_int(y, 16, lanes * 2)) || + h_add(widening_mul(x, y), lanes) + z, + v_intrin("dot_product", z, x, y), + is_int(x, 16, lanes * 2) && is_int(y, 16, lanes * 2)) || false)) { return mutate(rewrite.result); @@ -233,9 +229,9 @@ class Optimize_X86 : public IRMutator { // pmulhrs is supported via AVX2 and SSE41, so SSE41 is the LCD. 
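// pmulhrs ("packed multiply high with round and scale") is exactly the
// cast(Int(16, lanes), rounding_shift_right(widening_mul(x, y), 15))
// pattern matched below: take the 32-bit product, shift right by 14, add
// one, shift right once more. A scalar sketch of the per-lane semantics
// (illustrative only):
#include <cstdint>

int16_t pmulhrs_model(int16_t a, int16_t b) {
    int32_t prod = int32_t(a) * int32_t(b);
    int32_t rounded = ((prod >> 14) + 1) >> 1;  // == (prod + (1 << 14)) >> 15
    // Only a == b == -32768 produces 32768, which does not fit in int16
    // and wraps to -32768, matching the hardware behavior.
    return int16_t(rounded);
}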
(target.has_feature(Target::SSE41) && rewrite( - cast(Int(16, lanes), rounding_shift_right(widening_mul(x, y), 15)), - v_intrin("pmulhrs", x, y), - is_int(x, 16) && is_int(y, 16))) || + cast(Int(16, lanes), rounding_shift_right(widening_mul(x, y), 15)), + v_intrin("pmulhrs", x, y), + is_int(x, 16) && is_int(y, 16))) || // saturating_narrow is always supported (via SSE2) for: // int32 -> int16, int16 -> int8, int16 -> uint8 @@ -257,16 +253,16 @@ class Optimize_X86 : public IRMutator { // int32 -> uint16 is supported via SSE41 (target.has_feature(Target::SSE41) && rewrite( - cast(UInt(16, lanes), max(min(x, i32_u16min), i32_u16min)), - v_intrin("saturating_narrow", x), - is_int(x, 32))) || + cast(UInt(16, lanes), max(min(x, i32_u16min), i32_u16min)), + v_intrin("saturating_narrow", x), + is_int(x, 32))) || // f32_to_bf16 is supported only via Target::AVX512_SapphireRapids (target.has_feature(Target::AVX512_SapphireRapids) && rewrite( - cast(BFloat(16, lanes), x), - v_intrin("f32_to_bf16", x), - is_float(x, 32))) || + cast(BFloat(16, lanes), x), + v_intrin("f32_to_bf16", x), + is_float(x, 32))) || false) { return mutate(rewrite.result); @@ -290,7 +286,7 @@ class Optimize_X86 : public IRMutator { // unlikely to be a good idea on platforms other than x86, as it adds an // extra shift in the fully-lowered case. if ((op->type.element_of() == UInt(16) || - op->type.element_of() == Int(16)) && + op->type.element_of() == Int(16)) && op->is_intrinsic(Call::mul_shift_right)) { internal_assert(op->args.size() == 3); const uint64_t *shift = as_const_uint(op->args[2]); @@ -322,8 +318,9 @@ class Optimize_X86 : public IRMutator { // args differs. ((op->type.is_int() && bits <= 16) && rewrite( - rounding_halving_add(x, y), - cast(op->type, rounding_halving_add(x_uint, y_uint) + ((x_uint ^ y_uint) & (1 << (bits - 1)))))) || + rounding_halving_add(x, y), + cast(op->type, rounding_halving_add(x_uint, y_uint) + + ((x_uint ^ y_uint) & (1 << (bits - 1)))))) || // On x86, there are many 3-instruction sequences to compute absd of // unsigned integers. This one consists solely of instructions with @@ -333,27 +330,27 @@ class Optimize_X86 : public IRMutator { // http://0x80.pl/notesen/2018-03-11-sse-abs-unsigned.html (op->type.is_uint() && rewrite( - absd(x, y), - saturating_sub(x, y) | saturating_sub(y, x))) || + absd(x, y), + saturating_sub(x, y) | saturating_sub(y, x))) || // Current best way to lower absd on x86. (op->type.is_int() && rewrite( - absd(x, y), - max(x, y) - min(x, y))) || + absd(x, y), + max(x, y) - min(x, y))) || // pmulh is always supported (via SSE2). ((op->type.is_int_or_uint() && bits == 16) && rewrite( - mul_shift_right(x, y, 16), - v_intrin("pmulh", x, y))) || + mul_shift_right(x, y, 16), + v_intrin("pmulh", x, y))) || // saturating_pmulhrs is supported via SSE41 ((target.has_feature(Target::SSE41) && op->type.is_int() && bits == 16) && rewrite( - rounding_mul_shift_right(x, y, 15), - v_intrin("saturating_pmulhrs", x, y))) || + rounding_mul_shift_right(x, y, 15), + v_intrin("saturating_pmulhrs", x, y))) || // TODO(rootjalex): The following intrinsics are // simply one-to-one mappings, should they even @@ -364,61 +361,61 @@ class Optimize_X86 : public IRMutator { (((target.has_feature(Target::SSE41) && bits <= 32) || op->type.is_float()) && rewrite( - abs(x), - v_intrin("abs", x))) || + abs(x), + v_intrin("abs", x))) || // saturating ops for 8 and 16 bits are always supported (via SSE2). 
((bits == 8 || bits == 16) && (rewrite( - saturating_add(x, y), - v_intrin("saturating_add", x, y)) || - rewrite( - saturating_sub(x, y), - v_intrin("saturating_sub", x, y)))) || + saturating_add(x, y), + v_intrin("saturating_add", x, y)) || + rewrite( + saturating_sub(x, y), + v_intrin("saturating_sub", x, y)))) || // pavg ops for 8 and 16 bits are always supported (via SSE2). - ((op->type.is_uint() && (bits == 8 || bits == 16)) && + ((op->type.is_uint() && (bits == 8 || bits == 16)) && rewrite( - rounding_halving_add(x, y), - v_intrin("rounding_halving_add", x, y))) || + rounding_halving_add(x, y), + v_intrin("rounding_halving_add", x, y))) || // int16 -> int32 widening_mul has a (v)pmaddwd implementation. // always supported (via SSE2). ((op->type.is_int() && (bits == 32)) && rewrite( - widening_mul(x, y), - v_intrin("widening_mul", x, y), - is_int(x, 16) && is_int(y, 16))) || + widening_mul(x, y), + v_intrin("widening_mul", x, y), + is_int(x, 16) && is_int(y, 16))) || (target.has_feature(Target::AVX512_SapphireRapids) && (op->type.is_int() && (bits == 32)) && // SapphireRapids accumulating dot products. (rewrite( - saturating_add(x, h_satadd(cast(Int(32, lanes * 4), widening_mul(y, z)) , lanes)), - v_intrin("saturating_dot_product", x, y, z), - is_uint(y, 8) && is_int(z, 8)) || + saturating_add(x, h_satadd(cast(Int(32, lanes * 4), widening_mul(y, z)), lanes)), + v_intrin("saturating_dot_product", x, y, z), + is_uint(y, 8) && is_int(z, 8)) || rewrite( - saturating_add(x, h_satadd(cast(Int(32, lanes * 4), widening_mul(y, z)) , lanes)), - v_intrin("saturating_dot_product", x, z, y), - is_int(y, 8) && is_uint(z, 8)) || - + saturating_add(x, h_satadd(cast(Int(32, lanes * 4), widening_mul(y, z)), lanes)), + v_intrin("saturating_dot_product", x, z, y), + is_int(y, 8) && is_uint(z, 8)) || + rewrite( - saturating_add(x, h_satadd(cast(Int(32, lanes * 2), widening_mul(y, z)) , lanes)), - v_intrin("saturating_dot_product", x, y, z), - is_uint(y, 8) && is_int(z, 8)) || + saturating_add(x, h_satadd(cast(Int(32, lanes * 2), widening_mul(y, z)), lanes)), + v_intrin("saturating_dot_product", x, y, z), + is_uint(y, 8) && is_int(z, 8)) || rewrite( - saturating_add(x, h_satadd(cast(Int(32, lanes * 2), widening_mul(y, z)) , lanes)), - v_intrin("saturating_dot_product", x, z, y), - is_int(y, 8) && is_uint(z, 8)) || - + saturating_add(x, h_satadd(cast(Int(32, lanes * 2), widening_mul(y, z)), lanes)), + v_intrin("saturating_dot_product", x, z, y), + is_int(y, 8) && is_uint(z, 8)) || + rewrite( - saturating_add(x, h_satadd(widening_mul(y, z) , lanes)), - v_intrin("saturating_dot_product", x, z, y), - is_int(y, 16, lanes * 2) && is_int(z, 16, lanes * 2)) || - - false)) || + saturating_add(x, h_satadd(widening_mul(y, z), lanes)), + v_intrin("saturating_dot_product", x, z, y), + is_int(y, 16, lanes * 2) && is_int(z, 16, lanes * 2)) || + + false)) || false) { return mutate(rewrite.result); @@ -427,21 +424,21 @@ class Optimize_X86 : public IRMutator { // Fixed-point intrinsics should be lowered here. // This is safe because this mutator is top-down. 
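// The unsigned rounding_halving_add rule above maps directly onto
// pavgb/pavgw; the signed rewrite earlier in this visitor reduces the
// signed case to the unsigned one by reinterpreting the bits and adding a
// sign-bit correction. A sketch verifying that identity exhaustively for
// int8 (illustrative only):
#include <cassert>
#include <cstdint>

int main() {
    for (int a = -128; a <= 127; a++) {
        for (int b = -128; b <= 127; b++) {
            int8_t ref = int8_t((a + b + 1) >> 1);  // signed rounding halving add
            uint8_t ua = uint8_t(a), ub = uint8_t(b);
            uint8_t pavg = uint8_t((ua + ub + 1) >> 1);  // unsigned pavg semantics
            // Correction term ((x_uint ^ y_uint) & (1 << (bits - 1))) from the rule:
            uint8_t fixed = uint8_t(pavg + ((ua ^ ub) & 0x80));
            assert(int8_t(fixed) == ref);
        }
    }
    return 0;
}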
if (op->is_intrinsic({ - Call::halving_add, - Call::halving_sub, - Call::mul_shift_right, - Call::rounding_halving_add, - Call::rounding_mul_shift_right, - Call::rounding_shift_left, - Call::rounding_shift_right, - Call::saturating_add, - Call::saturating_sub, - Call::sorted_avg, - Call::widening_add, - Call::widening_mul, - Call::widening_shift_left, - Call::widening_shift_right, - Call::widening_sub, + Call::halving_add, + Call::halving_sub, + Call::mul_shift_right, + Call::rounding_halving_add, + Call::rounding_mul_shift_right, + Call::rounding_shift_left, + Call::rounding_shift_right, + Call::saturating_add, + Call::saturating_sub, + Call::sorted_avg, + Call::widening_add, + Call::widening_mul, + Call::widening_shift_left, + Call::widening_shift_right, + Call::widening_sub, })) { // TODO: Should we have a base-class that does this + the VectorReduce lowering needed below? return mutate(lower_intrinsic(op)); @@ -473,51 +470,51 @@ class Optimize_X86 : public IRMutator { // 2-way dot-products, int16 -> int32 is always supported (via SSE2). ((factor == 2) && (rewrite( - h_add(cast(Int(32, value_lanes), widening_mul(x, y)), lanes), - v_intrin("dot_product", cast(Int(16, value_lanes), x), cast(Int(16, value_lanes), y)), - x_is_int_or_uint && y_is_int_or_uint) || - - // Horizontal widening add via pmaddwd - rewrite( - h_add(cast(Int(32, value_lanes), x), lanes), - v_intrin("dot_product", x, make_const(Int(16, value_lanes), 1)), - is_int(x, 16)) || + h_add(cast(Int(32, value_lanes), widening_mul(x, y)), lanes), + v_intrin("dot_product", cast(Int(16, value_lanes), x), cast(Int(16, value_lanes), y)), + x_is_int_or_uint && y_is_int_or_uint) || + + // Horizontal widening add via pmaddwd + rewrite( + h_add(cast(Int(32, value_lanes), x), lanes), + v_intrin("dot_product", x, make_const(Int(16, value_lanes), 1)), + is_int(x, 16)) || - (rewrite( - h_add(widening_mul(x, y), lanes), - v_intrin("dot_product", x, y), - is_int(x, 16) && is_int(y, 16))) || - - // pmaddub supported via SSE41 - (target.has_feature(Target::SSE41) && - // Horizontal widening adds using 2-way saturating dot products. (rewrite( - h_add(cast(UInt(16, value_lanes), x), lanes), - cast(UInt(16, lanes), typed(Int(16, lanes), v_intrin("saturating_dot_product", x, make_const(Int(8, value_lanes), 1)))), - is_uint(x, 8)) || - - rewrite( - h_add(cast(Int(16, value_lanes), x), lanes), - v_intrin("saturating_dot_product", x, make_const(Int(8, value_lanes), 1)), - is_uint(x, 8)) || - - rewrite( - h_add(cast(Int(16, value_lanes), x), lanes), - v_intrin("saturating_dot_product", make_const(UInt(8, value_lanes), 1), x), - is_int(x, 8)) || - - // SSE41 and AVX2 support horizontal_add via phadd intrinsics. - rewrite( - h_add(x, lanes), - v_intrin("horizontal_add", x), - is_int(x, 16, lanes * 2) || is_uint(x, 16, lanes * 2) || - is_int(x, 32, lanes * 2) || is_uint(x, 32, lanes * 2)) || + h_add(widening_mul(x, y), lanes), + v_intrin("dot_product", x, y), + is_int(x, 16) && is_int(y, 16))) || + + // pmaddub supported via SSE41 + (target.has_feature(Target::SSE41) && + // Horizontal widening adds using 2-way saturating dot products. 
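// The horizontal widening adds in this section reuse the dot-product
// instructions against a constant vector of ones: pmaddwd(x, {1, 1, ...})
// sums each adjacent pair of lanes, which is exactly h_add of the widened
// input. Scalar model of the identity being exploited (illustrative only):
#include <cstdint>
#include <vector>

std::vector<int32_t> h_widening_add_pairs(const std::vector<int16_t> &x) {
    std::vector<int32_t> out(x.size() / 2);
    for (size_t i = 0; i < out.size(); i++) {
        // dot product against ones: x[2i]*1 + x[2i+1]*1
        out[i] = int32_t(x[2 * i]) + int32_t(x[2 * i + 1]);
    }
    return out;
}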
+ (rewrite( + h_add(cast(UInt(16, value_lanes), x), lanes), + cast(UInt(16, lanes), typed(Int(16, lanes), v_intrin("saturating_dot_product", x, make_const(Int(8, value_lanes), 1)))), + is_uint(x, 8)) || + + rewrite( + h_add(cast(Int(16, value_lanes), x), lanes), + v_intrin("saturating_dot_product", x, make_const(Int(8, value_lanes), 1)), + is_uint(x, 8)) || + + rewrite( + h_add(cast(Int(16, value_lanes), x), lanes), + v_intrin("saturating_dot_product", make_const(UInt(8, value_lanes), 1), x), + is_int(x, 8)) || + + // SSE41 and AVX2 support horizontal_add via phadd intrinsics. + rewrite( + h_add(x, lanes), + v_intrin("horizontal_add", x), + is_int(x, 16, lanes * 2) || is_uint(x, 16, lanes * 2) || + is_int(x, 32, lanes * 2) || is_uint(x, 32, lanes * 2)) || // TODO: add in Andrew's psadbw pattern. - false)) || + false)) || - false))) { + false))) { return mutate(rewrite.result); } break; @@ -528,16 +525,16 @@ class Optimize_X86 : public IRMutator { // Saturating dot products are supported via SSE41 and AVX2. ((factor == 2) && target.has_feature(Target::SSE41) && (rewrite( - h_satadd(widening_mul(x, y), lanes), - v_intrin("saturating_dot_product", x, y), - is_uint(x, 8) && is_int(y, 8)) || + h_satadd(widening_mul(x, y), lanes), + v_intrin("saturating_dot_product", x, y), + is_uint(x, 8) && is_int(y, 8)) || rewrite( - h_satadd(widening_mul(x, y), lanes), - v_intrin("saturating_dot_product", y, x), - is_int(x, 8) && is_uint(y, 8)) || + h_satadd(widening_mul(x, y), lanes), + v_intrin("saturating_dot_product", y, x), + is_int(x, 8) && is_uint(y, 8)) || - false))) { + false))) { return mutate(rewrite.result); } break; @@ -561,9 +558,7 @@ class Optimize_X86 : public IRMutator { IRMatcher::Wild<2> z; }; -} - - +} // namespace Stmt optimize_x86_instructions(Stmt s, const Target &t) { s = Optimize_X86(complete_x86_target(t)).mutate(s); @@ -579,10 +574,7 @@ Stmt optimize_x86_instructions(Stmt s, const Target &t) { return Stmt(); } - #endif // WITH_X86 } // namespace Internal } // namespace Halide - - From 58ff01ba76aa782080a7419b1b92a3f01b8ff5ba Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Mon, 25 Jul 2022 18:09:16 -0400 Subject: [PATCH 06/55] add VectorIntrinsic comment --- src/IR.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/IR.h b/src/IR.h index d732d1a43ea7..ad1ad2437123 100644 --- a/src/IR.h +++ b/src/IR.h @@ -886,6 +886,9 @@ struct Atomic : public StmtNode { static const IRNodeType _node_type = IRNodeType::Atomic; }; +/** Represent a target-specific vector instruction. + * Intrinsic may not be element-wise operation, i.e. + * dot_products. 
*/ struct VectorIntrinsic : public ExprNode { std::string name; std::vector args; From 0d30b56cf720d76a602e6652e84fdf62c1b12aa6 Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Mon, 25 Jul 2022 18:11:22 -0400 Subject: [PATCH 07/55] format --- src/IRMatch.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/IRMatch.h b/src/IRMatch.h index 457744f87a21..cc4a4b490664 100644 --- a/src/IRMatch.h +++ b/src/IRMatch.h @@ -1586,7 +1586,7 @@ auto bitwise_xor(A &&a, B &&b) noexcept -> Intrin -HALIDE_ALWAYS_INLINE auto operator^(A &&a, B &&b) noexcept -> auto{ +HALIDE_ALWAYS_INLINE auto operator^(A &&a, B &&b) noexcept -> auto { assert_is_lvalue_if_expr(); assert_is_lvalue_if_expr(); return bitwise_xor(a, b); @@ -1596,7 +1596,7 @@ auto bitwise_and(A &&a, B &&b) noexcept -> Intrin -HALIDE_ALWAYS_INLINE auto operator&(A &&a, B &&b) noexcept -> auto{ +HALIDE_ALWAYS_INLINE auto operator&(A &&a, B &&b) noexcept -> auto { assert_is_lvalue_if_expr(); assert_is_lvalue_if_expr(); return bitwise_and(a, b); @@ -1606,7 +1606,7 @@ auto bitwise_or(A &&a, B &&b) noexcept -> Intrin -HALIDE_ALWAYS_INLINE auto operator|(A &&a, B &&b) noexcept -> auto{ +HALIDE_ALWAYS_INLINE auto operator|(A &&a, B &&b) noexcept -> auto { assert_is_lvalue_if_expr(); assert_is_lvalue_if_expr(); return bitwise_or(a, b); From e9029a261d9e6f7bd2f544879b6e9583d9d51548 Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Mon, 25 Jul 2022 23:31:28 -0400 Subject: [PATCH 08/55] add missing horizontal_add x86Intrinsics --- src/CodeGen_X86.cpp | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/CodeGen_X86.cpp b/src/CodeGen_X86.cpp index 0e99407726b8..5d902e866115 100644 --- a/src/CodeGen_X86.cpp +++ b/src/CodeGen_X86.cpp @@ -159,6 +159,16 @@ const x86Intrinsic intrinsic_defs[] = { // LLVM does not provide an unmasked 128bit cvtneps2bf16 intrinsic, so provide a wrapper around the masked version. {"vcvtneps2bf16x4", BFloat(16, 4), "f32_to_bf16", {Float(32, 4)}, Target::AVX512_SapphireRapids}, + // Horizontal adds that use (v)phadd(w | d). + {"phaddw_sse3", UInt(16, 8), "horizontal_add", {UInt(16, 16)}, Target::SSE41}, + {"phaddw_sse3", Int(16, 8), "horizontal_add", {Int(16, 16)}, Target::SSE41}, + {"phaddw_avx2", UInt(16, 16), "horizontal_add", {UInt(16, 32)}, Target::AVX2}, + {"phaddw_avx2", Int(16, 16), "horizontal_add", {Int(16, 32)}, Target::AVX2}, + {"phaddd_sse3", UInt(32, 4), "horizontal_add", {UInt(32, 8)}, Target::SSE41}, + {"phaddd_sse3", Int(32, 4), "horizontal_add", {Int(32, 8)}, Target::SSE41}, + {"phaddd_avx2", UInt(32, 8), "horizontal_add", {UInt(32, 16)}, Target::AVX2}, + {"phaddd_avx2", Int(32, 8), "horizontal_add", {Int(32, 16)}, Target::AVX2}, + // 2-way dot products {"llvm.x86.avx2.pmadd.ub.sw", Int(16, 16), "saturating_dot_product", {UInt(8, 32), Int(8, 32)}, Target::AVX2}, {"llvm.x86.ssse3.pmadd.ub.sw.128", Int(16, 8), "saturating_dot_product", {UInt(8, 16), Int(8, 16)}, Target::SSE41}, From 9d2deb482c9d1abdf2c32ba48c6df8e52ff73ca8 Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Mon, 25 Jul 2022 23:33:50 -0400 Subject: [PATCH 09/55] fix bfloat16 abs issue --- src/X86Optimize.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/X86Optimize.cpp b/src/X86Optimize.cpp index 16bad9be4413..691df83b000f 100644 --- a/src/X86Optimize.cpp +++ b/src/X86Optimize.cpp @@ -357,9 +357,9 @@ class Optimize_X86 : public IRMutator { // be handled here? // int(8 | 16 | 32) -> uint is supported via SSE41 - // float is always supported (via SSE2). 
+ // float32 is always supported (via SSE2). (((target.has_feature(Target::SSE41) && bits <= 32) || - op->type.is_float()) && + (op->type.is_float() && bits == 32) && rewrite( abs(x), v_intrin("abs", x))) || From 1a51b8331e34ab9932699797886b4e1ae2e5311a Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Mon, 25 Jul 2022 23:36:07 -0400 Subject: [PATCH 10/55] fix unhandled bitwise_or in IRMatch.h --- src/IRMatch.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/IRMatch.h b/src/IRMatch.h index cc4a4b490664..44082d8a76c7 100644 --- a/src/IRMatch.h +++ b/src/IRMatch.h @@ -1449,6 +1449,8 @@ struct Intrin { return arg0 ^ arg1; } else if (intrin == Call::bitwise_and) { return arg0 & arg1; + } else if (intrin == Call::bitwise_or) { + return arg0 | arg1; } Expr arg2 = std::get(args).make(state, type_hint); From 614b7ea76dd42166790459a18c3075b181007c25 Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Mon, 25 Jul 2022 23:47:23 -0400 Subject: [PATCH 11/55] missing paren --- src/X86Optimize.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/X86Optimize.cpp b/src/X86Optimize.cpp index 691df83b000f..381b2daf4f3e 100644 --- a/src/X86Optimize.cpp +++ b/src/X86Optimize.cpp @@ -359,7 +359,7 @@ class Optimize_X86 : public IRMutator { // int(8 | 16 | 32) -> uint is supported via SSE41 // float32 is always supported (via SSE2). (((target.has_feature(Target::SSE41) && bits <= 32) || - (op->type.is_float() && bits == 32) && + (op->type.is_float() && bits == 32)) && rewrite( abs(x), v_intrin("abs", x))) || From a5b7e72b88f8c5d06f6d09bb3aa92c50cb0d3c64 Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Tue, 26 Jul 2022 00:06:29 -0400 Subject: [PATCH 12/55] fix buildbot failures (I hope?) --- src/CodeGen_C.cpp | 5 +++++ src/CodeGen_C.h | 1 + src/Deinterleave.cpp | 5 +++++ src/Derivative.cpp | 3 +++ src/ModulusRemainder.cpp | 6 ++++++ src/Monotonic.cpp | 5 +++++ 6 files changed, 25 insertions(+) diff --git a/src/CodeGen_C.cpp b/src/CodeGen_C.cpp index c5b64f0610f5..1477bcd71316 100644 --- a/src/CodeGen_C.cpp +++ b/src/CodeGen_C.cpp @@ -2829,6 +2829,11 @@ Expr CodeGen_C::scalarize_vector_reduce(const VectorReduce *op) { return Shuffle::make_concat(lanes); } +void CodeGen_C::visit(const VectorIntrinsic *op) { + internal_error << "CodeGen_C should never receive a VectorIntrinsic, received:\n" + << Expr(op) << "\n"; +} + void CodeGen_C::visit(const VectorReduce *op) { stream << get_indent() << "// Vector reduce: " << op->op << "\n"; diff --git a/src/CodeGen_C.h b/src/CodeGen_C.h index 9c06d4bb5630..2427aeef32dc 100644 --- a/src/CodeGen_C.h +++ b/src/CodeGen_C.h @@ -235,6 +235,7 @@ class CodeGen_C : public IRPrinter { void visit(const Fork *) override; void visit(const Acquire *) override; void visit(const Atomic *) override; + void visit(const VectorIntrinsic *) override; void visit(const VectorReduce *) override; void visit_binop(Type t, const Expr &a, const Expr &b, const char *op); diff --git a/src/Deinterleave.cpp b/src/Deinterleave.cpp index e368d851d615..30a5c012a7ba 100644 --- a/src/Deinterleave.cpp +++ b/src/Deinterleave.cpp @@ -195,6 +195,11 @@ class Deinterleaver : public IRGraphMutator { using IRMutator::visit; + Expr visit(const VectorIntrinsic *op) override { + internal_error << "Deinterleaver should never receive VectorIntrinsic node, received:\n" + << Expr(op) << "\n"; + } + Expr visit(const VectorReduce *op) override { std::vector input_lanes; int factor = op->value.type().lanes() / op->type.lanes(); diff --git a/src/Derivative.cpp b/src/Derivative.cpp index 
c536eeea92ae..851faad0cd4b 100644 --- a/src/Derivative.cpp +++ b/src/Derivative.cpp @@ -88,6 +88,9 @@ class ReverseAccumulationVisitor : public IRVisitor { void visit(const Shuffle *op) override { internal_error << "Encounter unexpected expression \"Shuffle\" when differentiating."; } + void visit(const VectorIntrinsic *op) override { + internal_error << "Encounter unexpected expression \"VectorIntrinsic\" when differentiating."; + } void visit(const VectorReduce *op) override { internal_error << "Encounter unexpected expression \"VectorReduce\" when differentiating."; } diff --git a/src/ModulusRemainder.cpp b/src/ModulusRemainder.cpp index 34a598e4c7e3..cb2b54957948 100644 --- a/src/ModulusRemainder.cpp +++ b/src/ModulusRemainder.cpp @@ -74,6 +74,7 @@ class ComputeModulusRemainder : public IRVisitor { void visit(const Free *) override; void visit(const Evaluate *) override; void visit(const Shuffle *) override; + void visit(const VectorIntrinsic *) override; void visit(const VectorReduce *) override; void visit(const Prefetch *) override; void visit(const Atomic *) override; @@ -213,6 +214,11 @@ void ComputeModulusRemainder::visit(const Shuffle *op) { result = ModulusRemainder{}; } +void ComputeModulusRemainder::visit(const VectorIntrinsic *op) { + internal_error << "modulus_remainder of VectorIntrinsic:\n" << Expr(op) << "\n"; + result = ModulusRemainder{}; +} + void ComputeModulusRemainder::visit(const VectorReduce *op) { internal_assert(op->type.is_scalar()) << "modulus_remainder of vector\n"; result = ModulusRemainder{}; diff --git a/src/Monotonic.cpp b/src/Monotonic.cpp index ae8978b2cb57..2c51c7fa960b 100644 --- a/src/Monotonic.cpp +++ b/src/Monotonic.cpp @@ -534,6 +534,11 @@ class DerivativeBounds : public IRVisitor { result = ConstantInterval::single_point(0); } + void visit(const VectorIntrinsic *op) override { + // TODO(rootjalex): Should this be an error? 
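// Returning ConstantInterval::everything() is the conservative answer for
// an opaque node: with no information about the instruction's behavior,
// the only sound derivative bound is the unbounded interval. A minimal
// sketch of the idea with a hypothetical interval type (not the Halide
// class):
#include <optional>

struct IntervalSketch {
    std::optional<long> lower, upper;  // empty optional = unbounded
    static IntervalSketch everything() {
        return {};  // no bound in either direction
    }
};

// An analysis that meets a node it cannot reason about must widen to this.
IntervalSketch bounds_of_unknown_node() {
    return IntervalSketch::everything();
}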
+ result = ConstantInterval::everything(); + } + void visit(const VectorReduce *op) override { op->value.accept(this); switch (op->op) { From c58f85e04b6129709e58b5f08d3cc8ca296fa633 Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Tue, 26 Jul 2022 00:08:16 -0400 Subject: [PATCH 13/55] clang-format --- src/ModulusRemainder.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/ModulusRemainder.cpp b/src/ModulusRemainder.cpp index cb2b54957948..be0226b44e56 100644 --- a/src/ModulusRemainder.cpp +++ b/src/ModulusRemainder.cpp @@ -215,7 +215,8 @@ void ComputeModulusRemainder::visit(const Shuffle *op) { } void ComputeModulusRemainder::visit(const VectorIntrinsic *op) { - internal_error << "modulus_remainder of VectorIntrinsic:\n" << Expr(op) << "\n"; + internal_error << "modulus_remainder of VectorIntrinsic:\n" + << Expr(op) << "\n"; result = ModulusRemainder{}; } From c9efd3221e7c62cb4ac69d6651c7345938007875 Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Tue, 26 Jul 2022 00:18:55 -0400 Subject: [PATCH 14/55] add empty Expr return to Deinterleaver::visic(const VectorIntrinsic*) --- src/Deinterleave.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Deinterleave.cpp b/src/Deinterleave.cpp index 30a5c012a7ba..0a68517f7f41 100644 --- a/src/Deinterleave.cpp +++ b/src/Deinterleave.cpp @@ -198,6 +198,7 @@ class Deinterleaver : public IRGraphMutator { Expr visit(const VectorIntrinsic *op) override { internal_error << "Deinterleaver should never receive VectorIntrinsic node, received:\n" << Expr(op) << "\n"; + return Expr(); } Expr visit(const VectorReduce *op) override { From 0e94961a68a7d3173591d3da11ade43aa3a9e92c Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Tue, 26 Jul 2022 01:00:49 -0400 Subject: [PATCH 15/55] fix horizontal_add references --- src/runtime/x86_avx2.ll | 32 ++++++++++++++++++++++++++++++++ src/runtime/x86_sse41.ll | 17 +++++++++++++++++ 2 files changed, 49 insertions(+) diff --git a/src/runtime/x86_avx2.ll b/src/runtime/x86_avx2.ll index 1a80f5b583d3..370dd5a84a24 100644 --- a/src/runtime/x86_avx2.ll +++ b/src/runtime/x86_avx2.ll @@ -72,3 +72,35 @@ define weak_odr <16 x i16> @hadd_pmadd_i8_avx2(<32 x i8> %a) nounwind alwaysinli ret <16 x i16> %1 } declare <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8>, <32 x i8>) nounwind readnone + +define weak_odr <16 x i16> @phaddw_avx2(<32 x i16> %a) nounwind alwaysinline { + %1 = shufflevector <32 x i16> %a, <32 x i16> undef, <16 x i32> + %2 = shufflevector <32 x i16> %a, <32 x i16> undef, <16 x i32> + %3 = tail call <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16> %1, <16 x i16> %2) + ret <16 x i16> %3 + } + declare <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16>, <16 x i16>) nounwind readnone + + define weak_odr <8 x i32> @phaddd_avx2(<16 x i32> %a) nounwind alwaysinline { + %1 = shufflevector <16 x i32> %a, <16 x i32> undef, <8 x i32> + %2 = shufflevector <16 x i32> %a, <16 x i32> undef, <8 x i32> + %3 = tail call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> %1, <8 x i32> %2) + ret <8 x i32> %3 + } + declare <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32>, <8 x i32>) nounwind readnone + + define weak_odr <8 x i32> @hadd_pmadd_i16_avx2(<16 x i16> %a) nounwind alwaysinline { + %1 = tail call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> %a, <16 x i16> ) + ret <8 x i32> %1 + } + + define weak_odr <8 x i32> @wmul_pmaddwd_avx2(<8 x i16> %a, <8 x i16> %b) nounwind alwaysinline { + %1 = zext <8 x i16> %a to <8 x i32> + %2 = zext <8 x i16> %b to <8 x i32> + %3 = bitcast <8 x i32> %1 to <16 x i16> + %4 = bitcast <8 x 
i32> %2 to <16 x i16> + %res = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> %3, <16 x i16> %4) + ret <8 x i32> %res + } + declare <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16>, <16 x i16>) nounwind readnone + \ No newline at end of file diff --git a/src/runtime/x86_sse41.ll b/src/runtime/x86_sse41.ll index f109ee37ec23..b49faca965d9 100644 --- a/src/runtime/x86_sse41.ll +++ b/src/runtime/x86_sse41.ll @@ -92,3 +92,20 @@ define weak_odr <8 x i16> @hadd_pmadd_i8_sse3(<16 x i8> %a) nounwind alwaysinlin ret <8 x i16> %1 } declare <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8>, <16 x i8>) nounwind readnone + +define weak_odr <8 x i16> @phaddw_sse3(<16 x i16> %a) nounwind alwaysinline { + %1 = shufflevector <16 x i16> %a, <16 x i16> undef, <8 x i32> + %2 = shufflevector <16 x i16> %a, <16 x i16> undef, <8 x i32> + %3 = tail call <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16> %1, <8 x i16> %2) + ret <8 x i16> %3 + } + declare <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16>, <8 x i16>) nounwind readnone + + define weak_odr <4 x i32> @phaddd_sse3(<8 x i32> %a) nounwind alwaysinline { + %1 = shufflevector <8 x i32> %a, <8 x i32> undef, <4 x i32> + %2 = shufflevector <8 x i32> %a, <8 x i32> undef, <4 x i32> + %3 = tail call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> %1, <4 x i32> %2) + ret <4 x i32> %3 + } + declare <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32>, <4 x i32>) nounwind readnone + \ No newline at end of file From fb538e38015d434ffc6fd704bdada4c5cd5551bf Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Tue, 26 Jul 2022 01:03:27 -0400 Subject: [PATCH 16/55] fix bfloat16 abs issue (again) --- src/X86Optimize.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/X86Optimize.cpp b/src/X86Optimize.cpp index 381b2daf4f3e..0572ac3b2c75 100644 --- a/src/X86Optimize.cpp +++ b/src/X86Optimize.cpp @@ -358,7 +358,7 @@ class Optimize_X86 : public IRMutator { // int(8 | 16 | 32) -> uint is supported via SSE41 // float32 is always supported (via SSE2). - (((target.has_feature(Target::SSE41) && bits <= 32) || + (((target.has_feature(Target::SSE41) && op->type.is_int() && bits <= 32) || (op->type.is_float() && bits == 32)) && rewrite( abs(x), From c2a61752e4d01539fc71d8c03610b99884068dc0 Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Tue, 26 Jul 2022 01:16:01 -0400 Subject: [PATCH 17/55] fix instruction selection location --- src/CodeGen_X86.cpp | 39 +++++++++++++++++++++++++++++++++++++++ src/Lower.cpp | 7 ------- 2 files changed, 39 insertions(+), 7 deletions(-) diff --git a/src/CodeGen_X86.cpp b/src/CodeGen_X86.cpp index 5d902e866115..73959adf78ac 100644 --- a/src/CodeGen_X86.cpp +++ b/src/CodeGen_X86.cpp @@ -30,6 +30,9 @@ class CodeGen_X86 : public CodeGen_Posix { CodeGen_X86(Target); protected: + void compile_func(const LoweredFunc &f, + const std::string &simple_name, const std::string &extern_name) override; + string mcpu_target() const override; string mcpu_tune() const override; string mattrs() const override; @@ -249,6 +252,42 @@ void CodeGen_X86::init_module() { } } +// FIXME: This is nearly identical to CodeGen_LLVM, should re-factor this somehow. +// Only difference is the call to `optimize_x86_instructions()` +void CodeGen_X86::compile_func(const LoweredFunc &f, const std::string &simple_name, + const std::string &extern_name) { + // Generate the function declaration and argument unpacking code. 
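// The phaddw/phaddd wrappers above expose "horizontal_add": each output
// lane is the sum of two adjacent input lanes, with the double-width input
// split across the two operands of the underlying (v)phadd intrinsic (the
// shufflevector masks in the .ll handle the 128-bit-lane ordering of the
// AVX2 form). Scalar model of the exposed semantics (illustrative only):
#include <cstdint>
#include <vector>

std::vector<int16_t> horizontal_add_model(const std::vector<int16_t> &a) {
    std::vector<int16_t> out(a.size() / 2);
    for (size_t i = 0; i < out.size(); i++) {
        // Wraps on overflow, like phaddw.
        out[i] = int16_t(a[2 * i] + a[2 * i + 1]);
    }
    return out;
}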
+ begin_func(f.linkage, simple_name, extern_name, f.args); + + // If building with MSAN, ensure that calls to halide_msan_annotate_buffer_is_initialized() + // happen for every output buffer if the function succeeds. + if (f.linkage != LinkageType::Internal && + target.has_feature(Target::MSAN)) { + llvm::Function *annotate_buffer_fn = + module->getFunction("halide_msan_annotate_buffer_is_initialized_as_destructor"); + internal_assert(annotate_buffer_fn) + << "Could not find halide_msan_annotate_buffer_is_initialized_as_destructor in module\n"; + annotate_buffer_fn->addParamAttr(0, Attribute::NoAlias); + for (const auto &arg : f.args) { + if (arg.kind == Argument::OutputBuffer) { + register_destructor(annotate_buffer_fn, sym_get(arg.name + ".buffer"), OnSuccess); + } + } + } + + // Generate the function body. + debug(1) << "Generating llvm bitcode for function " << f.name << "...\n"; + debug(1) << "X86: Optimizing vector instructions...\n"; + Stmt body = optimize_x86_instructions(f.body, target); + debug(2) << "X86: Lowering after vector instructions:\n" + << body << "\n\n"; + + body.accept(this); + + // Clean up and return. + end_func(f.args); +} + void CodeGen_X86::visit(const GT *op) { Type t = op->a.type(); diff --git a/src/Lower.cpp b/src/Lower.cpp index 20d98a20562a..f25f209ecea4 100644 --- a/src/Lower.cpp +++ b/src/Lower.cpp @@ -444,13 +444,6 @@ void lower_impl(const vector &output_funcs, debug(1) << "Skipping GPU offload...\n"; } - if (t.arch == Target::X86) { - debug(1) << "Performing x86-specific vector instruction selection...\n"; - s = optimize_x86_instructions(s, t); - debug(2) << "Lowering after performing x86-specific vector instruction selection:\n" - << s << "\n\n"; - } - // TODO: This needs to happen before lowering parallel tasks, because global // images used inside parallel loops are rewritten from loads from images to // loads from closure parameters. Closure parameters are missing the Buffer<> From 78edb81d539bb92db18047b97cdb0b7eba6b6b3d Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Tue, 26 Jul 2022 01:18:30 -0400 Subject: [PATCH 18/55] clang format --- src/CodeGen_X86.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/CodeGen_X86.cpp b/src/CodeGen_X86.cpp index 73959adf78ac..07269145e2d2 100644 --- a/src/CodeGen_X86.cpp +++ b/src/CodeGen_X86.cpp @@ -255,7 +255,7 @@ void CodeGen_X86::init_module() { // FIXME: This is nearly identical to CodeGen_LLVM, should re-factor this somehow. // Only difference is the call to `optimize_x86_instructions()` void CodeGen_X86::compile_func(const LoweredFunc &f, const std::string &simple_name, - const std::string &extern_name) { + const std::string &extern_name) { // Generate the function declaration and argument unpacking code. 
begin_func(f.linkage, simple_name, extern_name, f.args); From 53c560b72f4b37aa35010afadb406c8d46ffbe58 Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Tue, 26 Jul 2022 10:57:05 -0400 Subject: [PATCH 19/55] fix virtual function hidden error --- src/X86Optimize.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/X86Optimize.cpp b/src/X86Optimize.cpp index 0572ac3b2c75..ed58b43dc6e3 100644 --- a/src/X86Optimize.cpp +++ b/src/X86Optimize.cpp @@ -101,6 +101,8 @@ class Optimize_X86 : public IRMutator { return type.is_vector(); } + using IRMutator::visit; + Expr visit(const Div *op) override { if (!should_peephole_optimize(op->type) || !op->type.is_int_or_uint()) { return IRMutator::visit(op); From 2cfc0c185dde714aabc3abcd1f84901c9b037db0 Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Tue, 26 Jul 2022 11:06:59 -0400 Subject: [PATCH 20/55] fix absd codegen bug --- src/X86Optimize.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/X86Optimize.cpp b/src/X86Optimize.cpp index ed58b43dc6e3..c040befb4194 100644 --- a/src/X86Optimize.cpp +++ b/src/X86Optimize.cpp @@ -330,16 +330,16 @@ class Optimize_X86 : public IRMutator { // // Solution due to Wojciech Mula: // http://0x80.pl/notesen/2018-03-11-sse-abs-unsigned.html - (op->type.is_uint() && - rewrite( - absd(x, y), - saturating_sub(x, y) | saturating_sub(y, x))) || + rewrite( + absd(x, y), + saturating_sub(x, y) | saturating_sub(y, x), + is_uint(x) && is_uint(y)) || // Current best way to lower absd on x86. - (op->type.is_int() && - rewrite( - absd(x, y), - max(x, y) - min(x, y))) || + rewrite( + absd(x, y), + max(x, y) - min(x, y), + is_int(x) && is_int(y)) || // pmulh is always supported (via SSE2). ((op->type.is_int_or_uint() && bits == 16) && From 0675e8605157c39b2698bb566b2affea43029a44 Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Tue, 26 Jul 2022 12:43:36 -0400 Subject: [PATCH 21/55] attempt to fix x86 vector-reduction splitting --- src/CodeGen_LLVM.cpp | 30 +++++++++++++------------- src/CodeGen_LLVM.h | 9 ++++++++ src/CodeGen_X86.cpp | 26 ++++++++++++++++++++++- src/X86Optimize.cpp | 50 ++++++++++++++------------------------------ src/X86Optimize.h | 5 ++--- 5 files changed, 67 insertions(+), 53 deletions(-) diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp index 14db16dba353..c13ef1bdd0ac 100644 --- a/src/CodeGen_LLVM.cpp +++ b/src/CodeGen_LLVM.cpp @@ -4009,7 +4009,7 @@ void CodeGen_LLVM::visit(const VectorReduce *op) { codegen_vector_reduce(op, Expr()); } -void CodeGen_LLVM::codegen_vector_reduce(const VectorReduce *op, const Expr &init) { +Expr CodeGen_LLVM::split_vector_reduce(const VectorReduce *op, const Expr &init) const { Expr val = op->value; const int output_lanes = op->type.lanes(); const int native_lanes = native_vector_bits() / op->type.bits(); @@ -4049,8 +4049,7 @@ void CodeGen_LLVM::codegen_vector_reduce(const VectorReduce *op, const Expr &ini equiv = max(equiv, init); } equiv = cast(op->type, equiv); - equiv.accept(this); - return; + return equiv; } if (op->type.is_bool() && op->op == VectorReduce::And) { @@ -4061,8 +4060,7 @@ void CodeGen_LLVM::codegen_vector_reduce(const VectorReduce *op, const Expr &ini if (init.defined()) { equiv = min(equiv, init); } - equiv.accept(this); - return; + return equiv; } if (elt == Float(16) && upgrade_type_for_arithmetic(elt) != elt) { @@ -4072,8 +4070,7 @@ void CodeGen_LLVM::codegen_vector_reduce(const VectorReduce *op, const Expr &ini equiv = binop(equiv, init); } equiv = cast(op->type, equiv); - equiv.accept(this); - 
return; + return equiv; } if (output_lanes == 1) { @@ -4172,8 +4169,7 @@ void CodeGen_LLVM::codegen_vector_reduce(const VectorReduce *op, const Expr &ini if (initial_value.defined()) { equiv = binop(initial_value, equiv); } - equiv.accept(this); - return; + return equiv; } } @@ -4196,8 +4192,7 @@ void CodeGen_LLVM::codegen_vector_reduce(const VectorReduce *op, const Expr &ini equiv = binop(equiv, init); } equiv = common_subexpression_elimination(equiv); - equiv.accept(this); - return; + return equiv; } if (factor > 2 && ((factor & 1) == 0)) { @@ -4229,8 +4224,7 @@ void CodeGen_LLVM::codegen_vector_reduce(const VectorReduce *op, const Expr &ini equiv = binop(equiv, init); } equiv = common_subexpression_elimination(equiv); - codegen(equiv); - return; + return equiv; } // Extract each slice and combine @@ -4244,8 +4238,14 @@ void CodeGen_LLVM::codegen_vector_reduce(const VectorReduce *op, const Expr &ini } } equiv = common_subexpression_elimination(equiv); - codegen(equiv); -} // namespace Internal + return equiv; +} + +void CodeGen_LLVM::codegen_vector_reduce(const VectorReduce *op, const Expr &init) { + Expr equiv = split_vector_reduce(op, init); + equiv.accept(this); + return; +} void CodeGen_LLVM::visit(const Atomic *op) { if (!op->mutex_name.empty()) { diff --git a/src/CodeGen_LLVM.h b/src/CodeGen_LLVM.h index 7c5078428431..1a479f7829e7 100644 --- a/src/CodeGen_LLVM.h +++ b/src/CodeGen_LLVM.h @@ -507,6 +507,15 @@ class CodeGen_LLVM : public IRVisitor { * across backends. */ virtual void codegen_vector_reduce(const VectorReduce *op, const Expr &init); + // TODO: this probably shouldn't be public, or should be moved where the rest of + // the public methods are. +public: + /** Split up a VectorReduce node if possible, or generate LLVM + intrinsics for full reductions. This is used in + `codegen_vector_reduce`. **/ + virtual Expr split_vector_reduce(const VectorReduce *op, const Expr &init) const; + +protected: /** Are we inside an atomic node that uses mutex locks? This is used for detecting deadlocks from nested atomics & illegal vectorization. */ bool inside_atomic_mutex_node; diff --git a/src/CodeGen_X86.cpp b/src/CodeGen_X86.cpp index 07269145e2d2..ba2c44579d59 100644 --- a/src/CodeGen_X86.cpp +++ b/src/CodeGen_X86.cpp @@ -22,6 +22,30 @@ using namespace llvm; namespace { +// Populate feature flags in a target according to those implied by +// existing flags, so that instruction patterns can just check for the +// oldest feature flag that supports an instruction. +Target complete_x86_target(Target t) { + if (t.has_feature(Target::AVX512_SapphireRapids)) { + t.set_feature(Target::AVX512_Cannonlake); + } + if (t.has_feature(Target::AVX512_Cannonlake)) { + t.set_feature(Target::AVX512_Skylake); + } + if (t.has_feature(Target::AVX512_Cannonlake) || + t.has_feature(Target::AVX512_Skylake) || + t.has_feature(Target::AVX512_KNL)) { + t.set_feature(Target::AVX2); + } + if (t.has_feature(Target::AVX2)) { + t.set_feature(Target::AVX); + } + if (t.has_feature(Target::AVX)) { + t.set_feature(Target::SSE41); + } + return t; +} + /** A code generator that emits x86 code from a given Halide stmt. */ class CodeGen_X86 : public CodeGen_Posix { public: @@ -278,7 +302,7 @@ void CodeGen_X86::compile_func(const LoweredFunc &f, const std::string &simple_n // Generate the function body. 
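// split_vector_reduce peels a wide reduction down to widths the pattern
// matchers (and the ISA) can express: an even reduction factor is split
// into a pairwise combine followed by a reduction of half the factor. A
// simplified scalar model of that shape, assuming a power-of-two factor
// (illustrative only; the real code also handles odd factors, initial
// values, and non-add operators):
#include <cstdint>
#include <utility>
#include <vector>

std::vector<int32_t> reduce_add(std::vector<int32_t> v, int factor) {
    while (factor > 1) {
        std::vector<int32_t> halved(v.size() / 2);
        for (size_t i = 0; i < halved.size(); i++) {
            halved[i] = v[2 * i] + v[2 * i + 1];  // combine one adjacent pair
        }
        v = std::move(halved);
        factor /= 2;
    }
    return v;
}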
debug(1) << "Generating llvm bitcode for function " << f.name << "...\n"; debug(1) << "X86: Optimizing vector instructions...\n"; - Stmt body = optimize_x86_instructions(f.body, target); + Stmt body = optimize_x86_instructions(f.body, target, this); debug(2) << "X86: Lowering after vector instructions:\n" << body << "\n\n"; diff --git a/src/X86Optimize.cpp b/src/X86Optimize.cpp index c040befb4194..f4c2b758db55 100644 --- a/src/X86Optimize.cpp +++ b/src/X86Optimize.cpp @@ -13,30 +13,6 @@ namespace Halide { namespace Internal { -// Populate feature flags in a target according to those implied by -// existing flags, so that instruction patterns can just check for the -// oldest feature flag that supports an instruction. -Target complete_x86_target(Target t) { - if (t.has_feature(Target::AVX512_SapphireRapids)) { - t.set_feature(Target::AVX512_Cannonlake); - } - if (t.has_feature(Target::AVX512_Cannonlake)) { - t.set_feature(Target::AVX512_Skylake); - } - if (t.has_feature(Target::AVX512_Cannonlake) || - t.has_feature(Target::AVX512_Skylake) || - t.has_feature(Target::AVX512_KNL)) { - t.set_feature(Target::AVX2); - } - if (t.has_feature(Target::AVX2)) { - t.set_feature(Target::AVX); - } - if (t.has_feature(Target::AVX)) { - t.set_feature(Target::SSE41); - } - return t; -} - #if defined(WITH_X86) namespace { @@ -89,8 +65,8 @@ class Optimize_X86 : public IRMutator { public: /** Create an x86 code optimizer. Processor features can be * enabled using the appropriate flags in the target struct. */ - Optimize_X86(const Target &t) - : target(t) { + Optimize_X86(const Target &t, const CodeGen_LLVM *c) + : target(t), codegen(c) { } protected: @@ -545,15 +521,21 @@ class Optimize_X86 : public IRMutator { break; } - // FIXME: We need to split up VectorReduce nodes in the same way that - // CodeGen_LLVM::codegen_vector_reduce does, in order to do all - // matching here. + return attempt_vector_split(op); + } - return IRMutator::visit(op); + Expr attempt_vector_split(const VectorReduce *op) { + Expr split = codegen->split_vector_reduce(op, Expr()); + if (split.defined() && !split.same_as(op)) { + return mutate(split); + } else { + return IRMutator::visit(op); + } } private: const Target ⌖ + const CodeGen_LLVM *codegen; IRMatcher::Wild<0> x; IRMatcher::Wild<1> y; @@ -562,11 +544,11 @@ class Optimize_X86 : public IRMutator { } // namespace -Stmt optimize_x86_instructions(Stmt s, const Target &t) { - s = Optimize_X86(complete_x86_target(t)).mutate(s); +Stmt optimize_x86_instructions(Stmt stmt, const Target &target, const CodeGen_LLVM *codegen) { + stmt = Optimize_X86(target, codegen).mutate(stmt); // Some of the rules above can introduce repeated sub-terms, so run CSE again. - s = common_subexpression_elimination(s); - return s; + stmt = common_subexpression_elimination(stmt); + return stmt; } #else // WITH_X86 diff --git a/src/X86Optimize.h b/src/X86Optimize.h index bcf375cae1d5..df37c7dd896f 100644 --- a/src/X86Optimize.h +++ b/src/X86Optimize.h @@ -5,6 +5,7 @@ * Tools for optimizing IR for x86. */ +#include "CodeGen_LLVM.h" #include "Expr.h" #include "Target.h" @@ -12,9 +13,7 @@ namespace Halide { namespace Internal { /** Perform vector instruction selection, inserting VectorIntrinsic nodes. 
*/ -Stmt optimize_x86_instructions(Stmt s, const Target &t); - -Target complete_x86_target(Target t); +Stmt optimize_x86_instructions(Stmt stmt, const Target &target, const CodeGen_LLVM *codegen); } // namespace Internal } // namespace Halide From 6471226694d45feae15ac7893cab29e9d77382f7 Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Tue, 26 Jul 2022 13:02:16 -0400 Subject: [PATCH 22/55] clang tidy --- src/CodeGen_LLVM.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp index c13ef1bdd0ac..f821aaf670d4 100644 --- a/src/CodeGen_LLVM.cpp +++ b/src/CodeGen_LLVM.cpp @@ -4244,7 +4244,6 @@ Expr CodeGen_LLVM::split_vector_reduce(const VectorReduce *op, const Expr &init) void CodeGen_LLVM::codegen_vector_reduce(const VectorReduce *op, const Expr &init) { Expr equiv = split_vector_reduce(op, init); equiv.accept(this); - return; } void CodeGen_LLVM::visit(const Atomic *op) { From fb8216607297130f2e84dc63bf7bdb548e3d73f7 Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Tue, 26 Jul 2022 15:14:13 -0400 Subject: [PATCH 23/55] fix MSVC templating bug --- src/IRMatch.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/IRMatch.h b/src/IRMatch.h index 44082d8a76c7..2d1394742713 100644 --- a/src/IRMatch.h +++ b/src/IRMatch.h @@ -1948,10 +1948,10 @@ struct VectorIntrinOp { r_args[0] = std::get<0>(args).make(state, {}); if constexpr (sizeof...(Args) > 1) { - r_args[1] = std::get<1>(args).make(state, {}); + r_args[1] = std::get(args).make(state, {}); } if constexpr (sizeof...(Args) > 2) { - r_args[2] = std::get<2>(args).make(state, {}); + r_args[2] = std::get(args).make(state, {}); } // for (int i = 0; i < sizeof...(Args); i++) { From f0926064940725259e6bede767d986f2302e78ba Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Wed, 27 Jul 2022 17:31:44 -0400 Subject: [PATCH 24/55] implement Andrew's requested changes --- src/Bounds.cpp | 4 +-- src/CodeGen_C.cpp | 4 +-- src/CodeGen_C.h | 2 +- src/CodeGen_LLVM.cpp | 4 +-- src/CodeGen_LLVM.h | 2 +- src/CodeGen_X86.cpp | 7 ++-- src/Deinterleave.cpp | 4 +-- src/Derivative.cpp | 4 +-- src/Expr.h | 2 +- src/IR.cpp | 45 +++++++++++++++++++----- src/IR.h | 42 ++++++++++++++++++---- src/IREquality.cpp | 8 ++--- src/IRMatch.cpp | 12 +++---- src/IRMatch.h | 30 ++++++++-------- src/IRMutator.cpp | 4 +-- src/IRMutator.h | 2 +- src/IRPrinter.cpp | 4 +-- src/IRPrinter.h | 2 +- src/IRVisitor.cpp | 4 +-- src/IRVisitor.h | 10 +++--- src/ModulusRemainder.cpp | 6 ++-- src/Monotonic.cpp | 2 +- src/Simplify_Exprs.cpp | 2 +- src/Simplify_Internal.h | 2 +- src/StmtToHtml.cpp | 6 ++-- src/X86Optimize.cpp | 76 +++++++++++++++++++++------------------- src/X86Optimize.h | 2 +- 27 files changed, 177 insertions(+), 115 deletions(-) diff --git a/src/Bounds.cpp b/src/Bounds.cpp index 107bd04185c4..632001485ba5 100644 --- a/src/Bounds.cpp +++ b/src/Bounds.cpp @@ -1110,9 +1110,9 @@ class Bounds : public IRVisitor { op->value.accept(this); } - void visit(const VectorIntrinsic *op) override { + void visit(const VectorInstruction *op) override { // TODO(rootjalex): we may need to implement bounds queries. 
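// The MSVC fix above guards each tuple access with `if constexpr`, so that
// std::get<N> is never even instantiated for argument packs that are too
// short. A self-contained sketch of the same pattern (illustrative only;
// assumes at least one argument, each convertible to std::string):
#include <string>
#include <tuple>
#include <vector>

template<typename... Args>
std::vector<std::string> unpack_up_to_three(const std::tuple<Args...> &args) {
    std::vector<std::string> out(sizeof...(Args));
    out[0] = std::get<0>(args);
    if constexpr (sizeof...(Args) > 1) {
        out[1] = std::get<1>(args);
    }
    if constexpr (sizeof...(Args) > 2) {
        out[2] = std::get<2>(args);
    }
    return out;
}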
- internal_error << "Unexpected VectorIntrinsic in bounds query: " << Expr(op) << "\n"; + internal_error << "Unexpected VectorInstruction in bounds query: " << Expr(op) << "\n"; } void visit(const Call *op) override { diff --git a/src/CodeGen_C.cpp b/src/CodeGen_C.cpp index 1477bcd71316..8bc1abda3502 100644 --- a/src/CodeGen_C.cpp +++ b/src/CodeGen_C.cpp @@ -2829,8 +2829,8 @@ Expr CodeGen_C::scalarize_vector_reduce(const VectorReduce *op) { return Shuffle::make_concat(lanes); } -void CodeGen_C::visit(const VectorIntrinsic *op) { - internal_error << "CodeGen_C should never receive a VectorIntrinsic, received:\n" +void CodeGen_C::visit(const VectorInstruction *op) { + internal_error << "CodeGen_C should never receive a VectorInstruction, received:\n" << Expr(op) << "\n"; } diff --git a/src/CodeGen_C.h b/src/CodeGen_C.h index 2427aeef32dc..256b35f55efe 100644 --- a/src/CodeGen_C.h +++ b/src/CodeGen_C.h @@ -235,7 +235,7 @@ class CodeGen_C : public IRPrinter { void visit(const Fork *) override; void visit(const Acquire *) override; void visit(const Atomic *) override; - void visit(const VectorIntrinsic *) override; + void visit(const VectorInstruction *) override; void visit(const VectorReduce *) override; void visit_binop(Type t, const Expr &a, const Expr &b, const char *op); diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp index f821aaf670d4..68b4da12ae87 100644 --- a/src/CodeGen_LLVM.cpp +++ b/src/CodeGen_LLVM.cpp @@ -4000,8 +4000,8 @@ void CodeGen_LLVM::visit(const Shuffle *op) { } } -void CodeGen_LLVM::visit(const VectorIntrinsic *op) { - internal_error << "CodeGen_LLVM received VectorIntrinsic node, should be handled by architecture-specific CodeGen class:\n" +void CodeGen_LLVM::visit(const VectorInstruction *op) { + internal_error << "CodeGen_LLVM received VectorInstruction node, should be handled by architecture-specific CodeGen class:\n" << Expr(op) << "\n"; } diff --git a/src/CodeGen_LLVM.h b/src/CodeGen_LLVM.h index 1a479f7829e7..4f951d5a9131 100644 --- a/src/CodeGen_LLVM.h +++ b/src/CodeGen_LLVM.h @@ -362,7 +362,7 @@ class CodeGen_LLVM : public IRVisitor { void visit(const IfThenElse *) override; void visit(const Evaluate *) override; void visit(const Shuffle *) override; - void visit(const VectorIntrinsic *) override; + void visit(const VectorInstruction *) override; void visit(const VectorReduce *) override; void visit(const Prefetch *) override; void visit(const Atomic *) override; diff --git a/src/CodeGen_X86.cpp b/src/CodeGen_X86.cpp index ba2c44579d59..8caa3ee756b0 100644 --- a/src/CodeGen_X86.cpp +++ b/src/CodeGen_X86.cpp @@ -84,7 +84,7 @@ class CodeGen_X86 : public CodeGen_Posix { void visit(const Allocate *) override; void visit(const Load *) override; void visit(const Store *) override; - void visit(const VectorIntrinsic *) override; + void visit(const VectorInstruction *) override; // @} private: @@ -470,8 +470,9 @@ void CodeGen_X86::visit(const Store *op) { CodeGen_Posix::visit(op); } -void CodeGen_X86::visit(const VectorIntrinsic *op) { - value = call_overloaded_intrin(op->type, op->name, op->args); +void CodeGen_X86::visit(const VectorInstruction *op) { + const std::string name = op->get_instruction_name(); + value = call_overloaded_intrin(op->type, name, op->args); internal_assert(value) << "CodeGen_X86 failed on " << Expr(op) << "\n"; } diff --git a/src/Deinterleave.cpp b/src/Deinterleave.cpp index 0a68517f7f41..c2182d4f5192 100644 --- a/src/Deinterleave.cpp +++ b/src/Deinterleave.cpp @@ -195,8 +195,8 @@ class Deinterleaver : public IRGraphMutator { using 
IRMutator::visit; - Expr visit(const VectorIntrinsic *op) override { - internal_error << "Deinterleaver should never receive VectorIntrinsic node, received:\n" + Expr visit(const VectorInstruction *op) override { + internal_error << "Deinterleaver should never receive VectorInstruction node, received:\n" << Expr(op) << "\n"; return Expr(); } diff --git a/src/Derivative.cpp b/src/Derivative.cpp index 851faad0cd4b..cade4b010980 100644 --- a/src/Derivative.cpp +++ b/src/Derivative.cpp @@ -88,8 +88,8 @@ class ReverseAccumulationVisitor : public IRVisitor { void visit(const Shuffle *op) override { internal_error << "Encounter unexpected expression \"Shuffle\" when differentiating."; } - void visit(const VectorIntrinsic *op) override { - internal_error << "Encounter unexpected expression \"VectorIntrinsic\" when differentiating."; + void visit(const VectorInstruction *op) override { + internal_error << "Encounter unexpected expression \"VectorInstruction\" when differentiating."; } void visit(const VectorReduce *op) override { internal_error << "Encounter unexpected expression \"VectorReduce\" when differentiating."; diff --git a/src/Expr.h b/src/Expr.h index aaab7dac23be..efb7526e0eb5 100644 --- a/src/Expr.h +++ b/src/Expr.h @@ -57,7 +57,7 @@ enum class IRNodeType { Call, Let, Shuffle, - VectorIntrinsic, + VectorInstruction, VectorReduce, // Stmts LetStmt, diff --git a/src/IR.cpp b/src/IR.cpp index 776a8da806ee..07832b93b6ca 100644 --- a/src/IR.cpp +++ b/src/IR.cpp @@ -901,17 +901,46 @@ Stmt Atomic::make(const std::string &producer_name, return node; } -Expr VectorIntrinsic::make(Type type, const std::string &name, const std::vector &args) { - user_assert(!name.empty()) << "VectorIntrinsic without a name\n"; +namespace { + +const char *const instruction_op_names[] = { + // Shared: + "abs", + "dot_product", + "rounding_halving_add", + "saturating_add", + "saturating_narrow", + "saturating_sub", + "widening_mul", + + // x86-specific + "f32_to_bf16", + "horizontal_add", + "pmulh", + "pmulhrs", + "saturating_dot_product", +}; + +static_assert(sizeof(instruction_op_names) / sizeof(instruction_op_names[0]) == VectorInstruction::InstructionOpCount, + "instruction_op_names needs attention"); + +} // namespace + +Expr VectorInstruction::make(Type type, InstructionOp op, const std::vector &args) { user_assert(!args.empty()) << "VectorInrinsic without arguments\n"; - VectorIntrinsic *node = new VectorIntrinsic; + VectorInstruction *node = new VectorInstruction; node->type = type; - node->name = name; + node->op = op; node->args = args; return node; } +const char *VectorInstruction::get_instruction_name() const { + return instruction_op_names[op]; +} + + Expr VectorReduce::make(VectorReduce::Operator op, Expr vec, int lanes) { @@ -1092,8 +1121,8 @@ void ExprNode::accept(IRVisitor *v) const { v->visit((const Shuffle *)this); } template<> -void ExprNode::accept(IRVisitor *v) const { - v->visit((const VectorIntrinsic *)this); +void ExprNode::accept(IRVisitor *v) const { + v->visit((const VectorInstruction *)this); } template<> void ExprNode::accept(IRVisitor *v) const { @@ -1285,8 +1314,8 @@ Expr ExprNode::mutate_expr(IRMutator *v) const { return v->visit((const Shuffle *)this); } template<> -Expr ExprNode::mutate_expr(IRMutator *v) const { - return v->visit((const VectorIntrinsic *)this); +Expr ExprNode::mutate_expr(IRMutator *v) const { + return v->visit((const VectorInstruction *)this); } template<> Expr ExprNode::mutate_expr(IRMutator *v) const { diff --git a/src/IR.h b/src/IR.h index 
ad1ad2437123..e0ea25ce5768 100644 --- a/src/IR.h +++ b/src/IR.h @@ -886,16 +886,44 @@ struct Atomic : public StmtNode<Atomic> { static const IRNodeType _node_type = IRNodeType::Atomic; }; -/** Represent a target-specific vector instruction. - * Intrinsic may not be element-wise operation, i.e. - * dot_products. */ -struct VectorIntrinsic : public ExprNode<VectorIntrinsic> { - std::string name; +/** Represent a length-agnostic and target-specific + * vector instruction. The instruction need not be an + * element-wise operation, e.g. dot_product. Should only be + * generated and consumed during CodeGen. */ +struct VectorInstruction : public ExprNode<VectorInstruction> { + // Enums for vector instructions. The name is recovered via get_instruction_name(). + // Specific enum values are *not* guaranteed to be stable across time. + // Please keep this list sorted by target architecture (with a shared section). + // This list will become more complete as we add Optimize passes for more backends. + // If you add an instruction here, update `instruction_op_names` in IR.cpp. + enum InstructionOp { + // Shared: + abs, + dot_product, + rounding_halving_add, + saturating_add, + saturating_narrow, + saturating_sub, + widening_mul, + + // x86-specific + f32_to_bf16, + horizontal_add, + pmulh, + pmulhrs, + saturating_dot_product, + + InstructionOpCount // Sentinel: keep last. + }; + + InstructionOp op; std::vector<Expr> args; - static Expr make(Type type, const std::string &name, const std::vector<Expr> &args); + static Expr make(Type type, InstructionOp op, const std::vector<Expr> &args); + + static const IRNodeType _node_type = IRNodeType::VectorInstruction; - static const IRNodeType _node_type = IRNodeType::VectorIntrinsic; + const char *get_instruction_name() const; }; /** Horizontally reduce a vector to a scalar or narrower vector using diff --git a/src/IREquality.cpp b/src/IREquality.cpp index 15a9bc01cbbb..edcfc3d067dc 100644 --- a/src/IREquality.cpp +++ b/src/IREquality.cpp @@ -98,7 +98,7 @@ class IRComparer : public IRVisitor { void visit(const Shuffle *) override; void visit(const Prefetch *) override; void visit(const Atomic *) override; - void visit(const VectorIntrinsic *) override; + void visit(const VectorInstruction *) override; void visit(const VectorReduce *) override; }; @@ -630,10 +630,10 @@ void IRComparer::visit(const Atomic *op) { compare_stmt(s->body, op->body); } -void IRComparer::visit(const VectorIntrinsic *op) { - const VectorIntrinsic *e = expr.as<VectorIntrinsic>(); +void IRComparer::visit(const VectorInstruction *op) { + const VectorInstruction *e = expr.as<VectorInstruction>(); - compare_names(e->name, op->name); + compare_scalar(e->op, op->op); compare_expr_vector(e->args, op->args); } diff --git a/src/IRMatch.cpp b/src/IRMatch.cpp index f7bb7d457ff1..4cfc409163af 100644 --- a/src/IRMatch.cpp +++ b/src/IRMatch.cpp @@ -296,11 +296,11 @@ class IRMatch : public IRVisitor { } } - void visit(const VectorIntrinsic *op) override { - const VectorIntrinsic *e = expr.as<VectorIntrinsic>(); + void visit(const VectorInstruction *op) override { + const VectorInstruction *e = expr.as<VectorInstruction>(); if (result && e && types_match(op->type, e->type) && - e->name == op->name && + e->op == op->op && e->args.size() == op->args.size()) { for (size_t i = 0; result && (i < e->args.size()); i++) { // FIXME: should we early-out? 
Here and in Call* @@ -521,9 +521,9 @@ bool equal_helper(const BaseExprNode &a, const BaseExprNode &b) noexcept { case IRNodeType::Shuffle: return (equal_helper(((const Shuffle &)a).vectors, ((const Shuffle &)b).vectors) && equal_helper(((const Shuffle &)a).indices, ((const Shuffle &)b).indices)); - case IRNodeType::VectorIntrinsic: - return (((const VectorIntrinsic &)a).name == ((const VectorIntrinsic &)b).name && - equal_helper(((const VectorIntrinsic &)a).args, ((const VectorIntrinsic &)b).args)); + case IRNodeType::VectorInstruction: + return (((const VectorInstruction &)a).op == ((const VectorInstruction &)b).op && + equal_helper(((const VectorInstruction &)a).args, ((const VectorInstruction &)b).args)); case IRNodeType::VectorReduce: // As with Cast above, we use equal instead of equal_helper // here, because while we know a.type == b.type, we don't know diff --git a/src/IRMatch.h b/src/IRMatch.h index 2d1394742713..982a8dcaeace 100644 --- a/src/IRMatch.h +++ b/src/IRMatch.h @@ -1886,36 +1886,36 @@ HALIDE_ALWAYS_INLINE auto ramp(A &&a, B &&b, C &&c) noexcept -> RampOp struct VectorIntrinOp { struct pattern_tag {}; - const std::string &intrin_name; + const VectorInstruction::InstructionOp op; std::tuple args; static constexpr uint32_t binds = bitwise_or_reduce((bindings::mask)...); - constexpr static IRNodeType min_node_type = IRNodeType::VectorIntrinsic; - constexpr static IRNodeType max_node_type = IRNodeType::VectorIntrinsic; + constexpr static IRNodeType min_node_type = IRNodeType::VectorInstruction; + constexpr static IRNodeType max_node_type = IRNodeType::VectorInstruction; constexpr static bool canonical = and_reduce((Args::canonical)...); template::type> - HALIDE_ALWAYS_INLINE bool match_args(int, const VectorIntrinsic &v, MatcherState &state) const noexcept { + HALIDE_ALWAYS_INLINE bool match_args(int, const VectorInstruction &v, MatcherState &state) const noexcept { using T = decltype(std::get(args)); return (std::get(args).template match(*v.args[i].get(), state) && match_args::mask>(0, v, state)); } template - HALIDE_ALWAYS_INLINE bool match_args(double, const VectorIntrinsic &v, MatcherState &state) const noexcept { + HALIDE_ALWAYS_INLINE bool match_args(double, const VectorInstruction &v, MatcherState &state) const noexcept { return true; } template HALIDE_ALWAYS_INLINE bool match(const BaseExprNode &e, MatcherState &state) const noexcept { - if (e.node_type != IRNodeType::VectorIntrinsic) { + if (e.node_type != IRNodeType::VectorInstruction) { return false; } - const VectorIntrinsic &v = (const VectorIntrinsic &)e; - return (v.name == intrin_name && match_args<0, bound>(0, v, state)); + const VectorInstruction &v = (const VectorInstruction &)e; + return (v.op == op && match_args<0, bound>(0, v, state)); } template(args).make(state, {}); // } - return VectorIntrinsic::make(type_hint, intrin_name, r_args); + return VectorInstruction::make(type_hint, op, r_args); } constexpr static bool foldable = false; HALIDE_ALWAYS_INLINE - VectorIntrinOp(const std::string &name, Args... args) noexcept - : intrin_name(name), args(args...) { + VectorIntrinOp(const VectorInstruction::InstructionOp _op, Args... args) noexcept + : op(_op), args(args...) 
{ static_assert(sizeof...(Args) > 0 && sizeof...(Args) <= 3, - "VectorIntrinsicOp must have non-zero arguments, and update make() if more than 3 arguments."); + "VectorInstructionOp must have non-zero arguments, and update make() if more than 3 arguments."); } }; @@ -1975,15 +1975,15 @@ template std::ostream &operator<<(std::ostream &s, const VectorIntrinOp &op) { // TODO(rootjalex): Should we print the type? s << "vector_intrin(\""; - s << op.intrin_name << "\", "; + s << op.op << "\", "; op.print_args(s); s << ")"; return s; } template -HALIDE_ALWAYS_INLINE auto v_intrin(const std::string &name, Args... args) noexcept -> VectorIntrinOp { - return {name, pattern_arg(args)...}; +HALIDE_ALWAYS_INLINE auto v_intrin(const VectorInstruction::InstructionOp op, Args... args) noexcept -> VectorIntrinOp { + return {op, pattern_arg(args)...}; } template diff --git a/src/IRMutator.cpp b/src/IRMutator.cpp index e075897d6694..b1703a6cccd1 100644 --- a/src/IRMutator.cpp +++ b/src/IRMutator.cpp @@ -327,12 +327,12 @@ Expr IRMutator::visit(const Shuffle *op) { return Shuffle::make(new_vectors, op->indices); } -Expr IRMutator::visit(const VectorIntrinsic *op) { +Expr IRMutator::visit(const VectorInstruction *op) { auto [new_args, changed] = mutate_with_changes(op->args); if (!changed) { return op; } - return VectorIntrinsic::make(op->type, op->name, new_args); + return VectorInstruction::make(op->type, op->op, new_args); } Expr IRMutator::visit(const VectorReduce *op) { diff --git a/src/IRMutator.h b/src/IRMutator.h index e460b036b80f..4729bb08344f 100644 --- a/src/IRMutator.h +++ b/src/IRMutator.h @@ -81,7 +81,7 @@ class IRMutator { virtual Expr visit(const Call *); virtual Expr visit(const Let *); virtual Expr visit(const Shuffle *); - virtual Expr visit(const VectorIntrinsic *); + virtual Expr visit(const VectorInstruction *); virtual Expr visit(const VectorReduce *); virtual Stmt visit(const LetStmt *); diff --git a/src/IRPrinter.cpp b/src/IRPrinter.cpp index f609f28763fd..324b399e5548 100644 --- a/src/IRPrinter.cpp +++ b/src/IRPrinter.cpp @@ -1073,11 +1073,11 @@ void IRPrinter::visit(const Shuffle *op) { } } -void IRPrinter::visit(const VectorIntrinsic *op) { +void IRPrinter::visit(const VectorInstruction *op) { stream << "(" << op->type << ")vector_intrinsic(\"" - << op->name + << op->get_instruction_name() << "\", "; print_list(op->args); stream << ")"; diff --git a/src/IRPrinter.h b/src/IRPrinter.h index 1e7cc048b805..e4e89efd5806 100644 --- a/src/IRPrinter.h +++ b/src/IRPrinter.h @@ -194,7 +194,7 @@ class IRPrinter : public IRVisitor { void visit(const IfThenElse *) override; void visit(const Evaluate *) override; void visit(const Shuffle *) override; - void visit(const VectorIntrinsic *) override; + void visit(const VectorInstruction *) override; void visit(const VectorReduce *) override; void visit(const Prefetch *) override; void visit(const Atomic *) override; diff --git a/src/IRVisitor.cpp b/src/IRVisitor.cpp index 3b1956a51d8a..97c55d8075ac 100644 --- a/src/IRVisitor.cpp +++ b/src/IRVisitor.cpp @@ -257,7 +257,7 @@ void IRVisitor::visit(const Shuffle *op) { } } -void IRVisitor::visit(const VectorIntrinsic *op) { +void IRVisitor::visit(const VectorInstruction *op) { for (const auto &arg : op->args) { arg.accept(this); } @@ -521,7 +521,7 @@ void IRGraphVisitor::visit(const Shuffle *op) { } } -void IRGraphVisitor::visit(const VectorIntrinsic *op) { +void IRGraphVisitor::visit(const VectorInstruction *op) { for (const auto &arg : op->args) { include(arg); } diff --git a/src/IRVisitor.h 
b/src/IRVisitor.h index c9c170dd851d..5df16880dfed 100644 --- a/src/IRVisitor.h +++ b/src/IRVisitor.h @@ -71,7 +71,7 @@ class IRVisitor { virtual void visit(const IfThenElse *); virtual void visit(const Evaluate *); virtual void visit(const Shuffle *); - virtual void visit(const VectorIntrinsic *); + virtual void visit(const VectorInstruction *); virtual void visit(const VectorReduce *); virtual void visit(const Prefetch *); virtual void visit(const Fork *); @@ -143,7 +143,7 @@ class IRGraphVisitor : public IRVisitor { void visit(const IfThenElse *) override; void visit(const Evaluate *) override; void visit(const Shuffle *) override; - void visit(const VectorIntrinsic *) override; + void visit(const VectorInstruction *) override; void visit(const VectorReduce *) override; void visit(const Prefetch *) override; void visit(const Acquire *) override; @@ -226,8 +226,8 @@ class VariadicVisitor { return ((T *)this)->visit((const Let *)node, std::forward(args)...); case IRNodeType::Shuffle: return ((T *)this)->visit((const Shuffle *)node, std::forward(args)...); - case IRNodeType::VectorIntrinsic: - return ((T *)this)->visit((const VectorIntrinsic *)node, std::forward(args)...); + case IRNodeType::VectorInstruction: + return ((T *)this)->visit((const VectorInstruction *)node, std::forward(args)...); case IRNodeType::VectorReduce: return ((T *)this)->visit((const VectorReduce *)node, std::forward(args)...); // Explicitly list the Stmt types rather than using a @@ -290,7 +290,7 @@ class VariadicVisitor { case IRNodeType::Call: case IRNodeType::Let: case IRNodeType::Shuffle: - case IRNodeType::VectorIntrinsic: + case IRNodeType::VectorInstruction: case IRNodeType::VectorReduce: internal_error << "Unreachable"; break; diff --git a/src/ModulusRemainder.cpp b/src/ModulusRemainder.cpp index be0226b44e56..fcce870a5a29 100644 --- a/src/ModulusRemainder.cpp +++ b/src/ModulusRemainder.cpp @@ -74,7 +74,7 @@ class ComputeModulusRemainder : public IRVisitor { void visit(const Free *) override; void visit(const Evaluate *) override; void visit(const Shuffle *) override; - void visit(const VectorIntrinsic *) override; + void visit(const VectorInstruction *) override; void visit(const VectorReduce *) override; void visit(const Prefetch *) override; void visit(const Atomic *) override; @@ -214,8 +214,8 @@ void ComputeModulusRemainder::visit(const Shuffle *op) { result = ModulusRemainder{}; } -void ComputeModulusRemainder::visit(const VectorIntrinsic *op) { - internal_error << "modulus_remainder of VectorIntrinsic:\n" +void ComputeModulusRemainder::visit(const VectorInstruction *op) { + internal_error << "modulus_remainder of VectorInstruction:\n" << Expr(op) << "\n"; result = ModulusRemainder{}; } diff --git a/src/Monotonic.cpp b/src/Monotonic.cpp index 2c51c7fa960b..b5d8cea0d928 100644 --- a/src/Monotonic.cpp +++ b/src/Monotonic.cpp @@ -534,7 +534,7 @@ class DerivativeBounds : public IRVisitor { result = ConstantInterval::single_point(0); } - void visit(const VectorIntrinsic *op) override { + void visit(const VectorInstruction *op) override { // TODO(rootjalex): Should this be an error? 
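// Reviewer's sketch, not in the original patch: erroring here would be the stricter choice, but returning ConstantInterval::everything() below is a sound fallback; it simply reports that nothing is known about the derivative of an opaque VectorInstruction.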
result = ConstantInterval::everything(); } diff --git a/src/Simplify_Exprs.cpp b/src/Simplify_Exprs.cpp index 5aefebfa611f..0496fb4fc353 100644 --- a/src/Simplify_Exprs.cpp +++ b/src/Simplify_Exprs.cpp @@ -59,7 +59,7 @@ Expr Simplify::visit(const Broadcast *op, ExprInfo *bounds) { } } -Expr Simplify::visit(const VectorIntrinsic *op, ExprInfo *bounds) { +Expr Simplify::visit(const VectorInstruction *op, ExprInfo *bounds) { clear_bounds_info(bounds); return op; } diff --git a/src/Simplify_Internal.h b/src/Simplify_Internal.h index 7dae150042a4..1b0258a1d150 100644 --- a/src/Simplify_Internal.h +++ b/src/Simplify_Internal.h @@ -333,7 +333,7 @@ class Simplify : public VariadicVisitor { Expr visit(const Load *op, ExprInfo *bounds); Expr visit(const Call *op, ExprInfo *bounds); Expr visit(const Shuffle *op, ExprInfo *bounds); - Expr visit(const VectorIntrinsic *op, ExprInfo *bounds); + Expr visit(const VectorInstruction *op, ExprInfo *bounds); Expr visit(const VectorReduce *op, ExprInfo *bounds); Expr visit(const Let *op, ExprInfo *bounds); Stmt visit(const LetStmt *op); diff --git a/src/StmtToHtml.cpp b/src/StmtToHtml.cpp index ceddcabec83d..8dd6afdc8a73 100644 --- a/src/StmtToHtml.cpp +++ b/src/StmtToHtml.cpp @@ -712,10 +712,10 @@ class StmtToHtml : public IRVisitor { stream << close_span(); } - void visit(const VectorIntrinsic *op) override { - stream << open_span("VectorIntrinsic"); + void visit(const VectorInstruction *op) override { + stream << open_span("VectorInstruction"); stream << open_span("Type") << op->type << close_span(); - print_list(symbol("vector_intrinsic") + "(\"" + op->name + "\"", op->args, ")"); + print_list(symbol("vector_intrinsic") + "(\"" + op->get_instruction_name() + "\"", op->args, ")"); stream << close_span(); } diff --git a/src/X86Optimize.cpp b/src/X86Optimize.cpp index f4c2b758db55..4972e5c775c4 100644 --- a/src/X86Optimize.cpp +++ b/src/X86Optimize.cpp @@ -60,7 +60,7 @@ bool should_use_dot_product(const Expr &a, const Expr &b, std::vector &res return false; } -/** A top-down code optimizer that replaces Halide IR with VectorIntrinsics specific to x86. */ +/** A top-down code optimizer that replaces Halide IR with VectorInstructions specific to x86. */ class Optimize_X86 : public IRMutator { public: /** Create an x86 code optimizer. Processor features can be @@ -110,33 +110,33 @@ class Optimize_X86 : public IRMutator { // Accumulating pmaddubsw (rewrite( x + h_add(cast(Int(32, lanes * 4), widening_mul(y, z)), lanes), - v_intrin("dot_product", x, y, z), + v_intrin(VectorInstruction::dot_product, x, y, z), is_uint(y, 8) && is_int(z, 8)) || rewrite( x + h_add(cast(Int(32, lanes * 4), widening_mul(y, z)), lanes), - v_intrin("dot_product", x, z, y), + v_intrin(VectorInstruction::dot_product, x, z, y), is_int(y, 8) && is_uint(z, 8)) || rewrite( h_add(cast(Int(32, lanes * 4), widening_mul(x, y)), lanes) + z, - v_intrin("dot_product", z, x, y), + v_intrin(VectorInstruction::dot_product, z, x, y), is_uint(x, 8) && is_int(y, 8)) || rewrite( h_add(cast(Int(32, lanes * 4), widening_mul(x, y)), lanes) + z, - v_intrin("dot_product", z, y, x), + v_intrin(VectorInstruction::dot_product, z, y, x), is_int(x, 8) && is_uint(y, 8)) || // Accumulating pmaddwd. 
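// Sketch of the semantics being matched (assuming i16 operands, as the is_int(..., 16) guards require): vpmaddwd computes out[i] = a[2*i]*b[2*i] + a[2*i+1]*b[2*i+1] with i16*i16 -> i32 products, so x + h_add(widening_mul(y, z), lanes) is exactly its accumulating form.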
rewrite( x + h_add(widening_mul(y, z), lanes), - v_intrin("dot_product", x, y, z), + v_intrin(VectorInstruction::dot_product, x, y, z), is_int(y, 16, lanes * 2) && is_int(z, 16, lanes * 2)) || rewrite( h_add(widening_mul(x, y), lanes) + z, - v_intrin("dot_product", z, x, y), + v_intrin(VectorInstruction::dot_product, z, x, y), is_int(x, 16, lanes * 2) && is_int(y, 16, lanes * 2)) || false)) { @@ -149,7 +149,7 @@ class Optimize_X86 : public IRMutator { // We have dot_products for every x86 arch (because SSE2 has it), // so this is `always` safe (as long as the output type lanes has // a factor of 4). - return mutate(VectorIntrinsic::make(op->type, "dot_product", {ac, bd})); + return mutate(VectorInstruction::make(op->type, VectorInstruction::dot_product, {ac, bd})); } return IRMutator::visit(op); @@ -177,7 +177,7 @@ class Optimize_X86 : public IRMutator { Expr ac = Shuffle::make_interleave({matches[0], matches[2]}); Expr bd = Shuffle::make_interleave({matches[1], matches[3]}); // Always safe, see comment in Add case above. - return mutate(VectorIntrinsic::make(op->type, "dot_product", {ac, bd})); + return mutate(VectorInstruction::make(op->type, VectorInstruction::dot_product, {ac, bd})); } } @@ -208,38 +208,38 @@ class Optimize_X86 : public IRMutator { (target.has_feature(Target::SSE41) && rewrite( cast(Int(16, lanes), rounding_shift_right(widening_mul(x, y), 15)), - v_intrin("pmulhrs", x, y), + v_intrin(VectorInstruction::pmulhrs, x, y), is_int(x, 16) && is_int(y, 16))) || // saturating_narrow is always supported (via SSE2) for: // int32 -> int16, int16 -> int8, int16 -> uint8 rewrite( cast(Int(16, lanes), max(min(x, i32_i16min), i32_i16min)), - v_intrin("saturating_narrow", x), + v_intrin(VectorInstruction::saturating_narrow, x), is_int(x, 32)) || rewrite( cast(Int(8, lanes), max(min(x, i16_i8min), i16_i8min)), - v_intrin("saturating_narrow", x), + v_intrin(VectorInstruction::saturating_narrow, x), is_int(x, 16)) || rewrite( cast(UInt(8, lanes), max(min(x, i16_u8min), i16_u8min)), - v_intrin("saturating_narrow", x), + v_intrin(VectorInstruction::saturating_narrow, x), is_int(x, 16)) || // int32 -> uint16 is supported via SSE41 (target.has_feature(Target::SSE41) && rewrite( cast(UInt(16, lanes), max(min(x, i32_u16min), i32_u16min)), - v_intrin("saturating_narrow", x), + v_intrin(VectorInstruction::saturating_narrow, x), is_int(x, 32))) || // f32_to_bf16 is supported only via Target::AVX512_SapphireRapids (target.has_feature(Target::AVX512_SapphireRapids) && rewrite( cast(BFloat(16, lanes), x), - v_intrin("f32_to_bf16", x), + v_intrin(VectorInstruction::f32_to_bf16, x), is_float(x, 32))) || false) { @@ -285,6 +285,7 @@ class Optimize_X86 : public IRMutator { const int bits = op->type.bits(); auto rewrite = IRMatcher::rewriter(op, op->type); + using IRMatcher::typed; Type unsigned_type = op->type.with_code(halide_type_uint); auto x_uint = cast(unsigned_type, x); @@ -321,14 +322,17 @@ class Optimize_X86 : public IRMutator { ((op->type.is_int_or_uint() && bits == 16) && rewrite( mul_shift_right(x, y, 16), - v_intrin("pmulh", x, y))) || + v_intrin(VectorInstruction::pmulh, x, y))) || // saturating_pmulhrs is supported via SSE41 ((target.has_feature(Target::SSE41) && op->type.is_int() && bits == 16) && rewrite( rounding_mul_shift_right(x, y, 15), - v_intrin("saturating_pmulhrs", x, y))) || + // saturating_pmulhrs + select((x == typed(Int(16, lanes), -32768)) && (y == typed(Int(16, lanes), -32768)), + typed(Int(16, lanes), 32767), + v_intrin(VectorInstruction::pmulhrs, x, y)))) || // 
TODO(rootjalex): The following intrinsics are // simply one-to-one mappings, should they even @@ -340,29 +344,29 @@ class Optimize_X86 : public IRMutator { (op->type.is_float() && bits == 32)) && rewrite( abs(x), - v_intrin("abs", x))) || + v_intrin(VectorInstruction::abs, x))) || // saturating ops for 8 and 16 bits are always supported (via SSE2). ((bits == 8 || bits == 16) && (rewrite( saturating_add(x, y), - v_intrin("saturating_add", x, y)) || + v_intrin(VectorInstruction::saturating_add, x, y)) || rewrite( saturating_sub(x, y), - v_intrin("saturating_sub", x, y)))) || + v_intrin(VectorInstruction::saturating_sub, x, y)))) || // pavg ops for 8 and 16 bits are always supported (via SSE2). ((op->type.is_uint() && (bits == 8 || bits == 16)) && rewrite( rounding_halving_add(x, y), - v_intrin("rounding_halving_add", x, y))) || + v_intrin(VectorInstruction::rounding_halving_add, x, y))) || // int16 -> int32 widening_mul has a (v)pmaddwd implementation. // always supported (via SSE2). ((op->type.is_int() && (bits == 32)) && rewrite( widening_mul(x, y), - v_intrin("widening_mul", x, y), + v_intrin(VectorInstruction::widening_mul, x, y), is_int(x, 16) && is_int(y, 16))) || (target.has_feature(Target::AVX512_SapphireRapids) && @@ -370,27 +374,27 @@ class Optimize_X86 : public IRMutator { // SapphireRapids accumulating dot products. (rewrite( saturating_add(x, h_satadd(cast(Int(32, lanes * 4), widening_mul(y, z)), lanes)), - v_intrin("saturating_dot_product", x, y, z), + v_intrin(VectorInstruction::saturating_dot_product, x, y, z), is_uint(y, 8) && is_int(z, 8)) || rewrite( saturating_add(x, h_satadd(cast(Int(32, lanes * 4), widening_mul(y, z)), lanes)), - v_intrin("saturating_dot_product", x, z, y), + v_intrin(VectorInstruction::saturating_dot_product, x, z, y), is_int(y, 8) && is_uint(z, 8)) || rewrite( saturating_add(x, h_satadd(cast(Int(32, lanes * 2), widening_mul(y, z)), lanes)), - v_intrin("saturating_dot_product", x, y, z), + v_intrin(VectorInstruction::saturating_dot_product, x, y, z), is_uint(y, 8) && is_int(z, 8)) || rewrite( saturating_add(x, h_satadd(cast(Int(32, lanes * 2), widening_mul(y, z)), lanes)), - v_intrin("saturating_dot_product", x, z, y), + v_intrin(VectorInstruction::saturating_dot_product, x, z, y), is_int(y, 8) && is_uint(z, 8)) || rewrite( saturating_add(x, h_satadd(widening_mul(y, z), lanes)), - v_intrin("saturating_dot_product", x, z, y), + v_intrin(VectorInstruction::saturating_dot_product, x, z, y), is_int(y, 16, lanes * 2) && is_int(z, 16, lanes * 2)) || false)) || @@ -449,18 +453,18 @@ class Optimize_X86 : public IRMutator { ((factor == 2) && (rewrite( h_add(cast(Int(32, value_lanes), widening_mul(x, y)), lanes), - v_intrin("dot_product", cast(Int(16, value_lanes), x), cast(Int(16, value_lanes), y)), + v_intrin(VectorInstruction::dot_product, cast(Int(16, value_lanes), x), cast(Int(16, value_lanes), y)), x_is_int_or_uint && y_is_int_or_uint) || // Horizontal widening add via pmaddwd rewrite( h_add(cast(Int(32, value_lanes), x), lanes), - v_intrin("dot_product", x, make_const(Int(16, value_lanes), 1)), + v_intrin(VectorInstruction::dot_product, x, make_const(Int(16, value_lanes), 1)), is_int(x, 16)) || (rewrite( h_add(widening_mul(x, y), lanes), - v_intrin("dot_product", x, y), + v_intrin(VectorInstruction::dot_product, x, y), is_int(x, 16) && is_int(y, 16))) || // pmaddub supported via SSE41 @@ -468,23 +472,23 @@ class Optimize_X86 : public IRMutator { // Horizontal widening adds using 2-way saturating dot products. 
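// Sketch of the trick below, not spelled out in the patch: a pairwise widening add is a 2-way dot product against a vector of ones, so pmaddubsw with a constant-1 operand implements h_add. The u8 case cannot actually saturate, since 255 + 255 = 510 stays well below the i16 bound of 32767, which is what makes the reinterpreting casts sound.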
(rewrite( h_add(cast(UInt(16, value_lanes), x), lanes), - cast(UInt(16, lanes), typed(Int(16, lanes), v_intrin("saturating_dot_product", x, make_const(Int(8, value_lanes), 1)))), + cast(UInt(16, lanes), typed(Int(16, lanes), v_intrin(VectorInstruction::saturating_dot_product, x, make_const(Int(8, value_lanes), 1)))), is_uint(x, 8)) || rewrite( h_add(cast(Int(16, value_lanes), x), lanes), - v_intrin("saturating_dot_product", x, make_const(Int(8, value_lanes), 1)), + v_intrin(VectorInstruction::saturating_dot_product, x, make_const(Int(8, value_lanes), 1)), is_uint(x, 8)) || rewrite( h_add(cast(Int(16, value_lanes), x), lanes), - v_intrin("saturating_dot_product", make_const(UInt(8, value_lanes), 1), x), + v_intrin(VectorInstruction::saturating_dot_product, make_const(UInt(8, value_lanes), 1), x), is_int(x, 8)) || // SSE41 and AVX2 support horizontal_add via phadd intrinsics. rewrite( h_add(x, lanes), - v_intrin("horizontal_add", x), + v_intrin(VectorInstruction::horizontal_add, x), is_int(x, 16, lanes * 2) || is_uint(x, 16, lanes * 2) || is_int(x, 32, lanes * 2) || is_uint(x, 32, lanes * 2)) || @@ -504,12 +508,12 @@ class Optimize_X86 : public IRMutator { ((factor == 2) && target.has_feature(Target::SSE41) && (rewrite( h_satadd(widening_mul(x, y), lanes), - v_intrin("saturating_dot_product", x, y), + v_intrin(VectorInstruction::saturating_dot_product, x, y), is_uint(x, 8) && is_int(y, 8)) || rewrite( h_satadd(widening_mul(x, y), lanes), - v_intrin("saturating_dot_product", y, x), + v_intrin(VectorInstruction::saturating_dot_product, y, x), is_int(x, 8) && is_uint(y, 8)) || false))) { diff --git a/src/X86Optimize.h b/src/X86Optimize.h index df37c7dd896f..9ab9d5f54269 100644 --- a/src/X86Optimize.h +++ b/src/X86Optimize.h @@ -12,7 +12,7 @@ namespace Halide { namespace Internal { -/** Perform vector instruction selection, inserting VectorIntrinsic nodes. */ +/** Perform vector instruction selection, inserting VectorInstruction nodes. 
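* A hypothetical call site, sketched here for orientation (the lowering-side hookup is not shown in this series): after vectorization, something like stmt = optimize_x86_instructions(stmt, target, codegen); would run so that CodeGen_X86 only ever receives pre-selected VectorInstruction nodes.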
*/ Stmt optimize_x86_instructions(Stmt stmt, const Target &target, const CodeGen_LLVM *codegen); } // namespace Internal From 339b6b71c4c2e4b6d87312b67fb8c5d393695e38 Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Wed, 27 Jul 2022 17:34:02 -0400 Subject: [PATCH 25/55] undef -> poison --- src/runtime/x86_avx2.ll | 9 ++++----- src/runtime/x86_sse41.ll | 9 ++++----- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/src/runtime/x86_avx2.ll b/src/runtime/x86_avx2.ll index 4f6f11718940..a340c240d734 100644 --- a/src/runtime/x86_avx2.ll +++ b/src/runtime/x86_avx2.ll @@ -74,16 +74,16 @@ define weak_odr <16 x i16> @hadd_pmadd_i8_avx2(<32 x i8> %a) nounwind alwaysinli declare <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8>, <32 x i8>) nounwind readnone define weak_odr <16 x i16> @phaddw_avx2(<32 x i16> %a) nounwind alwaysinline { - %1 = shufflevector <32 x i16> %a, <32 x i16> undef, <16 x i32> - %2 = shufflevector <32 x i16> %a, <32 x i16> undef, <16 x i32> + %1 = shufflevector <32 x i16> %a, <32 x i16> poison, <16 x i32> + %2 = shufflevector <32 x i16> %a, <32 x i16> poison, <16 x i32> %3 = tail call <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16> %1, <16 x i16> %2) ret <16 x i16> %3 } declare <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16>, <16 x i16>) nounwind readnone define weak_odr <8 x i32> @phaddd_avx2(<16 x i32> %a) nounwind alwaysinline { - %1 = shufflevector <16 x i32> %a, <16 x i32> undef, <8 x i32> - %2 = shufflevector <16 x i32> %a, <16 x i32> undef, <8 x i32> + %1 = shufflevector <16 x i32> %a, <16 x i32> poison, <8 x i32> + %2 = shufflevector <16 x i32> %a, <16 x i32> poison, <8 x i32> %3 = tail call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> %1, <8 x i32> %2) ret <8 x i32> %3 } @@ -103,4 +103,3 @@ define weak_odr <16 x i16> @phaddw_avx2(<32 x i16> %a) nounwind alwaysinline { ret <8 x i32> %res } declare <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16>, <16 x i16>) nounwind readnone - \ No newline at end of file diff --git a/src/runtime/x86_sse41.ll b/src/runtime/x86_sse41.ll index caf728957e59..59045d5e6337 100644 --- a/src/runtime/x86_sse41.ll +++ b/src/runtime/x86_sse41.ll @@ -94,18 +94,17 @@ define weak_odr <8 x i16> @hadd_pmadd_i8_sse3(<16 x i8> %a) nounwind alwaysinlin declare <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8>, <16 x i8>) nounwind readnone define weak_odr <8 x i16> @phaddw_sse3(<16 x i16> %a) nounwind alwaysinline { - %1 = shufflevector <16 x i16> %a, <16 x i16> undef, <8 x i32> - %2 = shufflevector <16 x i16> %a, <16 x i16> undef, <8 x i32> + %1 = shufflevector <16 x i16> %a, <16 x i16> poison, <8 x i32> + %2 = shufflevector <16 x i16> %a, <16 x i16> poison, <8 x i32> %3 = tail call <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16> %1, <8 x i16> %2) ret <8 x i16> %3 } declare <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16>, <8 x i16>) nounwind readnone define weak_odr <4 x i32> @phaddd_sse3(<8 x i32> %a) nounwind alwaysinline { - %1 = shufflevector <8 x i32> %a, <8 x i32> undef, <4 x i32> - %2 = shufflevector <8 x i32> %a, <8 x i32> undef, <4 x i32> + %1 = shufflevector <8 x i32> %a, <8 x i32> poison, <4 x i32> + %2 = shufflevector <8 x i32> %a, <8 x i32> poison, <4 x i32> %3 = tail call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> %1, <4 x i32> %2) ret <4 x i32> %3 } declare <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32>, <4 x i32>) nounwind readnone - \ No newline at end of file From 17c99240c44dd2b4cc03e63e61010325849f298b Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Wed, 27 Jul 2022 17:36:14 -0400 Subject: [PATCH 26/55] fully remove 
saturating_pmulhrs --- src/CodeGen_X86.cpp | 2 -- src/runtime/x86_avx2.ll | 10 ---------- src/runtime/x86_sse41.ll | 10 ---------- 3 files changed, 22 deletions(-) diff --git a/src/CodeGen_X86.cpp b/src/CodeGen_X86.cpp index 8caa3ee756b0..4d208e970a67 100644 --- a/src/CodeGen_X86.cpp +++ b/src/CodeGen_X86.cpp @@ -173,11 +173,9 @@ const x86Intrinsic intrinsic_defs[] = { {"llvm.x86.avx2.pmulh.w", Int(16, 16), "pmulh", {Int(16, 16), Int(16, 16)}, Target::AVX2}, {"llvm.x86.avx2.pmulhu.w", UInt(16, 16), "pmulh", {UInt(16, 16), UInt(16, 16)}, Target::AVX2}, {"llvm.x86.avx2.pmul.hr.sw", Int(16, 16), "pmulhrs", {Int(16, 16), Int(16, 16)}, Target::AVX2}, - {"saturating_pmulhrswx16", Int(16, 16), "saturating_pmulhrs", {Int(16, 16), Int(16, 16)}, Target::AVX2}, {"llvm.x86.sse2.pmulh.w", Int(16, 8), "pmulh", {Int(16, 8), Int(16, 8)}}, {"llvm.x86.sse2.pmulhu.w", UInt(16, 8), "pmulh", {UInt(16, 8), UInt(16, 8)}}, {"llvm.x86.ssse3.pmul.hr.sw.128", Int(16, 8), "pmulhrs", {Int(16, 8), Int(16, 8)}, Target::SSE41}, - {"saturating_pmulhrswx8", Int(16, 8), "saturating_pmulhrs", {Int(16, 8), Int(16, 8)}, Target::SSE41}, // Convert FP32 to BF16 {"vcvtne2ps2bf16x32", BFloat(16, 32), "f32_to_bf16", {Float(32, 32)}, Target::AVX512_SapphireRapids}, diff --git a/src/runtime/x86_avx2.ll b/src/runtime/x86_avx2.ll index a340c240d734..221d9560502d 100644 --- a/src/runtime/x86_avx2.ll +++ b/src/runtime/x86_avx2.ll @@ -52,16 +52,6 @@ define weak_odr <8 x i32> @abs_i32x8(<8 x i32> %arg) { ret <8 x i32> %3 } -define weak_odr <16 x i16> @saturating_pmulhrswx16(<16 x i16> %a, <16 x i16> %b) nounwind uwtable readnone alwaysinline { - %1 = tail call <16 x i16> @llvm.x86.avx2.pmul.hr.sw(<16 x i16> %a, <16 x i16> %b) - %2 = icmp eq <16 x i16> %a, - %3 = icmp eq <16 x i16> %b, - %4 = and <16 x i1> %2, %3 - %5 = select <16 x i1> %4, <16 x i16> , <16 x i16> %1 - ret <16 x i16> %5 -} -declare <16 x i16> @llvm.x86.avx2.pmul.hr.sw(<16 x i16>, <16 x i16>) nounwind readnone - define weak_odr <16 x i16> @hadd_pmadd_u8_avx2(<32 x i8> %a) nounwind alwaysinline { %1 = tail call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> %a, <32 x i8> ) ret <16 x i16> %1 diff --git a/src/runtime/x86_sse41.ll b/src/runtime/x86_sse41.ll index 59045d5e6337..6c7b2356de75 100644 --- a/src/runtime/x86_sse41.ll +++ b/src/runtime/x86_sse41.ll @@ -72,16 +72,6 @@ define weak_odr <4 x i32> @abs_i32x4(<4 x i32> %x) nounwind uwtable readnone alw ret <4 x i32> %3 } -define weak_odr <8 x i16> @saturating_pmulhrswx8(<8 x i16> %a, <8 x i16> %b) nounwind uwtable readnone alwaysinline { - %1 = tail call <8 x i16> @llvm.x86.ssse3.pmul.hr.sw.128(<8 x i16> %a, <8 x i16> %b) - %2 = icmp eq <8 x i16> %a, - %3 = icmp eq <8 x i16> %b, - %4 = and <8 x i1> %2, %3 - %5 = select <8 x i1> %4, <8 x i16> , <8 x i16> %1 - ret <8 x i16> %5 -} -declare <8 x i16> @llvm.x86.ssse3.pmul.hr.sw.128(<8 x i16>, <8 x i16>) nounwind readnone - define weak_odr <8 x i16> @hadd_pmadd_u8_sse3(<16 x i8> %a) nounwind alwaysinline { %1 = tail call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8> %a, <16 x i8> ) ret <8 x i16> %1 From 6c74a63554761dc84c813c38e90b2042b3bc8977 Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Wed, 27 Jul 2022 17:36:57 -0400 Subject: [PATCH 27/55] clang format --- src/IR.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/IR.cpp b/src/IR.cpp index 07832b93b6ca..6f6ed96e200c 100644 --- a/src/IR.cpp +++ b/src/IR.cpp @@ -940,7 +940,6 @@ const char *VectorInstruction::get_instruction_name() const { return instruction_op_names[op]; } - Expr 
VectorReduce::make(VectorReduce::Operator op, Expr vec, int lanes) { From 11690d793edba3719451115e921731f7c05899a2 Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Wed, 27 Jul 2022 23:42:12 -0400 Subject: [PATCH 28/55] disable UB for VectorInstruction node --- src/IRMutator.cpp | 12 +++++++----- src/IRVisitor.cpp | 14 ++++++++------ 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/src/IRMutator.cpp b/src/IRMutator.cpp index b1703a6cccd1..9ac27753ae09 100644 --- a/src/IRMutator.cpp +++ b/src/IRMutator.cpp @@ -328,11 +328,13 @@ Expr IRMutator::visit(const Shuffle *op) { } Expr IRMutator::visit(const VectorInstruction *op) { - auto [new_args, changed] = mutate_with_changes(op->args); - if (!changed) { - return op; - } - return VectorInstruction::make(op->type, op->op, new_args); + internal_error << "Always implement VectorInstruction visitor for IRMutator subclass\n"; + // auto [new_args, changed] = mutate_with_changes(op->args); + // if (!changed) { + // return op; + // } + // return VectorInstruction::make(op->type, op->op, new_args); + return Expr(); } Expr IRMutator::visit(const VectorReduce *op) { diff --git a/src/IRVisitor.cpp b/src/IRVisitor.cpp index 97c55d8075ac..bbca23a77a7b 100644 --- a/src/IRVisitor.cpp +++ b/src/IRVisitor.cpp @@ -258,9 +258,10 @@ void IRVisitor::visit(const Shuffle *op) { } void IRVisitor::visit(const VectorInstruction *op) { - for (const auto &arg : op->args) { - arg.accept(this); - } + internal_error << "Always implement VectorInstruction visitor for IRVisitor subclass\n"; + // for (const auto &arg : op->args) { + // arg.accept(this); + // } } void IRVisitor::visit(const VectorReduce *op) { @@ -522,9 +523,10 @@ void IRGraphVisitor::visit(const Shuffle *op) { } void IRGraphVisitor::visit(const VectorInstruction *op) { - for (const auto &arg : op->args) { - include(arg); - } + internal_error << "Always implement VectorInstruction visitor for IRGraphVisitor subclass\n"; + // for (const auto &arg : op->args) { + // include(arg); + // } } void IRGraphVisitor::visit(const VectorReduce *op) { From 3648ca6172dfffb337865de61ca59058acdb9395 Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Thu, 28 Jul 2022 00:06:39 -0400 Subject: [PATCH 29/55] implement a base class for instruction selection --- Makefile | 2 ++ src/CMakeLists.txt | 2 ++ src/InstructionSelector.cpp | 25 ++++++++++++++++++ src/InstructionSelector.h | 35 +++++++++++++++++++++++++ src/X86Optimize.cpp | 52 ++++++++++++------------------------- 5 files changed, 80 insertions(+), 36 deletions(-) create mode 100644 src/InstructionSelector.cpp create mode 100644 src/InstructionSelector.h diff --git a/Makefile b/Makefile index bd0381578b47..1eebde7970c4 100644 --- a/Makefile +++ b/Makefile @@ -476,6 +476,7 @@ SOURCE_FILES = \ InjectHostDevBufferCopies.cpp \ Inline.cpp \ InlineReductions.cpp \ + InstructionSelector.cpp \ IntegerDivisionTable.cpp \ Interval.cpp \ Introspection.cpp \ @@ -656,6 +657,7 @@ HEADER_FILES = \ InjectHostDevBufferCopies.h \ Inline.h \ InlineReductions.h \ + InstructionSelector.h \ IntegerDivisionTable.h \ Interval.h \ Introspection.h \ diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 64d1a9f4316e..e1c51f19e641 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -81,6 +81,7 @@ set(HEADER_FILES InjectHostDevBufferCopies.h Inline.h InlineReductions.h + InstructionSelector.h IntegerDivisionTable.h Interval.h Introspection.h @@ -245,6 +246,7 @@ set(SOURCE_FILES InjectHostDevBufferCopies.cpp Inline.cpp InlineReductions.cpp + InstructionSelector.cpp 
IntegerDivisionTable.cpp Interval.cpp Introspection.cpp diff --git a/src/InstructionSelector.cpp b/src/InstructionSelector.cpp new file mode 100644 index 000000000000..e408895eec6f --- /dev/null +++ b/src/InstructionSelector.cpp @@ -0,0 +1,25 @@ +#include "InstructionSelector.h" + +#include "CodeGen_Internal.h" + + +namespace Halide { +namespace Internal { + +InstructionSelector::InstructionSelector(const Target &t, const CodeGen_LLVM *c) : target(t), codegen(c) { +} + +Expr InstructionSelector::visit(const Div *op) { + if (!op->type.is_vector() || !op->type.is_int_or_uint()) { + return IRGraphMutator::visit(op); + } + // Lower division here in order to do pattern-matching on intrinsics. + return mutate(lower_int_uint_div(op->a, op->b)); +} + +Expr InstructionSelector::visit(const VectorReduce *op) { + return codegen->split_vector_reduce(op, Expr()); +} + +} // namespace Internal +} // namespace Halide diff --git a/src/InstructionSelector.h b/src/InstructionSelector.h new file mode 100644 index 000000000000..723ec6addd69 --- /dev/null +++ b/src/InstructionSelector.h @@ -0,0 +1,35 @@ +#ifndef HALIDE_INSTR_SELECTOR_H +#define HALIDE_INSTR_SELECTOR_H + +/** \file + * Defines a base class for VectorInstruction selection. + */ + +#include "CodeGen_LLVM.h" +#include "IR.h" +#include "IRMutator.h" +#include "Target.h" + +namespace Halide { +namespace Internal { + +/** A base class for vector instruction selection. + * The default implementation lowers int and uint + * division via `lower_int_uint_div` and splits + * VectorReduce nodes via CodeGen_LLVM::split_vector_reduce(). + */ +class InstructionSelector : public IRGraphMutator { +protected: + const Target &target; + const CodeGen_LLVM *codegen; + + Expr visit(const Div *) override; + Expr visit(const VectorReduce *) override; +public: + InstructionSelector(const Target &target, const CodeGen_LLVM *codegen); +}; + +} // namespace Internal +} // namespace Halide + +#endif diff --git a/src/X86Optimize.cpp b/src/X86Optimize.cpp index 4972e5c775c4..bf2c317f189b 100644 --- a/src/X86Optimize.cpp +++ b/src/X86Optimize.cpp @@ -1,9 +1,8 @@ #include "X86Optimize.h" #include "CSE.h" -// FIXME: move lower_int_uint_div out of CodeGen_Internal to remove this dependency. -#include "CodeGen_Internal.h" #include "FindIntrinsics.h" +#include "InstructionSelector.h" #include "IR.h" #include "IRMatch.h" #include "IRMutator.h" #include "IROperator.h" #include "Simplify.h" namespace Halide { namespace Internal { @@ -61,12 +60,12 @@ bool should_use_dot_product(const Expr &a, const Expr &b, std::vector &res } /** A top-down code optimizer that replaces Halide IR with VectorInstructions specific to x86. */ -class Optimize_X86 : public IRMutator { +class Optimize_X86 : public InstructionSelector { public: /** Create an x86 code optimizer. Processor features can be * enabled using the appropriate flags in the target struct. */ - Optimize_X86(const Target &t, const CodeGen_LLVM *c) - : target(t), codegen(c) { + Optimize_X86(const Target &target, const CodeGen_LLVM *codegen) + : InstructionSelector(target, codegen) { } protected: @@ -77,20 +76,12 @@ class Optimize_X86 : public InstructionSelector { return type.is_vector(); } - using IRMutator::visit; - - Expr visit(const Div *op) override { - if (!should_peephole_optimize(op->type) || !op->type.is_int_or_uint()) { - return IRMutator::visit(op); - } - // Lower division here in order to do pattern-matching on intrinsics. 
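// Reviewer's aside, not part of either file: the x86-specific Div visitor deleted below is essentially the same lowering that the new InstructionSelector::visit(const Div *) above now provides for every backend.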
- return mutate(lower_int_uint_div(op->a, op->b)); - } + using IRGraphMutator::visit; /** Nodes for which we want to emit specific sse/avx intrinsics */ Expr visit(const Add *op) override { if (!should_peephole_optimize(op->type)) { - return IRMutator::visit(op); + return IRGraphMutator::visit(op); } std::vector matches; @@ -152,12 +143,12 @@ class Optimize_X86 : public IRMutator { return mutate(VectorInstruction::make(op->type, VectorInstruction::dot_product, {ac, bd})); } - return IRMutator::visit(op); + return IRGraphMutator::visit(op); } Expr visit(const Sub *op) override { if (!should_peephole_optimize(op->type)) { - return IRMutator::visit(op); + return IRGraphMutator::visit(op); } std::vector matches; @@ -181,12 +172,12 @@ class Optimize_X86 : public IRMutator { } } - return IRMutator::visit(op); + return IRGraphMutator::visit(op); } Expr visit(const Cast *op) override { if (!should_peephole_optimize(op->type)) { - return IRMutator::visit(op); + return IRGraphMutator::visit(op); } const int lanes = op->type.lanes(); @@ -248,12 +239,12 @@ class Optimize_X86 : public IRMutator { // TODO: should we handle CodeGen_X86's weird 8 -> 16 bit issue here? - return IRMutator::visit(op); + return IRGraphMutator::visit(op); } Expr visit(const Call *op) override { if (!should_peephole_optimize(op->type)) { - return IRMutator::visit(op); + return IRGraphMutator::visit(op); } // TODO: This optimization is hard to do via a rewrite-rule because of lossless_cast. @@ -405,6 +396,7 @@ class Optimize_X86 : public IRMutator { // Fixed-point intrinsics should be lowered here. // This is safe because this mutator is top-down. + // FIXME: Should this be default behavior of the base InstructionSelector class? if (op->is_intrinsic({ Call::halving_add, Call::halving_sub, @@ -426,7 +418,7 @@ class Optimize_X86 : public IRMutator { return mutate(lower_intrinsic(op)); } - return IRMutator::visit(op); + return IRGraphMutator::visit(op); } Expr visit(const VectorReduce *op) override { @@ -435,7 +427,7 @@ class Optimize_X86 : public IRMutator { // matching here. 
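// Note: only Add and SaturatingAdd reductions receive x86-specific rewrites in this visitor; everything else now falls through to InstructionSelector::visit(const VectorReduce *), which defers to codegen->split_vector_reduce() for generic splitting.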
if ((op->op != VectorReduce::Add && op->op != VectorReduce::SaturatingAdd) || !should_peephole_optimize(op->type)) { - return IRMutator::visit(op); + return InstructionSelector::visit(op); } const int lanes = op->type.lanes(); @@ -525,22 +517,10 @@ class Optimize_X86 : public IRMutator { break; } - return attempt_vector_split(op); - } - - Expr attempt_vector_split(const VectorReduce *op) { - Expr split = codegen->split_vector_reduce(op, Expr()); - if (split.defined() && !split.same_as(op)) { - return mutate(split); - } else { - return IRMutator::visit(op); - } + return InstructionSelector::visit(op); } private: - const Target ⌖ - const CodeGen_LLVM *codegen; - IRMatcher::Wild<0> x; IRMatcher::Wild<1> y; IRMatcher::Wild<2> z; From c21bec5bbdcfb459cb31cc914c2189f0cee71139 Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Thu, 28 Jul 2022 00:24:05 -0400 Subject: [PATCH 30/55] clang format --- src/InstructionSelector.cpp | 4 ++-- src/InstructionSelector.h | 1 + src/X86Optimize.cpp | 44 ++++++++++++++++++------------------- 3 files changed, 25 insertions(+), 24 deletions(-) diff --git a/src/InstructionSelector.cpp b/src/InstructionSelector.cpp index e408895eec6f..569c04925638 100644 --- a/src/InstructionSelector.cpp +++ b/src/InstructionSelector.cpp @@ -2,11 +2,11 @@ #include "CodeGen_Internal.h" - namespace Halide { namespace Internal { -InstructionSelector::InstructionSelector(const Target &t, const CodeGen_LLVM *c) : target(t), codegen(c) { +InstructionSelector::InstructionSelector(const Target &t, const CodeGen_LLVM *c) + : target(t), codegen(c) { } Expr InstructionSelector::visit(const Div *op) { diff --git a/src/InstructionSelector.h b/src/InstructionSelector.h index 723ec6addd69..351a7c785a58 100644 --- a/src/InstructionSelector.h +++ b/src/InstructionSelector.h @@ -25,6 +25,7 @@ class InstructionSelector : public IRGraphMutator { Expr visit(const Div *) override; Expr visit(const VectorReduce *) override; + public: InstructionSelector(const Target &target, const CodeGen_LLVM *codegen); }; diff --git a/src/X86Optimize.cpp b/src/X86Optimize.cpp index 99918be40f30..541337fb14f9 100644 --- a/src/X86Optimize.cpp +++ b/src/X86Optimize.cpp @@ -2,11 +2,11 @@ #include "CSE.h" #include "FindIntrinsics.h" -#include "InstructionSelector.h" #include "IR.h" #include "IRMatch.h" #include "IRMutator.h" #include "IROperator.h" +#include "InstructionSelector.h" #include "Simplify.h" namespace Halide { @@ -490,40 +490,40 @@ class Optimize_X86 : public InstructionSelector { // psadbw is always supported via SSE2. ((factor == 8) && (rewrite( - h_add(cast(UInt(64, value_lanes), absd(x, y)), lanes), - v_intrin(VectorInstruction::sum_absd, x, y), - is_uint(x, 8) && is_uint(y, 8)) || - + h_add(cast(UInt(64, value_lanes), absd(x, y)), lanes), + v_intrin(VectorInstruction::sum_absd, x, y), + is_uint(x, 8) && is_uint(y, 8)) || + // Rewrite non-native sum-of-absolute-difference variants to the native // op. We support reducing to various types. We could consider supporting // multiple reduction factors too, but in general we don't handle non-native // reduction factors for VectorReduce nodes (yet?). 
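// Sketch of why the narrowing casts below are lossless (assuming u8 inputs, as the is_uint guards require): psadbw sums eight |x - y| differences into a u64 lane, and the largest possible sum is 8 * 255 = 2040, which fits in any accumulator of 16 bits or wider.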
rewrite( - h_add(cast(UInt(16, value_lanes), absd(x, y)), lanes), - cast(UInt(16, lanes), typed(UInt(64, lanes), v_intrin(VectorInstruction::sum_absd, x, y))), - is_uint(x, 8) && is_uint(y, 8)) || - + h_add(cast(UInt(16, value_lanes), absd(x, y)), lanes), + cast(UInt(16, lanes), typed(UInt(64, lanes), v_intrin(VectorInstruction::sum_absd, x, y))), + is_uint(x, 8) && is_uint(y, 8)) || + rewrite( - h_add(cast(UInt(32, value_lanes), absd(x, y)), lanes), - cast(UInt(32, lanes), typed(UInt(64, lanes), v_intrin(VectorInstruction::sum_absd, x, y))), - is_uint(x, 8) && is_uint(y, 8)) || + h_add(cast(UInt(32, value_lanes), absd(x, y)), lanes), + cast(UInt(32, lanes), typed(UInt(64, lanes), v_intrin(VectorInstruction::sum_absd, x, y))), + is_uint(x, 8) && is_uint(y, 8)) || rewrite( - h_add(cast(Int(16, value_lanes), absd(x, y)), lanes), - cast(Int(16, lanes), typed(UInt(64, lanes), v_intrin(VectorInstruction::sum_absd, x, y))), - is_uint(x, 8) && is_uint(y, 8)) || + h_add(cast(Int(16, value_lanes), absd(x, y)), lanes), + cast(Int(16, lanes), typed(UInt(64, lanes), v_intrin(VectorInstruction::sum_absd, x, y))), + is_uint(x, 8) && is_uint(y, 8)) || rewrite( - h_add(cast(Int(32, value_lanes), absd(x, y)), lanes), - cast(Int(32, lanes), typed(UInt(64, lanes), v_intrin(VectorInstruction::sum_absd, x, y))), - is_uint(x, 8) && is_uint(y, 8)) || + h_add(cast(Int(32, value_lanes), absd(x, y)), lanes), + cast(Int(32, lanes), typed(UInt(64, lanes), v_intrin(VectorInstruction::sum_absd, x, y))), + is_uint(x, 8) && is_uint(y, 8)) || rewrite( - h_add(cast(Int(64, value_lanes), absd(x, y)), lanes), - cast(Int(64, lanes), typed(UInt(64, lanes), v_intrin(VectorInstruction::sum_absd, x, y))), - is_uint(x, 8) && is_uint(y, 8)) || + h_add(cast(Int(64, value_lanes), absd(x, y)), lanes), + cast(Int(64, lanes), typed(UInt(64, lanes), v_intrin(VectorInstruction::sum_absd, x, y))), + is_uint(x, 8) && is_uint(y, 8)) || - false))) { + false))) { return mutate(rewrite.result); } break; From e6502f8a300f01616ec22c92deb858a8b45171e7 Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Thu, 28 Jul 2022 00:32:28 -0400 Subject: [PATCH 31/55] fix last remnants of vector intrinsic -> vector instruction renaming --- src/IRMatch.h | 8 ++--- src/IRPrinter.cpp | 2 +- src/StmtToHtml.cpp | 2 +- src/X86Optimize.cpp | 78 ++++++++++++++++++++++----------------------- 4 files changed, 45 insertions(+), 45 deletions(-) diff --git a/src/IRMatch.h b/src/IRMatch.h index 982a8dcaeace..60d54f8e391d 100644 --- a/src/IRMatch.h +++ b/src/IRMatch.h @@ -1884,7 +1884,7 @@ HALIDE_ALWAYS_INLINE auto ramp(A &&a, B &&b, C &&c) noexcept -> RampOp -struct VectorIntrinOp { +struct VectorInstrOp { struct pattern_tag {}; const VectorInstruction::InstructionOp op; std::tuple args; @@ -1964,7 +1964,7 @@ struct VectorIntrinOp { constexpr static bool foldable = false; HALIDE_ALWAYS_INLINE - VectorIntrinOp(const VectorInstruction::InstructionOp _op, Args... args) noexcept + VectorInstrOp(const VectorInstruction::InstructionOp _op, Args... args) noexcept : op(_op), args(args...) { static_assert(sizeof...(Args) > 0 && sizeof...(Args) <= 3, "VectorInstructionOp must have non-zero arguments, and update make() if more than 3 arguments."); @@ -1972,7 +1972,7 @@ struct VectorIntrinOp { }; template -std::ostream &operator<<(std::ostream &s, const VectorIntrinOp &op) { +std::ostream &operator<<(std::ostream &s, const VectorInstrOp &op) { // TODO(rootjalex): Should we print the type? 
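// Note, an observation rather than a fix: op.op is a plain enum, so the stream insertion below prints a bare integer between the quotes; mapping it back to a name would need a lookup helper, since instruction_op_names is file-local to IR.cpp.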
s << "vector_intrin(\""; s << op.op << "\", "; @@ -1982,7 +1982,7 @@ std::ostream &operator<<(std::ostream &s, const VectorIntrinOp &op) { } template -HALIDE_ALWAYS_INLINE auto v_intrin(const VectorInstruction::InstructionOp op, Args... args) noexcept -> VectorIntrinOp { +HALIDE_ALWAYS_INLINE auto v_instr(const VectorInstruction::InstructionOp op, Args... args) noexcept -> VectorInstrOp { return {op, pattern_arg(args)...}; } diff --git a/src/IRPrinter.cpp b/src/IRPrinter.cpp index 324b399e5548..78d0e087d7cb 100644 --- a/src/IRPrinter.cpp +++ b/src/IRPrinter.cpp @@ -1076,7 +1076,7 @@ void IRPrinter::visit(const Shuffle *op) { void IRPrinter::visit(const VectorInstruction *op) { stream << "(" << op->type - << ")vector_intrinsic(\"" + << ")vector_instruction(\"" << op->get_instruction_name() << "\", "; print_list(op->args); diff --git a/src/StmtToHtml.cpp b/src/StmtToHtml.cpp index 8dd6afdc8a73..36db8155c525 100644 --- a/src/StmtToHtml.cpp +++ b/src/StmtToHtml.cpp @@ -715,7 +715,7 @@ class StmtToHtml : public IRVisitor { void visit(const VectorInstruction *op) override { stream << open_span("VectorInstruction"); stream << open_span("Type") << op->type << close_span(); - print_list(symbol("vector_intrinsic") + "(\"" + op->get_instruction_name() + "\"", op->args, ")"); + print_list(symbol("vector_instruction") + "(\"" + op->get_instruction_name() + "\"", op->args, ")"); stream << close_span(); } diff --git a/src/X86Optimize.cpp b/src/X86Optimize.cpp index 541337fb14f9..b6e5812ab07f 100644 --- a/src/X86Optimize.cpp +++ b/src/X86Optimize.cpp @@ -101,33 +101,33 @@ class Optimize_X86 : public InstructionSelector { // Accumulating pmaddubsw (rewrite( x + h_add(cast(Int(32, lanes * 4), widening_mul(y, z)), lanes), - v_intrin(VectorInstruction::dot_product, x, y, z), + v_instr(VectorInstruction::dot_product, x, y, z), is_uint(y, 8) && is_int(z, 8)) || rewrite( x + h_add(cast(Int(32, lanes * 4), widening_mul(y, z)), lanes), - v_intrin(VectorInstruction::dot_product, x, z, y), + v_instr(VectorInstruction::dot_product, x, z, y), is_int(y, 8) && is_uint(z, 8)) || rewrite( h_add(cast(Int(32, lanes * 4), widening_mul(x, y)), lanes) + z, - v_intrin(VectorInstruction::dot_product, z, x, y), + v_instr(VectorInstruction::dot_product, z, x, y), is_uint(x, 8) && is_int(y, 8)) || rewrite( h_add(cast(Int(32, lanes * 4), widening_mul(x, y)), lanes) + z, - v_intrin(VectorInstruction::dot_product, z, y, x), + v_instr(VectorInstruction::dot_product, z, y, x), is_int(x, 8) && is_uint(y, 8)) || // Accumulating pmaddwd. 
rewrite( x + h_add(widening_mul(y, z), lanes), - v_intrin(VectorInstruction::dot_product, x, y, z), + v_instr(VectorInstruction::dot_product, x, y, z), is_int(y, 16, lanes * 2) && is_int(z, 16, lanes * 2)) || rewrite( h_add(widening_mul(x, y), lanes) + z, - v_intrin(VectorInstruction::dot_product, z, x, y), + v_instr(VectorInstruction::dot_product, z, x, y), is_int(x, 16, lanes * 2) && is_int(y, 16, lanes * 2)) || false)) { @@ -199,38 +199,38 @@ class Optimize_X86 : public InstructionSelector { (target.has_feature(Target::SSE41) && rewrite( cast(Int(16, lanes), rounding_shift_right(widening_mul(x, y), 15)), - v_intrin(VectorInstruction::pmulhrs, x, y), + v_instr(VectorInstruction::pmulhrs, x, y), is_int(x, 16) && is_int(y, 16))) || // saturating_narrow is always supported (via SSE2) for: // int32 -> int16, int16 -> int8, int16 -> uint8 rewrite( cast(Int(16, lanes), max(min(x, i32_i16min), i32_i16min)), - v_intrin(VectorInstruction::saturating_narrow, x), + v_instr(VectorInstruction::saturating_narrow, x), is_int(x, 32)) || rewrite( cast(Int(8, lanes), max(min(x, i16_i8min), i16_i8min)), - v_intrin(VectorInstruction::saturating_narrow, x), + v_instr(VectorInstruction::saturating_narrow, x), is_int(x, 16)) || rewrite( cast(UInt(8, lanes), max(min(x, i16_u8min), i16_u8min)), - v_intrin(VectorInstruction::saturating_narrow, x), + v_instr(VectorInstruction::saturating_narrow, x), is_int(x, 16)) || // int32 -> uint16 is supported via SSE41 (target.has_feature(Target::SSE41) && rewrite( cast(UInt(16, lanes), max(min(x, i32_u16min), i32_u16min)), - v_intrin(VectorInstruction::saturating_narrow, x), + v_instr(VectorInstruction::saturating_narrow, x), is_int(x, 32))) || // f32_to_bf16 is supported only via Target::AVX512_SapphireRapids (target.has_feature(Target::AVX512_SapphireRapids) && rewrite( cast(BFloat(16, lanes), x), - v_intrin(VectorInstruction::f32_to_bf16, x), + v_instr(VectorInstruction::f32_to_bf16, x), is_float(x, 32))) || false) { @@ -313,7 +313,7 @@ class Optimize_X86 : public InstructionSelector { ((op->type.is_int_or_uint() && bits == 16) && rewrite( mul_shift_right(x, y, 16), - v_intrin(VectorInstruction::pmulh, x, y))) || + v_instr(VectorInstruction::pmulh, x, y))) || // saturating_pmulhrs is supported via SSE41 ((target.has_feature(Target::SSE41) && @@ -323,7 +323,7 @@ class Optimize_X86 : public InstructionSelector { // saturating_pmulhrs select((x == typed(Int(16, lanes), -32768)) && (y == typed(Int(16, lanes), -32768)), typed(Int(16, lanes), 32767), - v_intrin(VectorInstruction::pmulhrs, x, y)))) || + v_instr(VectorInstruction::pmulhrs, x, y)))) || // TODO(rootjalex): The following intrinsics are // simply one-to-one mappings, should they even @@ -335,29 +335,29 @@ class Optimize_X86 : public InstructionSelector { (op->type.is_float() && bits == 32)) && rewrite( abs(x), - v_intrin(VectorInstruction::abs, x))) || + v_instr(VectorInstruction::abs, x))) || // saturating ops for 8 and 16 bits are always supported (via SSE2). ((bits == 8 || bits == 16) && (rewrite( saturating_add(x, y), - v_intrin(VectorInstruction::saturating_add, x, y)) || + v_instr(VectorInstruction::saturating_add, x, y)) || rewrite( saturating_sub(x, y), - v_intrin(VectorInstruction::saturating_sub, x, y)))) || + v_instr(VectorInstruction::saturating_sub, x, y)))) || // pavg ops for 8 and 16 bits are always supported (via SSE2). 
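// Note (unsigned operands only, per the is_uint guard below): pavgb/pavgw compute (a + b + 1) >> 1 in a widened intermediate, which is precisely rounding_halving_add with no overflow risk.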
((op->type.is_uint() && (bits == 8 || bits == 16)) && rewrite( rounding_halving_add(x, y), - v_intrin(VectorInstruction::rounding_halving_add, x, y))) || + v_instr(VectorInstruction::rounding_halving_add, x, y))) || // int16 -> int32 widening_mul has a (v)pmaddwd implementation. // always supported (via SSE2). ((op->type.is_int() && (bits == 32)) && rewrite( widening_mul(x, y), - v_intrin(VectorInstruction::widening_mul, x, y), + v_instr(VectorInstruction::widening_mul, x, y), is_int(x, 16) && is_int(y, 16))) || (target.has_feature(Target::AVX512_SapphireRapids) && @@ -365,27 +365,27 @@ class Optimize_X86 : public InstructionSelector { // SapphireRapids accumulating dot products. (rewrite( saturating_add(x, h_satadd(cast(Int(32, lanes * 4), widening_mul(y, z)), lanes)), - v_intrin(VectorInstruction::saturating_dot_product, x, y, z), + v_instr(VectorInstruction::saturating_dot_product, x, y, z), is_uint(y, 8) && is_int(z, 8)) || rewrite( saturating_add(x, h_satadd(cast(Int(32, lanes * 4), widening_mul(y, z)), lanes)), - v_intrin(VectorInstruction::saturating_dot_product, x, z, y), + v_instr(VectorInstruction::saturating_dot_product, x, z, y), is_int(y, 8) && is_uint(z, 8)) || rewrite( saturating_add(x, h_satadd(cast(Int(32, lanes * 2), widening_mul(y, z)), lanes)), - v_intrin(VectorInstruction::saturating_dot_product, x, y, z), + v_instr(VectorInstruction::saturating_dot_product, x, y, z), is_uint(y, 8) && is_int(z, 8)) || rewrite( saturating_add(x, h_satadd(cast(Int(32, lanes * 2), widening_mul(y, z)), lanes)), - v_intrin(VectorInstruction::saturating_dot_product, x, z, y), + v_instr(VectorInstruction::saturating_dot_product, x, z, y), is_int(y, 8) && is_uint(z, 8)) || rewrite( saturating_add(x, h_satadd(widening_mul(y, z), lanes)), - v_intrin(VectorInstruction::saturating_dot_product, x, z, y), + v_instr(VectorInstruction::saturating_dot_product, x, z, y), is_int(y, 16, lanes * 2) && is_int(z, 16, lanes * 2)) || false)) || @@ -445,18 +445,18 @@ class Optimize_X86 : public InstructionSelector { ((factor == 2) && (rewrite( h_add(cast(Int(32, value_lanes), widening_mul(x, y)), lanes), - v_intrin(VectorInstruction::dot_product, cast(Int(16, value_lanes), x), cast(Int(16, value_lanes), y)), + v_instr(VectorInstruction::dot_product, cast(Int(16, value_lanes), x), cast(Int(16, value_lanes), y)), x_is_int_or_uint && y_is_int_or_uint) || // Horizontal widening add via pmaddwd rewrite( h_add(cast(Int(32, value_lanes), x), lanes), - v_intrin(VectorInstruction::dot_product, x, make_const(Int(16, value_lanes), 1)), + v_instr(VectorInstruction::dot_product, x, make_const(Int(16, value_lanes), 1)), is_int(x, 16)) || (rewrite( h_add(widening_mul(x, y), lanes), - v_intrin(VectorInstruction::dot_product, x, y), + v_instr(VectorInstruction::dot_product, x, y), is_int(x, 16) && is_int(y, 16))) || // pmaddub supported via SSE41 @@ -464,23 +464,23 @@ class Optimize_X86 : public InstructionSelector { // Horizontal widening adds using 2-way saturating dot products. 
(rewrite( h_add(cast(UInt(16, value_lanes), x), lanes), - cast(UInt(16, lanes), typed(Int(16, lanes), v_intrin(VectorInstruction::saturating_dot_product, x, make_const(Int(8, value_lanes), 1)))), + cast(UInt(16, lanes), typed(Int(16, lanes), v_instr(VectorInstruction::saturating_dot_product, x, make_const(Int(8, value_lanes), 1)))), is_uint(x, 8)) || rewrite( h_add(cast(Int(16, value_lanes), x), lanes), - v_intrin(VectorInstruction::saturating_dot_product, x, make_const(Int(8, value_lanes), 1)), + v_instr(VectorInstruction::saturating_dot_product, x, make_const(Int(8, value_lanes), 1)), is_uint(x, 8)) || rewrite( h_add(cast(Int(16, value_lanes), x), lanes), - v_intrin(VectorInstruction::saturating_dot_product, make_const(UInt(8, value_lanes), 1), x), + v_instr(VectorInstruction::saturating_dot_product, make_const(UInt(8, value_lanes), 1), x), is_int(x, 8)) || // SSE41 and AVX2 support horizontal_add via phadd intrinsics. rewrite( h_add(x, lanes), - v_intrin(VectorInstruction::horizontal_add, x), + v_instr(VectorInstruction::horizontal_add, x), is_int(x, 16, lanes * 2) || is_uint(x, 16, lanes * 2) || is_int(x, 32, lanes * 2) || is_uint(x, 32, lanes * 2)) || @@ -491,7 +491,7 @@ class Optimize_X86 : public InstructionSelector { ((factor == 8) && (rewrite( h_add(cast(UInt(64, value_lanes), absd(x, y)), lanes), - v_intrin(VectorInstruction::sum_absd, x, y), + v_instr(VectorInstruction::sum_absd, x, y), is_uint(x, 8) && is_uint(y, 8)) || // Rewrite non-native sum-of-absolute-difference variants to the native @@ -500,27 +500,27 @@ class Optimize_X86 : public InstructionSelector { // reduction factors for VectorReduce nodes (yet?). rewrite( h_add(cast(UInt(16, value_lanes), absd(x, y)), lanes), - cast(UInt(16, lanes), typed(UInt(64, lanes), v_intrin(VectorInstruction::sum_absd, x, y))), + cast(UInt(16, lanes), typed(UInt(64, lanes), v_instr(VectorInstruction::sum_absd, x, y))), is_uint(x, 8) && is_uint(y, 8)) || rewrite( h_add(cast(UInt(32, value_lanes), absd(x, y)), lanes), - cast(UInt(32, lanes), typed(UInt(64, lanes), v_intrin(VectorInstruction::sum_absd, x, y))), + cast(UInt(32, lanes), typed(UInt(64, lanes), v_instr(VectorInstruction::sum_absd, x, y))), is_uint(x, 8) && is_uint(y, 8)) || rewrite( h_add(cast(Int(16, value_lanes), absd(x, y)), lanes), - cast(Int(16, lanes), typed(UInt(64, lanes), v_intrin(VectorInstruction::sum_absd, x, y))), + cast(Int(16, lanes), typed(UInt(64, lanes), v_instr(VectorInstruction::sum_absd, x, y))), is_uint(x, 8) && is_uint(y, 8)) || rewrite( h_add(cast(Int(32, value_lanes), absd(x, y)), lanes), - cast(Int(32, lanes), typed(UInt(64, lanes), v_intrin(VectorInstruction::sum_absd, x, y))), + cast(Int(32, lanes), typed(UInt(64, lanes), v_instr(VectorInstruction::sum_absd, x, y))), is_uint(x, 8) && is_uint(y, 8)) || rewrite( h_add(cast(Int(64, value_lanes), absd(x, y)), lanes), - cast(Int(64, lanes), typed(UInt(64, lanes), v_intrin(VectorInstruction::sum_absd, x, y))), + cast(Int(64, lanes), typed(UInt(64, lanes), v_instr(VectorInstruction::sum_absd, x, y))), is_uint(x, 8) && is_uint(y, 8)) || false))) { @@ -535,12 +535,12 @@ class Optimize_X86 : public InstructionSelector { ((factor == 2) && target.has_feature(Target::SSE41) && (rewrite( h_satadd(widening_mul(x, y), lanes), - v_intrin(VectorInstruction::saturating_dot_product, x, y), + v_instr(VectorInstruction::saturating_dot_product, x, y), is_uint(x, 8) && is_int(y, 8)) || rewrite( h_satadd(widening_mul(x, y), lanes), - v_intrin(VectorInstruction::saturating_dot_product, y, x), + 
v_instr(VectorInstruction::saturating_dot_product, y, x), is_int(x, 8) && is_uint(y, 8)) || false))) { From 6d2bfd145f5692b7e5d3025f677c10e9c1aca4b4 Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Thu, 28 Jul 2022 01:05:26 -0400 Subject: [PATCH 32/55] fix virtual func hidden error --- src/InstructionSelector.h | 1 + 1 file changed, 1 insertion(+) diff --git a/src/InstructionSelector.h b/src/InstructionSelector.h index 351a7c785a58..e0489da14979 100644 --- a/src/InstructionSelector.h +++ b/src/InstructionSelector.h @@ -23,6 +23,7 @@ class InstructionSelector : public IRGraphMutator { const Target &target; const CodeGen_LLVM *codegen; + using IRGraphMutator::visit; Expr visit(const Div *) override; Expr visit(const VectorReduce *) override; From fa2d4e246efcd22e8a18d9e90b2211bfe76f4eec Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Thu, 28 Jul 2022 01:09:31 -0400 Subject: [PATCH 33/55] remove 'implement VI visitor' error msg --- src/IRMutator.cpp | 14 +++++++------- src/IRVisitor.cpp | 16 ++++++++-------- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/src/IRMutator.cpp b/src/IRMutator.cpp index 9ac27753ae09..7b05b46ac22f 100644 --- a/src/IRMutator.cpp +++ b/src/IRMutator.cpp @@ -328,13 +328,13 @@ Expr IRMutator::visit(const Shuffle *op) { } Expr IRMutator::visit(const VectorInstruction *op) { - internal_error << "Always implement VectorInstruction visitor for IRMutator subclass\n"; - // auto [new_args, changed] = mutate_with_changes(op->args); - // if (!changed) { - // return op; - // } - // return VectorInstruction::make(op->type, op->op, new_args); - return Expr(); + // internal_error << "Always implement VectorInstruction visitor for IRMutator subclass\n"; + auto [new_args, changed] = mutate_with_changes(op->args); + if (!changed) { + return op; + } + return VectorInstruction::make(op->type, op->op, new_args); + // return Expr(); } Expr IRMutator::visit(const VectorReduce *op) { diff --git a/src/IRVisitor.cpp b/src/IRVisitor.cpp index bbca23a77a7b..2332d7bcc2e3 100644 --- a/src/IRVisitor.cpp +++ b/src/IRVisitor.cpp @@ -258,10 +258,10 @@ void IRVisitor::visit(const Shuffle *op) { } void IRVisitor::visit(const VectorInstruction *op) { - internal_error << "Always implement VectorInstruction visitor for IRVisitor subclass\n"; - // for (const auto &arg : op->args) { - // arg.accept(this); - // } + // internal_error << "Always implement VectorInstruction visitor for IRVisitor subclass\n"; + for (const auto &arg : op->args) { + arg.accept(this); + } } void IRVisitor::visit(const VectorReduce *op) { @@ -523,10 +523,10 @@ void IRGraphVisitor::visit(const Shuffle *op) { } void IRGraphVisitor::visit(const VectorInstruction *op) { - internal_error << "Always implement VectorInstruction visitor for IRGraphVisitor subclass\n"; - // for (const auto &arg : op->args) { - // include(arg); - // } + // internal_error << "Always implement VectorInstruction visitor for IRGraphVisitor subclass\n"; + for (const auto &arg : op->args) { + include(arg); + } } void IRGraphVisitor::visit(const VectorReduce *op) { From ec2cd4ebf2e66ac9f927184dfe1121e5402288ee Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Thu, 28 Jul 2022 12:41:49 -0400 Subject: [PATCH 34/55] address nits --- src/IRMatch.h | 10 +++++----- src/InstructionSelector.cpp | 10 +++++----- src/InstructionSelector.h | 4 ++-- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/src/IRMatch.h b/src/IRMatch.h index 60d54f8e391d..210688056066 100644 --- a/src/IRMatch.h +++ b/src/IRMatch.h @@ -1884,7 +1884,7 @@
HALIDE_ALWAYS_INLINE auto ramp(A &&a, B &&b, C &&c) noexcept -> RampOp -struct VectorInstrOp { +struct VectorInstructionOp { struct pattern_tag {}; const VectorInstruction::InstructionOp op; std::tuple args; @@ -1964,7 +1964,7 @@ struct VectorInstrOp { constexpr static bool foldable = false; HALIDE_ALWAYS_INLINE - VectorInstrOp(const VectorInstruction::InstructionOp _op, Args... args) noexcept + VectorInstructionOp(const VectorInstruction::InstructionOp _op, Args... args) noexcept : op(_op), args(args...) { static_assert(sizeof...(Args) > 0 && sizeof...(Args) <= 3, "VectorInstructionOp must have non-zero arguments, and update make() if more than 3 arguments."); @@ -1972,9 +1972,9 @@ struct VectorInstrOp { }; template -std::ostream &operator<<(std::ostream &s, const VectorInstrOp &op) { +std::ostream &operator<<(std::ostream &s, const VectorInstructionOp &op) { // TODO(rootjalex): Should we print the type? - s << "vector_intrin(\""; + s << "vector_instr(\""; s << op.op << "\", "; op.print_args(s); s << ")"; @@ -1982,7 +1982,7 @@ std::ostream &operator<<(std::ostream &s, const VectorInstrOp &op) { } template -HALIDE_ALWAYS_INLINE auto v_instr(const VectorInstruction::InstructionOp op, Args... args) noexcept -> VectorInstrOp { +HALIDE_ALWAYS_INLINE auto v_instr(const VectorInstruction::InstructionOp op, Args... args) noexcept -> VectorInstructionOp { return {op, pattern_arg(args)...}; } diff --git a/src/InstructionSelector.cpp b/src/InstructionSelector.cpp index 569c04925638..35bb63a640a4 100644 --- a/src/InstructionSelector.cpp +++ b/src/InstructionSelector.cpp @@ -10,15 +10,15 @@ InstructionSelector::InstructionSelector(const Target &t, const CodeGen_LLVM *c) } Expr InstructionSelector::visit(const Div *op) { - if (!op->type.is_vector() || !op->type.is_int_or_uint()) { - return IRGraphMutator::visit(op); + if (op->type.is_vector() && op->type.is_int_or_uint()) { + // Lower division here in order to do pattern-matching on intrinsics. + return mutate(lower_int_uint_div(op->a, op->b)); } - // Lower division here in order to do pattern-matching on intrinsics. - return mutate(lower_int_uint_div(op->a, op->b)); + return IRGraphMutator::visit(op); } Expr InstructionSelector::visit(const VectorReduce *op) { - return codegen->split_vector_reduce(op, Expr()); + return mutate(codegen->split_vector_reduce(op, Expr())); } } // namespace Internal diff --git a/src/InstructionSelector.h b/src/InstructionSelector.h index e0489da14979..d8e8d44e6761 100644 --- a/src/InstructionSelector.h +++ b/src/InstructionSelector.h @@ -1,5 +1,5 @@ -#ifndef HALIDE_INSTR_SELECTOR_H -#define HALIDE_INSTR_SELECTOR_H +#ifndef HALIDE_INSTRUCTION_SELECTOR_H +#define HALIDE_INSTRUCTION_SELECTOR_H /** \file * Defines a base class for VectorInstruction selection. 
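The patches up to this point complete a small instruction-selection DSL: IRMatcher wilds describe the expression tree to match, v_instr(...) builds the VectorInstruction node to emit, and an optional third argument to rewrite() guards the rule with type predicates. A condensed sketch of the shape of such a rule, using only names that appear in these diffs (the visitor boilerplate is abbreviated, not copied verbatim from the branch):

    // Sketch: one rule inside an InstructionSelector subclass. Assumes the
    // usual IRMatcher wild declarations, as in Simplify_Internal.h.
    IRMatcher::Wild<0> x;
    IRMatcher::Wild<1> y;

    Expr visit(const Cast *op) override {
        const int lanes = op->type.lanes();
        auto rewrite = IRMatcher::rewriter(IRMatcher::cast(op->type, op->value), op->type);
        if (rewrite(cast(Int(16, lanes), rounding_shift_right(widening_mul(x, y), 15)),
                    v_instr(VectorInstruction::pmulhrs, x, y),
                    is_int(x, 16) && is_int(y, 16))) {
            // Re-mutate the result so nested opportunities are also selected.
            return mutate(rewrite.result);
        }
        return IRGraphMutator::visit(op);
    }

Each rewrite() call returns true on a match, so the || chains in the diffs above try rules in order and the first hit wins; the trailing `false` terminates a chain.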
From 0e5cfcfe2520fdc8b58b992358edef57d9d7f5d9 Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Thu, 28 Jul 2022 13:39:17 -0400 Subject: [PATCH 35/55] temporary HVX/CSE fix --- src/X86Optimize.cpp | 15 ++++++++++----- src/X86Optimize.h | 2 +- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/src/X86Optimize.cpp b/src/X86Optimize.cpp index b6e5812ab07f..b5e77cf32c5b 100644 --- a/src/X86Optimize.cpp +++ b/src/X86Optimize.cpp @@ -563,16 +563,21 @@ class Optimize_X86 : public InstructionSelector { } // namespace -Stmt optimize_x86_instructions(Stmt stmt, const Target &target, const CodeGen_LLVM *codegen) { - stmt = Optimize_X86(target, codegen).mutate(stmt); +Stmt optimize_x86_instructions(const Stmt &s, const Target &target, const CodeGen_LLVM *codegen) { + Stmt stmt = Optimize_X86(target, codegen).mutate(s); + // Some of the rules above can introduce repeated sub-terms, so run CSE again. - stmt = common_subexpression_elimination(stmt); - return stmt; + if (!stmt.same_as(s)) { + stmt = common_subexpression_elimination(stmt); + return stmt; + } else { + return s; + } } #else // WITH_X86 -Stmt optimize_x86_instructions(Stmt s, const Target &t) { +Stmt optimize_x86_instructions(const Stmt &s, const Target &t) { user_error << "x86 not enabled for this build of Halide.\n"; return Stmt(); } diff --git a/src/X86Optimize.h b/src/X86Optimize.h index 9ab9d5f54269..9732a2dba545 100644 --- a/src/X86Optimize.h +++ b/src/X86Optimize.h @@ -13,7 +13,7 @@ namespace Halide { namespace Internal { /** Perform vector instruction selection, inserting VectorInstruction nodes. */ -Stmt optimize_x86_instructions(Stmt stmt, const Target &target, const CodeGen_LLVM *codegen); +Stmt optimize_x86_instructions(const Stmt &stmt, const Target &target, const CodeGen_LLVM *codegen); } // namespace Internal } // namespace Halide From b3b3551bd9bb23bf179f9daadc243cbfb20a4bcf Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Thu, 28 Jul 2022 13:52:12 -0400 Subject: [PATCH 36/55] fix case without WITH_X86 --- src/X86Optimize.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/X86Optimize.cpp b/src/X86Optimize.cpp index b5e77cf32c5b..a1f54ce3d2f3 100644 --- a/src/X86Optimize.cpp +++ b/src/X86Optimize.cpp @@ -577,7 +577,7 @@ Stmt optimize_x86_instructions(const Stmt &s, const Target &target, const CodeGe #else // WITH_X86 -Stmt optimize_x86_instructions(const Stmt &s, const Target &t) { +Stmt optimize_x86_instructions(const Stmt &s, const Target &t, const CodeGen_LLVM *codegen) { user_error << "x86 not enabled for this build of Halide.\n"; return Stmt(); } From 40f575ca93733cd18e48a56fe6ae68993405a9a8 Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Fri, 29 Jul 2022 00:29:46 -0400 Subject: [PATCH 37/55] fix x86 saturating_narrow pattern mistake --- src/X86Optimize.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/X86Optimize.cpp b/src/X86Optimize.cpp index b5e77cf32c5b..e845beade690 100644 --- a/src/X86Optimize.cpp +++ b/src/X86Optimize.cpp @@ -205,24 +205,24 @@ class Optimize_X86 : public InstructionSelector { // saturating_narrow is always supported (via SSE2) for: // int32 -> int16, int16 -> int8, int16 -> uint8 rewrite( - cast(Int(16, lanes), max(min(x, i32_i16min), i32_i16min)), + cast(Int(16, lanes), max(min(x, i32_i16max), i32_i16min)), v_instr(VectorInstruction::saturating_narrow, x), is_int(x, 32)) || rewrite( - cast(Int(8, lanes), max(min(x, i16_i8min), i16_i8min)), + cast(Int(8, lanes), max(min(x, i16_i8max), i16_i8min)), 
v_instr(VectorInstruction::saturating_narrow, x), is_int(x, 16)) || rewrite( - cast(UInt(8, lanes), max(min(x, i16_u8min), i16_u8min)), + cast(UInt(8, lanes), max(min(x, i16_u8max), i16_u8min)), v_instr(VectorInstruction::saturating_narrow, x), is_int(x, 16)) || // int32 -> uint16 is supported via SSE41 (target.has_feature(Target::SSE41) && rewrite( - cast(UInt(16, lanes), max(min(x, i32_u16min), i32_u16min)), + cast(UInt(16, lanes), max(min(x, i32_u16max), i32_u16min)), v_instr(VectorInstruction::saturating_narrow, x), is_int(x, 32))) || From 545fbe819423c24408e68367eba5f2116ec95ebe Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Tue, 2 Aug 2022 13:28:13 -0400 Subject: [PATCH 38/55] lower mod in InstructionSelector too --- src/CodeGen_LLVM.h | 10 +++++++--- src/InstructionSelector.cpp | 9 +++++++++ src/InstructionSelector.h | 5 +++-- 3 files changed, 19 insertions(+), 5 deletions(-) diff --git a/src/CodeGen_LLVM.h b/src/CodeGen_LLVM.h index 5ea221baa577..1fbb4328ac7c 100644 --- a/src/CodeGen_LLVM.h +++ b/src/CodeGen_LLVM.h @@ -45,6 +45,7 @@ class GlobalVariable; namespace Halide { struct ExternSignature; +class InstructionSelector; namespace Internal { @@ -507,9 +508,6 @@ class CodeGen_LLVM : public IRVisitor { * across backends. */ virtual void codegen_vector_reduce(const VectorReduce *op, const Expr &init); - // TODO: this probably shouldn't be public, or should be moved where the rest of - // the public methods are. -public: /** Split up a VectorReduce node if possible, or generate LLVM intrinsics for full reductions. This is used in `codegen_vector_reduce`. **/ @@ -604,6 +602,12 @@ class CodeGen_LLVM : public IRVisitor { * represents a unique struct type created by a closure or similar. */ std::map struct_type_recovery; + + /** Instruction selection uses `split_vector_reduce` and + * `upgrade_type_for_arithmetic`, so needs access to those + * methods. + */ + friend class InstructionSelector; }; } // namespace Internal diff --git a/src/InstructionSelector.cpp b/src/InstructionSelector.cpp index 35bb63a640a4..072d07aa4cf8 100644 --- a/src/InstructionSelector.cpp +++ b/src/InstructionSelector.cpp @@ -1,6 +1,7 @@ #include "InstructionSelector.h" #include "CodeGen_Internal.h" +#include "IROperator.h" namespace Halide { namespace Internal { @@ -17,6 +18,14 @@ Expr InstructionSelector::visit(const Div *op) { return IRGraphMutator::visit(op); } +Expr InstructionSelector::visit(const Mod *op) { + if (op->type.is_vector() && op->type.is_int_or_uint()) { + // Lower mod here in order to do pattern-matching on intrinsics. + return mutate(lower_int_uint_mod(op->a, op->b)); + } + return IRGraphMutator::visit(op); +} + Expr InstructionSelector::visit(const VectorReduce *op) { return mutate(codegen->split_vector_reduce(op, Expr())); } diff --git a/src/InstructionSelector.h b/src/InstructionSelector.h index d8e8d44e6761..bc7b1541374a 100644 --- a/src/InstructionSelector.h +++ b/src/InstructionSelector.h @@ -15,8 +15,8 @@ namespace Internal { /** A base class for vector instruction selection. * The default implementation lowers int and uint - * division via `lower_int_uint_div` and splits - * VectorReduce nodes via CodeGen_LLVM::split_vector_reduce(). + * div and mod, and splits VectorReduce nodes via + * CodeGen_LLVM::split_vector_reduce(). 
*/ class InstructionSelector : public IRGraphMutator { protected: @@ -25,6 +25,7 @@ class InstructionSelector : public IRGraphMutator { using IRGraphMutator::visit; Expr visit(const Div *) override; + Expr visit(const Mod *) override; Expr visit(const VectorReduce *) override; public: From cd0fe8acd5e575d90937b93c0a475be79674d2e5 Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Tue, 2 Aug 2022 13:31:19 -0400 Subject: [PATCH 39/55] clang format --- src/IRMatch.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/IRMatch.h b/src/IRMatch.h index 210688056066..34d237f0f3ea 100644 --- a/src/IRMatch.h +++ b/src/IRMatch.h @@ -1588,7 +1588,7 @@ auto bitwise_xor(A &&a, B &&b) noexcept -> Intrin -HALIDE_ALWAYS_INLINE auto operator^(A &&a, B &&b) noexcept -> auto { +HALIDE_ALWAYS_INLINE auto operator^(A &&a, B &&b) noexcept -> auto{ assert_is_lvalue_if_expr(); assert_is_lvalue_if_expr(); return bitwise_xor(a, b); @@ -1598,7 +1598,7 @@ auto bitwise_and(A &&a, B &&b) noexcept -> Intrin -HALIDE_ALWAYS_INLINE auto operator&(A &&a, B &&b) noexcept -> auto { +HALIDE_ALWAYS_INLINE auto operator&(A &&a, B &&b) noexcept -> auto{ assert_is_lvalue_if_expr(); assert_is_lvalue_if_expr(); return bitwise_and(a, b); @@ -1608,7 +1608,7 @@ auto bitwise_or(A &&a, B &&b) noexcept -> Intrin -HALIDE_ALWAYS_INLINE auto operator|(A &&a, B &&b) noexcept -> auto { +HALIDE_ALWAYS_INLINE auto operator|(A &&a, B &&b) noexcept -> auto{ assert_is_lvalue_if_expr(); assert_is_lvalue_if_expr(); return bitwise_or(a, b); From 6e67ddfe5331b3ae1c75bccf0146ede7dfe55abc Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Mon, 8 Aug 2022 15:23:52 -0400 Subject: [PATCH 40/55] implement pattern matching for SapphireRapids --- src/IRMatch.h | 52 ++++++++++++++++++++++++++++++++++++++++++--- src/X86Optimize.cpp | 50 ++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 96 insertions(+), 6 deletions(-) diff --git a/src/IRMatch.h b/src/IRMatch.h index 210688056066..183e0a1e617d 100644 --- a/src/IRMatch.h +++ b/src/IRMatch.h @@ -1588,7 +1588,7 @@ auto bitwise_xor(A &&a, B &&b) noexcept -> Intrin -HALIDE_ALWAYS_INLINE auto operator^(A &&a, B &&b) noexcept -> auto { +HALIDE_ALWAYS_INLINE auto operator^(A &&a, B &&b) noexcept -> auto{ assert_is_lvalue_if_expr(); assert_is_lvalue_if_expr(); return bitwise_xor(a, b); @@ -1598,7 +1598,7 @@ auto bitwise_and(A &&a, B &&b) noexcept -> Intrin -HALIDE_ALWAYS_INLINE auto operator&(A &&a, B &&b) noexcept -> auto { +HALIDE_ALWAYS_INLINE auto operator&(A &&a, B &&b) noexcept -> auto{ assert_is_lvalue_if_expr(); assert_is_lvalue_if_expr(); return bitwise_and(a, b); @@ -1608,7 +1608,7 @@ auto bitwise_or(A &&a, B &&b) noexcept -> Intrin -HALIDE_ALWAYS_INLINE auto operator|(A &&a, B &&b) noexcept -> auto { +HALIDE_ALWAYS_INLINE auto operator|(A &&a, B &&b) noexcept -> auto{ assert_is_lvalue_if_expr(); assert_is_lvalue_if_expr(); return bitwise_or(a, b); @@ -2501,6 +2501,52 @@ std::ostream &operator<<(std::ostream &s, const IsFloat &op) { return s; } +template +struct IsBFloat { + struct pattern_tag {}; + A a; + int bits; + int lanes; + + constexpr static uint32_t binds = bindings::mask; + + // This rule is a boolean-valued predicate. Bools have type UIntImm. 
+ constexpr static IRNodeType min_node_type = IRNodeType::UIntImm; + constexpr static IRNodeType max_node_type = IRNodeType::UIntImm; + constexpr static bool canonical = true; + + constexpr static bool foldable = true; + + HALIDE_ALWAYS_INLINE + void make_folded_const(halide_scalar_value_t &val, halide_type_t &ty, MatcherState &state) const { + // a is almost certainly a very simple pattern (e.g. a wild), so just inline the make method. + Type t = a.make(state, {}).type(); + val.u.u64 = t.is_bfloat() && (bits == 0 || t.bits() == bits) && (lanes == 0 || t.lanes() == lanes); + ty.code = halide_type_uint; + ty.bits = 1; + ty.lanes = t.lanes(); + } +}; + +template +HALIDE_ALWAYS_INLINE auto is_bfloat(A &&a, int bits = 0, int lanes = 0) noexcept -> IsBFloat { + assert_is_lvalue_if_expr(); + return {pattern_arg(a), bits, lanes}; +} + +template +std::ostream &operator<<(std::ostream &s, const IsBFloat &op) { + s << "is_bfloat(" << op.a; + if (op.bits > 0) { + s << ", " << op.bits; + } + if (op.lanes > 0) { + s << ", " << op.lanes; + } + s << ")"; + return s; +} + template struct IsInt { struct pattern_tag {}; diff --git a/src/X86Optimize.cpp b/src/X86Optimize.cpp index a1f54ce3d2f3..d81b8c66a2f7 100644 --- a/src/X86Optimize.cpp +++ b/src/X86Optimize.cpp @@ -95,8 +95,8 @@ class Optimize_X86 : public InstructionSelector { if ( // Only AVX512_SapphireRapids has accumulating dot products. target.has_feature(Target::AVX512_SapphireRapids) && - // FIXME: add the float16 -> float32 versions as well. - (op->type.element_of() == Int(32)) && + ((op->type.element_of() == Int(32)) || + (op->type.element_of() == Float(32))) && // Accumulating pmaddubsw (rewrite( @@ -130,6 +130,18 @@ class Optimize_X86 : public InstructionSelector { v_instr(VectorInstruction::dot_product, z, x, y), is_int(x, 16, lanes * 2) && is_int(y, 16, lanes * 2)) || + // Accumulating fp dot products. + // TODO(rootjalex): This would be more powerful with lossless_cast checking. + rewrite( + x + h_add(cast(Float(32, lanes * 4), y) * cast(Float(32, lanes * 4), z), lanes), + v_instr(VectorInstruction::dot_product, x, y, z), + is_bfloat(y, 16) && is_bfloat(z, 16)) || + + rewrite( + h_add(cast(Float(32, lanes * 4), x) * cast(Float(32, lanes * 4), y), lanes) + z, + v_instr(VectorInstruction::dot_product, z, x, y), + is_bfloat(x, 16) && is_bfloat(y, 16)) || + false)) { return mutate(rewrite.result); } @@ -414,7 +426,6 @@ class Optimize_X86 : public InstructionSelector { Call::widening_shift_right, Call::widening_sub, })) { - // TODO: Should we have a base-class that does this + the VectorReduce lowering needed below? return mutate(lower_intrinsic(op)); } @@ -487,6 +498,39 @@ class Optimize_X86 : public InstructionSelector { false)) || false)) || + // We can use the AVX512_SapphireRapids accumulating dot products + // on pure VectorReduce nodes with 0 as the accumulator. + ((factor == 4) && + target.has_feature(Target::AVX512_SapphireRapids) && + ((op->type.element_of() == Int(32)) || + (op->type.element_of() == Float(32))) && + + // Accumulating pmaddubsw + (rewrite( + h_add(cast(Int(32, lanes * 4), widening_mul(x, y)), lanes), + v_instr(VectorInstruction::dot_product, make_zero(Int(32, lanes)), x, y), + is_uint(x, 8) && is_int(y, 8)) || + + rewrite( + h_add(cast(Int(32, lanes * 4), widening_mul(x, y)), lanes), + v_instr(VectorInstruction::dot_product, make_zero(Int(32, lanes)), y, x), + is_int(x, 8) && is_uint(y, 8)) || + + // Accumulating pmaddwd. 
+ rewrite( + h_add(widening_mul(x, y), lanes), + v_instr(VectorInstruction::dot_product, make_zero(Int(32, lanes)), x, y), + is_int(x, 16, lanes * 2) && is_int(y, 16, lanes * 2)) || + + // Accumulating fp dot products. + // TODO(rootjalex): This would be more powerful with lossless_cast checking. + rewrite( + h_add(cast(Float(32, lanes * 4), x) * cast(Float(32, lanes * 4), y), lanes), + v_instr(VectorInstruction::dot_product, make_zero(Float(32, lanes)), x, y), + is_bfloat(x, 16) && is_bfloat(y, 16)) || + + false)) || + // psadbw is always supported via SSE2. ((factor == 8) && (rewrite( From 19b2c5efbf762328132963d7a4ed1bdad6991561 Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Mon, 8 Aug 2022 16:57:30 -0400 Subject: [PATCH 41/55] rm stray 'protected' --- src/CodeGen_LLVM.h | 1 - 1 file changed, 1 deletion(-) diff --git a/src/CodeGen_LLVM.h b/src/CodeGen_LLVM.h index 1fbb4328ac7c..14f021d83e67 100644 --- a/src/CodeGen_LLVM.h +++ b/src/CodeGen_LLVM.h @@ -513,7 +513,6 @@ class CodeGen_LLVM : public IRVisitor { `codegen_vector_reduce`. **/ virtual Expr split_vector_reduce(const VectorReduce *op, const Expr &init) const; -protected: /** Are we inside an atomic node that uses mutex locks? This is used for detecting deadlocks from nested atomics & illegal vectorization. */ bool inside_atomic_mutex_node; From a98f268c3e740d63b82dc7720fa8a12eb8e2aa37 Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Mon, 8 Aug 2022 17:34:38 -0400 Subject: [PATCH 42/55] update x86 saturating_cast rules using intrinsic --- src/X86Optimize.cpp | 48 ++++++++++++++++++++++----------------------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/src/X86Optimize.cpp b/src/X86Optimize.cpp index 6ddffc17833b..347366ea1b1f 100644 --- a/src/X86Optimize.cpp +++ b/src/X86Optimize.cpp @@ -214,30 +214,6 @@ class Optimize_X86 : public InstructionSelector { v_instr(VectorInstruction::pmulhrs, x, y), is_int(x, 16) && is_int(y, 16))) || - // saturating_narrow is always supported (via SSE2) for: - // int32 -> int16, int16 -> int8, int16 -> uint8 - rewrite( - cast(Int(16, lanes), max(min(x, i32_i16max), i32_i16min)), - v_instr(VectorInstruction::saturating_narrow, x), - is_int(x, 32)) || - - rewrite( - cast(Int(8, lanes), max(min(x, i16_i8max), i16_i8min)), - v_instr(VectorInstruction::saturating_narrow, x), - is_int(x, 16)) || - - rewrite( - cast(UInt(8, lanes), max(min(x, i16_u8max), i16_u8min)), - v_instr(VectorInstruction::saturating_narrow, x), - is_int(x, 16)) || - - // int32 -> uint16 is supported via SSE41 - (target.has_feature(Target::SSE41) && - rewrite( - cast(UInt(16, lanes), max(min(x, i32_u16max), i32_u16min)), - v_instr(VectorInstruction::saturating_narrow, x), - is_int(x, 32))) || - // f32_to_bf16 is supported only via Target::AVX512_SapphireRapids (target.has_feature(Target::AVX512_SapphireRapids) && rewrite( @@ -295,6 +271,30 @@ class Optimize_X86 : public InstructionSelector { auto y_uint = cast(unsigned_type, y); if ( + // saturating_narrow is always supported (via SSE2) for: + // int32 -> int16, int16 -> int8, int16 -> uint8 + rewrite( + saturating_cast(Int(16, lanes), x), + v_instr(VectorInstruction::saturating_narrow, x), + is_int(x, 32)) || + + rewrite( + saturating_cast(Int(8, lanes), x), + v_instr(VectorInstruction::saturating_narrow, x), + is_int(x, 16)) || + + rewrite( + saturating_cast(UInt(8, lanes), x), + v_instr(VectorInstruction::saturating_narrow, x), + is_int(x, 16)) || + + // int32 -> uint16 is supported via SSE41 + (target.has_feature(Target::SSE41) && + rewrite( + 
saturating_cast(UInt(16, lanes), x), + v_instr(VectorInstruction::saturating_narrow, x), + is_int(x, 32))) || + // We can redirect signed rounding halving add to unsigned rounding // halving add by adding 128 / 32768 to the result if the sign of the // args differs. From 22d17e7957ce0d30e1590357f56388a704ef9b27 Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Mon, 8 Aug 2022 17:55:30 -0400 Subject: [PATCH 43/55] fix namespace issue --- src/CodeGen_LLVM.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/CodeGen_LLVM.h b/src/CodeGen_LLVM.h index 14f021d83e67..9c5d767214f1 100644 --- a/src/CodeGen_LLVM.h +++ b/src/CodeGen_LLVM.h @@ -45,10 +45,11 @@ class GlobalVariable; namespace Halide { struct ExternSignature; -class InstructionSelector; namespace Internal { +class InstructionSelector; + /** A code generator abstract base class. Actual code generators * (e.g. CodeGen_X86) inherit from this. This class is responsible * for taking a Halide Stmt and producing llvm bitcode, machine From e2045bfd2d1fc59d46a00aed9669f05e68abb403 Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Mon, 22 Aug 2022 22:46:59 -0400 Subject: [PATCH 44/55] place Expr constants on the stack --- src/X86Optimize.cpp | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/src/X86Optimize.cpp b/src/X86Optimize.cpp index 347366ea1b1f..db07a59bdf1d 100644 --- a/src/X86Optimize.cpp +++ b/src/X86Optimize.cpp @@ -446,6 +446,13 @@ class Optimize_X86 : public InstructionSelector { const int factor = value_lanes / lanes; Expr value = op->value; + // Useful constants for some of the below rules. + Expr one_i16 = make_one(Int(16, value_lanes)); + Expr one_i8 = make_one(Int(8, value_lanes)); + Expr one_u8 = make_one(Int(8, value_lanes)); + Expr zero_i32 = make_zero(Int(32, lanes)); + Expr zero_f32 = make_zero(Float(32, lanes)); + switch (op->op) { case VectorReduce::Add: { auto rewrite = IRMatcher::rewriter(IRMatcher::h_add(value, lanes), op->type); @@ -462,7 +469,7 @@ class Optimize_X86 : public InstructionSelector { // Horizontal widening add via pmaddwd rewrite( h_add(cast(Int(32, value_lanes), x), lanes), - v_instr(VectorInstruction::dot_product, x, make_const(Int(16, value_lanes), 1)), + v_instr(VectorInstruction::dot_product, x, one_i16), is_int(x, 16)) || (rewrite( @@ -475,17 +482,17 @@ class Optimize_X86 : public InstructionSelector { // Horizontal widening adds using 2-way saturating dot products. (rewrite( h_add(cast(UInt(16, value_lanes), x), lanes), - cast(UInt(16, lanes), typed(Int(16, lanes), v_instr(VectorInstruction::saturating_dot_product, x, make_const(Int(8, value_lanes), 1)))), + cast(UInt(16, lanes), typed(Int(16, lanes), v_instr(VectorInstruction::saturating_dot_product, x, one_i8))), is_uint(x, 8)) || rewrite( h_add(cast(Int(16, value_lanes), x), lanes), - v_instr(VectorInstruction::saturating_dot_product, x, make_const(Int(8, value_lanes), 1)), + v_instr(VectorInstruction::saturating_dot_product, x, one_i8), is_uint(x, 8)) || rewrite( h_add(cast(Int(16, value_lanes), x), lanes), - v_instr(VectorInstruction::saturating_dot_product, make_const(UInt(8, value_lanes), 1), x), + v_instr(VectorInstruction::saturating_dot_product, one_u8, x), is_int(x, 8)) || // SSE41 and AVX2 support horizontal_add via phadd intrinsics. 
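Why this hoisting matters: IRMatcher patterns capture Exprs by reference (SpecificExpr holds a reference, and a later patch in this series adds assert_is_lvalue_if_expr checks to enforce it), so an Expr fed into a rule should be a named lvalue rather than a temporary, and naming the constant also avoids rebuilding the same broadcast node every time the rule chain is evaluated. Roughly, as a simplified before/after (not verbatim from the branch):

    // Before: constructs a fresh Expr each time the chain is evaluated,
    // and passes a temporary where the matcher wants an lvalue.
    v_instr(VectorInstruction::dot_product, x, make_const(Int(16, value_lanes), 1))

    // After: one named constant on the stack, reused by every rule.
    const Expr one_i16 = make_one(Int(16, value_lanes));
    v_instr(VectorInstruction::dot_product, x, one_i16)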
@@ -508,25 +515,25 @@ class Optimize_X86 : public InstructionSelector { // Accumulating pmaddubsw (rewrite( h_add(cast(Int(32, lanes * 4), widening_mul(x, y)), lanes), - v_instr(VectorInstruction::dot_product, make_zero(Int(32, lanes)), x, y), + v_instr(VectorInstruction::dot_product, zero_i32, x, y), is_uint(x, 8) && is_int(y, 8)) || rewrite( h_add(cast(Int(32, lanes * 4), widening_mul(x, y)), lanes), - v_instr(VectorInstruction::dot_product, make_zero(Int(32, lanes)), y, x), + v_instr(VectorInstruction::dot_product, zero_i32, y, x), is_int(x, 8) && is_uint(y, 8)) || // Accumulating pmaddwd. rewrite( h_add(widening_mul(x, y), lanes), - v_instr(VectorInstruction::dot_product, make_zero(Int(32, lanes)), x, y), + v_instr(VectorInstruction::dot_product, zero_i32, x, y), is_int(x, 16, lanes * 2) && is_int(y, 16, lanes * 2)) || // Accumulating fp dot products. // TODO(rootjalex): This would be more powerful with lossless_cast checking. rewrite( h_add(cast(Float(32, lanes * 4), x) * cast(Float(32, lanes * 4), y), lanes), - v_instr(VectorInstruction::dot_product, make_zero(Float(32, lanes)), x, y), + v_instr(VectorInstruction::dot_product, zero_f32, x, y), is_bfloat(x, 16) && is_bfloat(y, 16)) || false)) || From dc4d1f744e2607a3e0ba3e0ce3c7bbc83c9fc377 Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Mon, 22 Aug 2022 23:59:14 -0400 Subject: [PATCH 45/55] i8 -> u8 bugfix --- src/X86Optimize.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/X86Optimize.cpp b/src/X86Optimize.cpp index db07a59bdf1d..679f3b6fd72a 100644 --- a/src/X86Optimize.cpp +++ b/src/X86Optimize.cpp @@ -449,7 +449,7 @@ class Optimize_X86 : public InstructionSelector { // Useful constants for some of the below rules. Expr one_i16 = make_one(Int(16, value_lanes)); Expr one_i8 = make_one(Int(8, value_lanes)); - Expr one_u8 = make_one(Int(8, value_lanes)); + Expr one_u8 = make_one(UInt(8, value_lanes)); Expr zero_i32 = make_zero(Int(32, lanes)); Expr zero_f32 = make_zero(Float(32, lanes)); From 9a5327c6243d8c53b1214150a2583eaceba44b98 Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Wed, 24 Aug 2022 12:06:47 -0400 Subject: [PATCH 46/55] add better type checking in IRMatch for SpecificExpr cases --- src/IRMatch.h | 53 ++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 50 insertions(+), 3 deletions(-) diff --git a/src/IRMatch.h b/src/IRMatch.h index c50be51b76c9..1719628ec8a7 100644 --- a/src/IRMatch.h +++ b/src/IRMatch.h @@ -585,8 +585,13 @@ IntLiteral pattern_arg(int64_t x) { } template -HALIDE_ALWAYS_INLINE void assert_is_lvalue_if_expr() { - static_assert(!std::is_same::type, Expr>::value || std::is_lvalue_reference::value, +static constexpr bool is_lvalue_if_expr() { + return !std::is_same::type, Expr>::value || std::is_lvalue_reference::value; +} + +template +HALIDE_ALWAYS_INLINE static constexpr void assert_is_lvalue_if_expr() { + static_assert(is_lvalue_if_expr(), "Exprs are captured by reference by IRMatcher objects and so must be lvalues"); } @@ -1537,68 +1542,98 @@ HALIDE_ALWAYS_INLINE auto intrin(Call::IntrinsicOp intrinsic_op, Args... 
args) n template auto abs(A &&a) noexcept -> Intrin { + assert_is_lvalue_if_expr(); return {Call::abs, pattern_arg(a)}; } template auto absd(A &&a, B &&b) noexcept -> Intrin { + assert_is_lvalue_if_expr(); + assert_is_lvalue_if_expr(); return {Call::absd, pattern_arg(a), pattern_arg(b)}; } template auto widening_add(A &&a, B &&b) noexcept -> Intrin { + assert_is_lvalue_if_expr(); + assert_is_lvalue_if_expr(); return {Call::widening_add, pattern_arg(a), pattern_arg(b)}; } template auto widening_sub(A &&a, B &&b) noexcept -> Intrin { + assert_is_lvalue_if_expr(); + assert_is_lvalue_if_expr(); return {Call::widening_sub, pattern_arg(a), pattern_arg(b)}; } template auto widening_mul(A &&a, B &&b) noexcept -> Intrin { + assert_is_lvalue_if_expr(); + assert_is_lvalue_if_expr(); return {Call::widening_mul, pattern_arg(a), pattern_arg(b)}; } template auto saturating_add(A &&a, B &&b) noexcept -> Intrin { + assert_is_lvalue_if_expr(); + assert_is_lvalue_if_expr(); return {Call::saturating_add, pattern_arg(a), pattern_arg(b)}; } template auto saturating_sub(A &&a, B &&b) noexcept -> Intrin { + assert_is_lvalue_if_expr(); + assert_is_lvalue_if_expr(); return {Call::saturating_sub, pattern_arg(a), pattern_arg(b)}; } template auto saturating_cast(const Type &t, A &&a) noexcept -> Intrin { + assert_is_lvalue_if_expr(); Intrin p = {Call::saturating_cast, pattern_arg(a)}; p.optional_type_hint = t; return p; } template auto halving_add(A &&a, B &&b) noexcept -> Intrin { + assert_is_lvalue_if_expr(); + assert_is_lvalue_if_expr(); return {Call::halving_add, pattern_arg(a), pattern_arg(b)}; } template auto halving_sub(A &&a, B &&b) noexcept -> Intrin { + assert_is_lvalue_if_expr(); + assert_is_lvalue_if_expr(); return {Call::halving_sub, pattern_arg(a), pattern_arg(b)}; } template auto rounding_halving_add(A &&a, B &&b) noexcept -> Intrin { + assert_is_lvalue_if_expr(); + assert_is_lvalue_if_expr(); return {Call::rounding_halving_add, pattern_arg(a), pattern_arg(b)}; } template auto shift_left(A &&a, B &&b) noexcept -> Intrin { + assert_is_lvalue_if_expr(); + assert_is_lvalue_if_expr(); return {Call::shift_left, pattern_arg(a), pattern_arg(b)}; } template auto shift_right(A &&a, B &&b) noexcept -> Intrin { + assert_is_lvalue_if_expr(); + assert_is_lvalue_if_expr(); return {Call::shift_right, pattern_arg(a), pattern_arg(b)}; } template auto rounding_shift_left(A &&a, B &&b) noexcept -> Intrin { + assert_is_lvalue_if_expr(); + assert_is_lvalue_if_expr(); return {Call::rounding_shift_left, pattern_arg(a), pattern_arg(b)}; } template auto rounding_shift_right(A &&a, B &&b) noexcept -> Intrin { + assert_is_lvalue_if_expr(); + assert_is_lvalue_if_expr(); return {Call::rounding_shift_right, pattern_arg(a), pattern_arg(b)}; } template auto bitwise_xor(A &&a, B &&b) noexcept -> Intrin { + assert_is_lvalue_if_expr(); + assert_is_lvalue_if_expr(); return {Call::bitwise_xor, pattern_arg(a), pattern_arg(b)}; } template @@ -1609,6 +1644,8 @@ HALIDE_ALWAYS_INLINE auto operator^(A &&a, B &&b) noexcept -> auto{ } template auto bitwise_and(A &&a, B &&b) noexcept -> Intrin { + assert_is_lvalue_if_expr(); + assert_is_lvalue_if_expr(); return {Call::bitwise_and, pattern_arg(a), pattern_arg(b)}; } template @@ -1619,6 +1656,8 @@ HALIDE_ALWAYS_INLINE auto operator&(A &&a, B &&b) noexcept -> auto{ } template auto bitwise_or(A &&a, B &&b) noexcept -> Intrin { + assert_is_lvalue_if_expr(); + assert_is_lvalue_if_expr(); return {Call::bitwise_or, pattern_arg(a), pattern_arg(b)}; } template @@ -1629,10 +1668,16 @@ HALIDE_ALWAYS_INLINE 
auto operator|(A &&a, B &&b) noexcept -> auto{ } template auto mul_shift_right(A &&a, B &&b, C &&c) noexcept -> Intrin { + assert_is_lvalue_if_expr(); + assert_is_lvalue_if_expr(); + assert_is_lvalue_if_expr(); return {Call::mul_shift_right, pattern_arg(a), pattern_arg(b), pattern_arg(c)}; } template auto rounding_mul_shift_right(A &&a, B &&b, C &&c) noexcept -> Intrin { + assert_is_lvalue_if_expr(); + assert_is_lvalue_if_expr(); + assert_is_lvalue_if_expr(); return {Call::rounding_mul_shift_right, pattern_arg(a), pattern_arg(b), pattern_arg(c)}; } @@ -1828,6 +1873,7 @@ inline std::ostream &operator<<(std::ostream &s, const BroadcastOp &op) { template HALIDE_ALWAYS_INLINE auto broadcast(A &&a, B lanes) noexcept -> BroadcastOp { assert_is_lvalue_if_expr(); + assert_is_lvalue_if_expr(); return {pattern_arg(a), pattern_arg(lanes)}; } @@ -1981,7 +2027,7 @@ struct VectorInstructionOp { VectorInstructionOp(const VectorInstruction::InstructionOp _op, Args... args) noexcept : op(_op), args(args...) { static_assert(sizeof...(Args) > 0 && sizeof...(Args) <= 3, - "VectorInstructionOp must have non-zero arguments, and update make() if more than 3 arguments."); + "VectorInstructionOp must have non-zero arguments, and update make() if more than 3 arguments."); } }; @@ -1997,6 +2043,7 @@ std::ostream &operator<<(std::ostream &s, const VectorInstructionOp &op template HALIDE_ALWAYS_INLINE auto v_instr(const VectorInstruction::InstructionOp op, Args... args) noexcept -> VectorInstructionOp { + static_assert(and_reduce((is_lvalue_if_expr())...), "All parameters to a VectorInstructionOp must be lvalues if Exprs"); return {op, pattern_arg(args)...}; } From 292d8e582c217d9a226b061e577e0c427de667d1 Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Wed, 24 Aug 2022 12:10:53 -0400 Subject: [PATCH 47/55] clang format --- src/IRMatch.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/IRMatch.h b/src/IRMatch.h index 1719628ec8a7..8b69d84d8621 100644 --- a/src/IRMatch.h +++ b/src/IRMatch.h @@ -2027,7 +2027,7 @@ struct VectorInstructionOp { VectorInstructionOp(const VectorInstruction::InstructionOp _op, Args... args) noexcept : op(_op), args(args...) { static_assert(sizeof...(Args) > 0 && sizeof...(Args) <= 3, - "VectorInstructionOp must have non-zero arguments, and update make() if more than 3 arguments."); + "VectorInstructionOp must have non-zero arguments, and update make() if more than 3 arguments."); } }; From 3b0dc43f173c0b9daa0b41444e3584b545ad64e1 Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Wed, 24 Aug 2022 12:35:35 -0400 Subject: [PATCH 48/55] missing && --- src/IRMatch.h | 2 +- src/X86Optimize.cpp | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/IRMatch.h b/src/IRMatch.h index 8b69d84d8621..3574aa432416 100644 --- a/src/IRMatch.h +++ b/src/IRMatch.h @@ -2042,7 +2042,7 @@ std::ostream &operator<<(std::ostream &s, const VectorInstructionOp &op } template -HALIDE_ALWAYS_INLINE auto v_instr(const VectorInstruction::InstructionOp op, Args... args) noexcept -> VectorInstructionOp { +HALIDE_ALWAYS_INLINE auto v_instr(const VectorInstruction::InstructionOp op, Args&&... 
args) noexcept -> VectorInstructionOp { static_assert(and_reduce((is_lvalue_if_expr())...), "All parameters to a VectorInstructionOp must be lvalues if Exprs"); return {op, pattern_arg(args)...}; } diff --git a/src/X86Optimize.cpp b/src/X86Optimize.cpp index 679f3b6fd72a..f4bd53c2e868 100644 --- a/src/X86Optimize.cpp +++ b/src/X86Optimize.cpp @@ -447,11 +447,11 @@ class Optimize_X86 : public InstructionSelector { Expr value = op->value; // Useful constants for some of the below rules. - Expr one_i16 = make_one(Int(16, value_lanes)); - Expr one_i8 = make_one(Int(8, value_lanes)); - Expr one_u8 = make_one(UInt(8, value_lanes)); - Expr zero_i32 = make_zero(Int(32, lanes)); - Expr zero_f32 = make_zero(Float(32, lanes)); + const Expr one_i16 = make_one(Int(16, value_lanes)); + const Expr one_i8 = make_one(Int(8, value_lanes)); + const Expr one_u8 = make_one(UInt(8, value_lanes)); + const Expr zero_i32 = make_zero(Int(32, lanes)); + const Expr zero_f32 = make_zero(Float(32, lanes)); switch (op->op) { case VectorReduce::Add: { From 1eb0e94fac9c502fd443ff1f35e788dfb1dfd69a Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Wed, 24 Aug 2022 12:49:48 -0400 Subject: [PATCH 49/55] clang format --- src/IRMatch.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/IRMatch.h b/src/IRMatch.h index 3574aa432416..876fbb0aa1b4 100644 --- a/src/IRMatch.h +++ b/src/IRMatch.h @@ -2042,7 +2042,7 @@ std::ostream &operator<<(std::ostream &s, const VectorInstructionOp &op } template -HALIDE_ALWAYS_INLINE auto v_instr(const VectorInstruction::InstructionOp op, Args&&... args) noexcept -> VectorInstructionOp { +HALIDE_ALWAYS_INLINE auto v_instr(const VectorInstruction::InstructionOp op, Args &&...args) noexcept -> VectorInstructionOp { static_assert(and_reduce((is_lvalue_if_expr())...), "All parameters to a VectorInstructionOp must be lvalues if Exprs"); return {op, pattern_arg(args)...}; } From f6eb2bf1d133a28d75a96516d66ae4d9bb0a3aa4 Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Wed, 24 Aug 2022 13:02:38 -0400 Subject: [PATCH 50/55] update SpecificExpr comment + remove dangling TODO comments --- src/IRMatch.h | 2 ++ src/IRMutator.cpp | 2 -- src/IRVisitor.cpp | 2 -- src/X86Optimize.cpp | 16 +--------------- 4 files changed, 3 insertions(+), 19 deletions(-) diff --git a/src/IRMatch.h b/src/IRMatch.h index 876fbb0aa1b4..097aecb89bd2 100644 --- a/src/IRMatch.h +++ b/src/IRMatch.h @@ -211,6 +211,8 @@ struct SpecificExpr { constexpr static IRNodeType max_node_type = IRNodeType::Shuffle; constexpr static bool canonical = true; + // Having SpecificExpr hold an Expr instead of a BaseExprNode reference + // is catastrophic for performance and stack space usage. 
const BaseExprNode &expr; template diff --git a/src/IRMutator.cpp b/src/IRMutator.cpp index 7b05b46ac22f..b1703a6cccd1 100644 --- a/src/IRMutator.cpp +++ b/src/IRMutator.cpp @@ -328,13 +328,11 @@ Expr IRMutator::visit(const Shuffle *op) { } Expr IRMutator::visit(const VectorInstruction *op) { - // internal_error << "Always implement VectorInstruction visitor for IRMutator subclass\n"; auto [new_args, changed] = mutate_with_changes(op->args); if (!changed) { return op; } return VectorInstruction::make(op->type, op->op, new_args); - // return Expr(); } Expr IRMutator::visit(const VectorReduce *op) { diff --git a/src/IRVisitor.cpp b/src/IRVisitor.cpp index 2332d7bcc2e3..97c55d8075ac 100644 --- a/src/IRVisitor.cpp +++ b/src/IRVisitor.cpp @@ -258,7 +258,6 @@ void IRVisitor::visit(const Shuffle *op) { } void IRVisitor::visit(const VectorInstruction *op) { - // internal_error << "Always implement VectorInstruction visitor for IRVisitor subclass\n"; for (const auto &arg : op->args) { arg.accept(this); } @@ -523,7 +522,6 @@ void IRGraphVisitor::visit(const Shuffle *op) { } void IRGraphVisitor::visit(const VectorInstruction *op) { - // internal_error << "Always implement VectorInstruction visitor for IRGraphVisitor subclass\n"; for (const auto &arg : op->args) { include(arg); } diff --git a/src/X86Optimize.cpp b/src/X86Optimize.cpp index f4bd53c2e868..7a27ac9a92e6 100644 --- a/src/X86Optimize.cpp +++ b/src/X86Optimize.cpp @@ -196,16 +196,6 @@ class Optimize_X86 : public InstructionSelector { auto rewrite = IRMatcher::rewriter(IRMatcher::cast(op->type, op->value), op->type); - // TODO: saturating casts should be intrinsics, and supported in IRMatch.h. - const Expr i32_i16max = cast(Int(32, lanes), Int(16).max()); - const Expr i32_i16min = cast(Int(32, lanes), Int(16).min()); - const Expr i16_i8max = cast(Int(16, lanes), Int(8).max()); - const Expr i16_i8min = cast(Int(16, lanes), Int(8).min()); - const Expr i16_u8max = cast(Int(16, lanes), UInt(8).max()); - const Expr i16_u8min = cast(Int(16, lanes), UInt(8).min()); - const Expr i32_u16max = cast(Int(32, lanes), UInt(16).max()); - const Expr i32_u16min = cast(Int(32, lanes), UInt(16).min()); - if ( // pmulhrs is supported via AVX2 and SSE41, so SSE41 is the LCD. (target.has_feature(Target::SSE41) && @@ -235,7 +225,7 @@ class Optimize_X86 : public InstructionSelector { return IRGraphMutator::visit(op); } - // TODO: This optimization is hard to do via a rewrite-rule because of lossless_cast. + // TODO(rootjalex): This optimization is hard to do via a rewrite-rule because of lossless_cast. // A 16-bit mul-shift-right of less than 16 can sometimes be rounded up to a // full 16 to use pmulh(u)w by left-shifting one of the operands. This is @@ -337,10 +327,6 @@ class Optimize_X86 : public InstructionSelector { typed(Int(16, lanes), 32767), v_instr(VectorInstruction::pmulhrs, x, y)))) || - // TODO(rootjalex): The following intrinsics are - // simply one-to-one mappings, should they even - // be handled here? - // int(8 | 16 | 32) -> uint is supported via SSE41 // float32 is always supported (via SSE2). 
(((target.has_feature(Target::SSE41) && op->type.is_int() && bits <= 32) || From 95e5070742e7da2246681bfd23e4123afd6394fa Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Mon, 12 Sep 2022 16:57:20 -0700 Subject: [PATCH 51/55] fix signed absd lowering on x86 --- src/X86Optimize.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/X86Optimize.cpp b/src/X86Optimize.cpp index 7a27ac9a92e6..192cb48879e5 100644 --- a/src/X86Optimize.cpp +++ b/src/X86Optimize.cpp @@ -308,7 +308,8 @@ class Optimize_X86 : public InstructionSelector { // Current best way to lower absd on x86. rewrite( absd(x, y), - max(x, y) - min(x, y), + // Cast is a no-op reinterpret. + cast(op->type, max(x, y) - min(x, y)), is_int(x) && is_int(y)) || // pmulh is always supported (via SSE2). From 7aaf9a7802aad3b155efef813f89a01a8a351ef8 Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Mon, 12 Sep 2022 17:00:40 -0700 Subject: [PATCH 52/55] add type assertion to Optimize_X86::mutate --- src/X86Optimize.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/X86Optimize.cpp b/src/X86Optimize.cpp index 192cb48879e5..ae5a4424679b 100644 --- a/src/X86Optimize.cpp +++ b/src/X86Optimize.cpp @@ -68,6 +68,13 @@ class Optimize_X86 : public InstructionSelector { : InstructionSelector(target, codegen) { } + using IRGraphMutator::mutate; + Expr mutate(const Expr &e) override { + Expr expr = IRGraphMutator::mutate(e); + internal_assert(expr.type() == e.type()) << "(X86Optimize) Found type mismatch: " << e << " -> " << expr << "\n"; + return expr; + } + protected: bool should_peephole_optimize(const Type &type) { // We only have peephole optimizations for vectors here. From b4e2e42158f8a3427e45d101e8320b9feae30096 Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Mon, 12 Sep 2022 17:21:37 -0700 Subject: [PATCH 53/55] use shuffle for deinterleave on VectorInstruction --- src/Deinterleave.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/Deinterleave.cpp b/src/Deinterleave.cpp index 1ad291736ff7..c8ea9c4d1831 100644 --- a/src/Deinterleave.cpp +++ b/src/Deinterleave.cpp @@ -196,9 +196,8 @@ class Deinterleaver : public IRGraphMutator { using IRMutator::visit; Expr visit(const VectorInstruction *op) override { - internal_error << "Deinterleaver should never receive VectorInstruction node, received:\n" - << Expr(op) << "\n"; - return Expr(); + // We can't do anything special here. + return give_up_and_shuffle(op); } Expr visit(const VectorReduce *op) override { From 15e1a8c9cedcff9756fda8cf26eeb09090025566 Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Mon, 12 Sep 2022 22:52:40 -0700 Subject: [PATCH 54/55] do not try to extract when a vector is a simple extract_element --- src/Simplify_Shuffle.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/Simplify_Shuffle.cpp b/src/Simplify_Shuffle.cpp index 35622aee9c4e..e8a8aae634ce 100644 --- a/src/Simplify_Shuffle.cpp +++ b/src/Simplify_Shuffle.cpp @@ -9,6 +9,10 @@ using std::vector; Expr Simplify::visit(const Shuffle *op, ExprInfo *bounds) { if (op->is_extract_element()) { + if (op->vectors.size() == 1) { + // We cannot simplify this further. 
+ return op; + } int index = op->indices[0]; internal_assert(index >= 0); for (const Expr &vector : op->vectors) { From cc2fac7e21ac67cf4115112d8e1eeec99b4ccb1c Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Tue, 13 Sep 2022 11:11:04 -0700 Subject: [PATCH 55/55] don't call 'simplify' in deinterleave on extract_lane --- src/Deinterleave.cpp | 6 ++++++ src/Simplify_Shuffle.cpp | 4 ---- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/Deinterleave.cpp b/src/Deinterleave.cpp index c8ea9c4d1831..3f380be535c3 100644 --- a/src/Deinterleave.cpp +++ b/src/Deinterleave.cpp @@ -407,6 +407,12 @@ Expr deinterleave(Expr e, int starting_lane, int lane_stride, int new_lanes, con Deinterleaver d(starting_lane, lane_stride, new_lanes, lets); e = d.mutate(e); e = common_subexpression_elimination(e); + if (const Shuffle *shuffle = e.as()) { + if (shuffle->is_extract_element() && shuffle->vectors.size() == 1) { + // calling `simplify` here will produce an infinite recursive loop. + return e; + } + } return simplify(e); } } // namespace diff --git a/src/Simplify_Shuffle.cpp b/src/Simplify_Shuffle.cpp index e8a8aae634ce..35622aee9c4e 100644 --- a/src/Simplify_Shuffle.cpp +++ b/src/Simplify_Shuffle.cpp @@ -9,10 +9,6 @@ using std::vector; Expr Simplify::visit(const Shuffle *op, ExprInfo *bounds) { if (op->is_extract_element()) { - if (op->vectors.size() == 1) { - // We cannot simplify this further. - return op; - } int index = op->indices[0]; internal_assert(index >= 0); for (const Expr &vector : op->vectors) {
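Taken as a whole, the series turns x86 instruction selection into an IR-to-IR pass: optimize_x86_instructions() runs the Optimize_X86 rewriter (followed by CSE when anything changed), and codegen then only has to translate the surviving VectorInstruction nodes. The backend visitor itself is not part of these patches, so the following is a hypothetical sketch of the consuming side, not code from the branch; call_overloaded_intrin is an existing CodeGen_LLVM helper, but the intrinsic name and the case list here are assumptions:

    // Hypothetical: how CodeGen_X86 might lower a VectorInstruction node.
    void CodeGen_X86::visit(const VectorInstruction *op) {
        switch (op->op) {
        case VectorInstruction::pmulhrs:
            // The pattern predicates (e.g. is_int(x, 16)) already vetted the
            // argument types at selection time, so this maps one-to-one.
            value = call_overloaded_intrin(op->type, "pmulhr", op->args);
            break;
        default:
            internal_error << "Unhandled VectorInstruction: " << Expr(op) << "\n";
        }
    }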