From 85e8c692ce688c81283c79d2c668a2b8e09c2e7c Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Mon, 25 Jul 2022 13:14:59 -0400 Subject: [PATCH 01/55] create VectorIntrinsic node --- src/Bounds.cpp | 5 +++++ src/CodeGen_LLVM.cpp | 4 ++++ src/CodeGen_LLVM.h | 1 + src/Expr.h | 1 + src/IR.cpp | 19 +++++++++++++++++++ src/IR.h | 9 +++++++++ src/IREquality.cpp | 8 ++++++++ src/IRMutator.cpp | 8 ++++++++ src/IRMutator.h | 1 + src/IRPrinter.cpp | 10 ++++++++++ src/IRPrinter.h | 1 + src/IRVisitor.cpp | 12 ++++++++++++ src/IRVisitor.h | 5 +++++ src/Simplify_Exprs.cpp | 5 +++++ src/Simplify_Internal.h | 1 + src/StmtToHtml.cpp | 7 +++++++ 16 files changed, 97 insertions(+) diff --git a/src/Bounds.cpp b/src/Bounds.cpp index eb2f9138268b..107bd04185c4 100644 --- a/src/Bounds.cpp +++ b/src/Bounds.cpp @@ -1110,6 +1110,11 @@ class Bounds : public IRVisitor { op->value.accept(this); } + void visit(const VectorIntrinsic *op) override { + // TODO(rootjalex): we may need to implement bounds queries. + internal_error << "Unexpected VectorIntrinsic in bounds query: " << Expr(op) << "\n"; + } + void visit(const Call *op) override { TRACK_BOUNDS_INTERVAL; TRACK_BOUNDS_INFO("name:", op->name); diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp index 938d7a0e23b6..4990ba6e78c1 100644 --- a/src/CodeGen_LLVM.cpp +++ b/src/CodeGen_LLVM.cpp @@ -4000,6 +4000,10 @@ void CodeGen_LLVM::visit(const Shuffle *op) { } } +void CodeGen_LLVM::visit(const VectorIntrinsic *op) { + internal_error << "CodeGen_LLVM received VectorIntrinsic node, should be handled by architecture-specific CodeGen class:\n" << Expr(op) << "\n"; +} + void CodeGen_LLVM::visit(const VectorReduce *op) { codegen_vector_reduce(op, Expr()); } diff --git a/src/CodeGen_LLVM.h b/src/CodeGen_LLVM.h index 5982aa1672fd..7c5078428431 100644 --- a/src/CodeGen_LLVM.h +++ b/src/CodeGen_LLVM.h @@ -362,6 +362,7 @@ class CodeGen_LLVM : public IRVisitor { void visit(const IfThenElse *) override; void visit(const Evaluate *) override; void visit(const Shuffle *) override; + void visit(const VectorIntrinsic *) override; void visit(const VectorReduce *) override; void visit(const Prefetch *) override; void visit(const Atomic *) override; diff --git a/src/Expr.h b/src/Expr.h index ac0ec6521d68..aaab7dac23be 100644 --- a/src/Expr.h +++ b/src/Expr.h @@ -57,6 +57,7 @@ enum class IRNodeType { Call, Let, Shuffle, + VectorIntrinsic, VectorReduce, // Stmts LetStmt, diff --git a/src/IR.cpp b/src/IR.cpp index 740234b8e31f..776a8da806ee 100644 --- a/src/IR.cpp +++ b/src/IR.cpp @@ -901,6 +901,17 @@ Stmt Atomic::make(const std::string &producer_name, return node; } +Expr VectorIntrinsic::make(Type type, const std::string &name, const std::vector<Expr> &args) { + user_assert(!name.empty()) << "VectorIntrinsic without a name\n"; + user_assert(!args.empty()) << "VectorIntrinsic without arguments\n"; + + VectorIntrinsic *node = new VectorIntrinsic; + node->type = type; + node->name = name; + node->args = args; + return node; +} + Expr VectorReduce::make(VectorReduce::Operator op, Expr vec, int lanes) { @@ -1081,6 +1092,10 @@ void ExprNode<Shuffle>::accept(IRVisitor *v) const { v->visit((const Shuffle *)this); } template<> +void ExprNode<VectorIntrinsic>::accept(IRVisitor *v) const { + v->visit((const VectorIntrinsic *)this); +} +template<> void ExprNode<VectorReduce>::accept(IRVisitor *v) const { v->visit((const VectorReduce *)this); } @@ -1270,6 +1285,10 @@ Expr ExprNode<Shuffle>::mutate_expr(IRMutator *v) const { return v->visit((const Shuffle *)this); } template<> +Expr ExprNode<VectorIntrinsic>::mutate_expr(IRMutator *v) const { return 
v->visit((const VectorIntrinsic *)this); +} +template<> Expr ExprNode<VectorReduce>::mutate_expr(IRMutator *v) const { return v->visit((const VectorReduce *)this); } diff --git a/src/IR.h b/src/IR.h index c6085614b59d..d732d1a43ea7 100644 --- a/src/IR.h +++ b/src/IR.h @@ -886,6 +886,15 @@ struct Atomic : public StmtNode<Atomic> { static const IRNodeType _node_type = IRNodeType::Atomic; }; +struct VectorIntrinsic : public ExprNode<VectorIntrinsic> { + std::string name; + std::vector<Expr> args; + + static Expr make(Type type, const std::string &name, const std::vector<Expr> &args); + + static const IRNodeType _node_type = IRNodeType::VectorIntrinsic; +}; + /** Horizontally reduce a vector to a scalar or narrower vector using * the given commutative and associative binary operator. The reduction * factor is dictated by the number of lanes in the input and output diff --git a/src/IREquality.cpp b/src/IREquality.cpp index 20cb616d2c32..15a9bc01cbbb 100644 --- a/src/IREquality.cpp +++ b/src/IREquality.cpp @@ -98,6 +98,7 @@ class IRComparer : public IRVisitor { void visit(const Shuffle *) override; void visit(const Prefetch *) override; void visit(const Atomic *) override; + void visit(const VectorIntrinsic *) override; void visit(const VectorReduce *) override; }; @@ -629,6 +630,13 @@ void IRComparer::visit(const Atomic *op) { compare_stmt(s->body, op->body); } +void IRComparer::visit(const VectorIntrinsic *op) { + const VectorIntrinsic *e = expr.as<VectorIntrinsic>(); + + compare_names(e->name, op->name); + compare_expr_vector(e->args, op->args); +} + void IRComparer::visit(const VectorReduce *op) { const VectorReduce *e = expr.as<VectorReduce>(); diff --git a/src/IRMutator.cpp b/src/IRMutator.cpp index 005937a17008..e075897d6694 100644 --- a/src/IRMutator.cpp +++ b/src/IRMutator.cpp @@ -327,6 +327,14 @@ Expr IRMutator::visit(const Shuffle *op) { return Shuffle::make(new_vectors, op->indices); } +Expr IRMutator::visit(const VectorIntrinsic *op) { + auto [new_args, changed] = mutate_with_changes(op->args); + if (!changed) { + return op; + } + return VectorIntrinsic::make(op->type, op->name, new_args); +} + Expr IRMutator::visit(const VectorReduce *op) { Expr value = mutate(op->value); if (value.same_as(op->value)) { diff --git a/src/IRMutator.h b/src/IRMutator.h index c7a1984269d3..e460b036b80f 100644 --- a/src/IRMutator.h +++ b/src/IRMutator.h @@ -81,6 +81,7 @@ class IRMutator { virtual Expr visit(const Call *); virtual Expr visit(const Let *); virtual Expr visit(const Shuffle *); + virtual Expr visit(const VectorIntrinsic *); virtual Expr visit(const VectorReduce *); virtual Stmt visit(const LetStmt *); diff --git a/src/IRPrinter.cpp b/src/IRPrinter.cpp index 38f57e46649e..f609f28763fd 100644 --- a/src/IRPrinter.cpp +++ b/src/IRPrinter.cpp @@ -1073,6 +1073,16 @@ void IRPrinter::visit(const Shuffle *op) { } } +void IRPrinter::visit(const VectorIntrinsic *op) { + stream << "(" + << op->type + << ")vector_intrinsic(\"" + << op->name + << "\", "; + print_list(op->args); + stream << ")"; +} + void IRPrinter::visit(const VectorReduce *op) { stream << "(" << op->type diff --git a/src/IRPrinter.h b/src/IRPrinter.h index 666235988cd7..1e7cc048b805 100644 --- a/src/IRPrinter.h +++ b/src/IRPrinter.h @@ -194,6 +194,7 @@ class IRPrinter : public IRVisitor { void visit(const IfThenElse *) override; void visit(const Evaluate *) override; void visit(const Shuffle *) override; + void visit(const VectorIntrinsic *) override; void visit(const VectorReduce *) override; void visit(const Prefetch *) override; void visit(const Atomic *) override; diff --git a/src/IRVisitor.cpp 
b/src/IRVisitor.cpp index 7f9993987200..3b1956a51d8a 100644 --- a/src/IRVisitor.cpp +++ b/src/IRVisitor.cpp @@ -257,6 +257,12 @@ void IRVisitor::visit(const Shuffle *op) { } } +void IRVisitor::visit(const VectorIntrinsic *op) { + for (const auto &arg : op->args) { + arg.accept(this); + } +} + void IRVisitor::visit(const VectorReduce *op) { op->value.accept(this); } @@ -515,6 +521,12 @@ void IRGraphVisitor::visit(const Shuffle *op) { } } +void IRGraphVisitor::visit(const VectorIntrinsic *op) { + for (const auto &arg : op->args) { + include(arg); + } +} + void IRGraphVisitor::visit(const VectorReduce *op) { include(op->value); } diff --git a/src/IRVisitor.h b/src/IRVisitor.h index 4e1650ff22be..c9c170dd851d 100644 --- a/src/IRVisitor.h +++ b/src/IRVisitor.h @@ -71,6 +71,7 @@ class IRVisitor { virtual void visit(const IfThenElse *); virtual void visit(const Evaluate *); virtual void visit(const Shuffle *); + virtual void visit(const VectorIntrinsic *); virtual void visit(const VectorReduce *); virtual void visit(const Prefetch *); virtual void visit(const Fork *); @@ -142,6 +143,7 @@ class IRGraphVisitor : public IRVisitor { void visit(const IfThenElse *) override; void visit(const Evaluate *) override; void visit(const Shuffle *) override; + void visit(const VectorIntrinsic *) override; void visit(const VectorReduce *) override; void visit(const Prefetch *) override; void visit(const Acquire *) override; @@ -224,6 +226,8 @@ class VariadicVisitor { return ((T *)this)->visit((const Let *)node, std::forward(args)...); case IRNodeType::Shuffle: return ((T *)this)->visit((const Shuffle *)node, std::forward(args)...); + case IRNodeType::VectorIntrinsic: + return ((T *)this)->visit((const VectorIntrinsic *)node, std::forward(args)...); case IRNodeType::VectorReduce: return ((T *)this)->visit((const VectorReduce *)node, std::forward(args)...); // Explicitly list the Stmt types rather than using a @@ -286,6 +290,7 @@ class VariadicVisitor { case IRNodeType::Call: case IRNodeType::Let: case IRNodeType::Shuffle: + case IRNodeType::VectorIntrinsic: case IRNodeType::VectorReduce: internal_error << "Unreachable"; break; diff --git a/src/Simplify_Exprs.cpp b/src/Simplify_Exprs.cpp index 574754686cc6..5aefebfa611f 100644 --- a/src/Simplify_Exprs.cpp +++ b/src/Simplify_Exprs.cpp @@ -59,6 +59,11 @@ Expr Simplify::visit(const Broadcast *op, ExprInfo *bounds) { } } +Expr Simplify::visit(const VectorIntrinsic *op, ExprInfo *bounds) { + clear_bounds_info(bounds); + return op; +} + Expr Simplify::visit(const VectorReduce *op, ExprInfo *bounds) { Expr value = mutate(op->value, bounds); diff --git a/src/Simplify_Internal.h b/src/Simplify_Internal.h index a510e5c51f64..7dae150042a4 100644 --- a/src/Simplify_Internal.h +++ b/src/Simplify_Internal.h @@ -333,6 +333,7 @@ class Simplify : public VariadicVisitor { Expr visit(const Load *op, ExprInfo *bounds); Expr visit(const Call *op, ExprInfo *bounds); Expr visit(const Shuffle *op, ExprInfo *bounds); + Expr visit(const VectorIntrinsic *op, ExprInfo *bounds); Expr visit(const VectorReduce *op, ExprInfo *bounds); Expr visit(const Let *op, ExprInfo *bounds); Stmt visit(const LetStmt *op); diff --git a/src/StmtToHtml.cpp b/src/StmtToHtml.cpp index 21bc74dd20ac..1f5b1d20ccff 100644 --- a/src/StmtToHtml.cpp +++ b/src/StmtToHtml.cpp @@ -712,6 +712,13 @@ class StmtToHtml : public IRVisitor { stream << close_span(); } + void visit(const VectorIntrinsic *op) override { + stream << open_span("VectoIntrinsic"); + stream << open_span("Type") << op->type << close_span(); + 
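// (Illustrative note, not part of the patch: with the printer visitors
// above, an expression built as, e.g.,
//     Expr e = VectorIntrinsic::make(Int(32, 8), "dot_product", {a, b});
// renders as
//     (int32x8)vector_intrinsic("dot_product", a, b)
// where a and b stand in for arbitrary in-scope vector Exprs.)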
print_list(symbol("vector_intrinsic") + "(\"" + op->name + "\"", op->args, ")"); + stream << close_span(); + } + void visit(const VectorReduce *op) override { stream << open_span("VectorReduce"); stream << open_span("Type") << op->type << close_span(); From f0931c69796c9ebe5745d9105bc6c838a94dc070 Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Mon, 25 Jul 2022 13:15:30 -0400 Subject: [PATCH 02/55] update IRMatch for VectorIntrinsic node --- src/IRMatch.cpp | 19 +++++ src/IRMatch.h | 188 +++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 197 insertions(+), 10 deletions(-) diff --git a/src/IRMatch.cpp b/src/IRMatch.cpp index 6aba3155777f..f7bb7d457ff1 100644 --- a/src/IRMatch.cpp +++ b/src/IRMatch.cpp @@ -296,6 +296,22 @@ class IRMatch : public IRVisitor { } } + void visit(const VectorIntrinsic *op) override { + const VectorIntrinsic *e = expr.as(); + if (result && e && + types_match(op->type, e->type) && + e->name == op->name && + e->args.size() == op->args.size()) { + for (size_t i = 0; result && (i < e->args.size()); i++) { + // FIXME: should we early-out? Here and in Call* + expr = e->args[i]; + op->args[i].accept(this); + } + } else { + result = false; + } + } + void visit(const VectorReduce *op) override { const VectorReduce *e = expr.as(); if (result && e && op->op == e->op && types_match(op->type, e->type)) { @@ -505,6 +521,9 @@ bool equal_helper(const BaseExprNode &a, const BaseExprNode &b) noexcept { case IRNodeType::Shuffle: return (equal_helper(((const Shuffle &)a).vectors, ((const Shuffle &)b).vectors) && equal_helper(((const Shuffle &)a).indices, ((const Shuffle &)b).indices)); + case IRNodeType::VectorIntrinsic: + return (((const VectorIntrinsic &)a).name == ((const VectorIntrinsic &)b).name && + equal_helper(((const VectorIntrinsic &)a).args, ((const VectorIntrinsic &)b).args)); case IRNodeType::VectorReduce: // As with Cast above, we use equal instead of equal_helper // here, because while we know a.type == b.type, we don't know diff --git a/src/IRMatch.h b/src/IRMatch.h index 756b900e1f4d..089521c5bfed 100644 --- a/src/IRMatch.h +++ b/src/IRMatch.h @@ -1445,6 +1445,10 @@ struct Intrin { return rounding_shift_left(arg0, arg1); } else if (intrin == Call::rounding_shift_right) { return rounding_shift_right(arg0, arg1); + } else if (intrin == Call::bitwise_xor) { + return arg0 ^ arg1; + } else if (intrin == Call::bitwise_and) { + return arg0 & arg1; } Expr arg2 = std::get(args).make(state, type_hint); @@ -1521,6 +1525,14 @@ HALIDE_ALWAYS_INLINE auto intrin(Call::IntrinsicOp intrinsic_op, Args... 
args) noexcept -> Intrin<Args...> { return {intrinsic_op, pattern_arg(args)...}; } +template<typename A> +auto abs(A &&a) noexcept -> Intrin<A> { return {Call::abs, pattern_arg(a)}; } +template<typename A, typename B> +auto absd(A &&a, B &&b) noexcept -> Intrin<A, B> { return {Call::absd, pattern_arg(a), pattern_arg(b)}; } template<typename A, typename B> auto widening_add(A &&a, B &&b) noexcept -> Intrin<A, B> { return {Call::widening_add, pattern_arg(a), pattern_arg(b)}; } @@ -1569,6 +1581,36 @@ template<typename A, typename B> auto rounding_shift_right(A &&a, B &&b) noexcept -> Intrin<A, B> { return {Call::rounding_shift_right, pattern_arg(a), pattern_arg(b)}; } +template<typename A, typename B> +auto bitwise_xor(A &&a, B &&b) noexcept -> Intrin<A, B> { + return {Call::bitwise_xor, pattern_arg(a), pattern_arg(b)}; +} +template<typename A, typename B> +HALIDE_ALWAYS_INLINE auto operator^(A &&a, B &&b) noexcept -> auto { + assert_is_lvalue_if_expr<A>(); + assert_is_lvalue_if_expr<B>(); + return bitwise_xor(a, b); +} +template<typename A, typename B> +auto bitwise_and(A &&a, B &&b) noexcept -> Intrin<A, B> { + return {Call::bitwise_and, pattern_arg(a), pattern_arg(b)}; +} +template<typename A, typename B> +HALIDE_ALWAYS_INLINE auto operator&(A &&a, B &&b) noexcept -> auto { + assert_is_lvalue_if_expr<A>(); + assert_is_lvalue_if_expr<B>(); + return bitwise_and(a, b); +} +template<typename A, typename B> +auto bitwise_or(A &&a, B &&b) noexcept -> Intrin<A, B> { + return {Call::bitwise_or, pattern_arg(a), pattern_arg(b)}; +} +template<typename A, typename B> +HALIDE_ALWAYS_INLINE auto operator|(A &&a, B &&b) noexcept -> auto { + assert_is_lvalue_if_expr<A>(); + assert_is_lvalue_if_expr<B>(); + return bitwise_or(a, b); +} template<typename A, typename B, typename C> auto mul_shift_right(A &&a, B &&b, C &&c) noexcept -> Intrin<A, B, C> { return {Call::mul_shift_right, pattern_arg(a), pattern_arg(b), pattern_arg(c)}; } @@ -1839,6 +1881,109 @@ HALIDE_ALWAYS_INLINE auto ramp(A &&a, B &&b, C &&c) noexcept -> RampOp<A, B, C> +template<typename... Args> +struct VectorIntrinOp { + struct pattern_tag {}; + const std::string &intrin_name; + std::tuple<Args...> args; + + static constexpr uint32_t binds = bitwise_or_reduce((bindings<Args>::mask)...); + + constexpr static IRNodeType min_node_type = IRNodeType::VectorIntrinsic; + constexpr static IRNodeType max_node_type = IRNodeType::VectorIntrinsic; + constexpr static bool canonical = and_reduce((Args::canonical)...); + + template<int i, uint32_t bound, typename = typename std::enable_if<(i < sizeof...(Args))>::type> + HALIDE_ALWAYS_INLINE bool match_args(int, const VectorIntrinsic &v, MatcherState &state) const noexcept { + using T = decltype(std::get<i>(args)); + return (std::get<i>(args).template match<bound>(*v.args[i].get(), state) && + match_args<i + 1, bound | bindings<T>::mask>(0, v, state)); + } + + template<int i, uint32_t bound> + HALIDE_ALWAYS_INLINE bool match_args(double, const VectorIntrinsic &v, MatcherState &state) const noexcept { + return true; + } + + template<uint32_t bound> + HALIDE_ALWAYS_INLINE bool match(const BaseExprNode &e, MatcherState &state) const noexcept { + if (e.node_type != IRNodeType::VectorIntrinsic) { + return false; + } + const VectorIntrinsic &v = (const VectorIntrinsic &)e; + return (v.name == intrin_name && match_args<0, bound>(0, v, state)); + } + + template<int i, typename = typename std::enable_if<(i < sizeof...(Args))>::type> + HALIDE_ALWAYS_INLINE void print_args(int, std::ostream &s) const { + s << std::get<i>(args); + if (i + 1 < sizeof...(Args)) { + s << ", "; + } + print_args<i + 1>(0, s); + } + + template<int i> + HALIDE_ALWAYS_INLINE void print_args(double, std::ostream &s) const { + } + + HALIDE_ALWAYS_INLINE + void print_args(std::ostream &s) const { + print_args<0>(0, s); + } + + HALIDE_ALWAYS_INLINE + Expr make(MatcherState &state, halide_type_t type_hint) const { + std::vector<Expr> r_args(sizeof...(Args)); + // TODO(rootjalex): How do we do type hints for the args? + // TODO(rootjalex): Is there a way to do basically an unrolled + // loop of the below? this is ugly. 
+ // Supposedly C++20 will have constexpr std::transform, perhaps + // we can use that when Halide upgrades. + + r_args[0] = std::get<0>(args).make(state, {}); + if constexpr (sizeof...(Args) > 1) { + r_args[1] = std::get<1>(args).make(state, {}); + } + if constexpr (sizeof...(Args) > 2) { + r_args[2] = std::get<2>(args).make(state, {}); + } + + // for (int i = 0; i < sizeof...(Args); i++) { + // // TODO(rootjalex): how do we do type-hints here? + // args[i] = std::get(args).make(state, {}); + // } + return VectorIntrinsic::make(type_hint, intrin_name, r_args); + } + + constexpr static bool foldable = false; + + HALIDE_ALWAYS_INLINE + VectorIntrinOp(const std::string &name, Args... args) noexcept + : intrin_name(name), args(args...) { + static_assert(sizeof...(Args) > 0 && sizeof...(Args) <= 3, + "VectorIntrinsicOp must have non-zero arguments, and update make() if more than 3 arguments."); + } +}; + +template +std::ostream &operator<<(std::ostream &s, const VectorIntrinOp &op) { + // TODO(rootjalex): Should we print the type? + s << "vector_intrin(\""; + s << op.intrin_name << "\", "; + op.print_args(s); + s << ")"; + return s; +} + +template +HALIDE_ALWAYS_INLINE auto v_intrin(const std::string &name, Args... args) noexcept -> VectorIntrinOp { + return {name, pattern_arg(args)...}; +} + template struct VectorReduceOp { struct pattern_tag {}; @@ -1895,6 +2040,12 @@ HALIDE_ALWAYS_INLINE auto h_add(A &&a, B lanes) noexcept -> VectorReduceOp +HALIDE_ALWAYS_INLINE auto h_satadd(A &&a, B lanes) noexcept -> VectorReduceOp { + assert_is_lvalue_if_expr(); + return {pattern_arg(a), pattern_arg(lanes)}; +} + template HALIDE_ALWAYS_INLINE auto h_min(A &&a, B lanes) noexcept -> VectorReduceOp { assert_is_lvalue_if_expr(); @@ -2273,6 +2424,8 @@ template struct IsFloat { struct pattern_tag {}; A a; + int bits; + int lanes; constexpr static uint32_t binds = bindings::mask; @@ -2287,7 +2440,7 @@ struct IsFloat { void make_folded_const(halide_scalar_value_t &val, halide_type_t &ty, MatcherState &state) const { // a is almost certainly a very simple pattern (e.g. a wild), so just inline the make method. Type t = a.make(state, {}).type(); - val.u.u64 = t.is_float(); + val.u.u64 = t.is_float() && (bits == 0 || t.bits() == bits) && (lanes == 0 || t.lanes() == lanes); ty.code = halide_type_uint; ty.bits = 1; ty.lanes = t.lanes(); @@ -2295,14 +2448,21 @@ struct IsFloat { }; template -HALIDE_ALWAYS_INLINE auto is_float(A &&a) noexcept -> IsFloat { +HALIDE_ALWAYS_INLINE auto is_float(A &&a, int bits = 0, int lanes = 0) noexcept -> IsFloat { assert_is_lvalue_if_expr(); - return {pattern_arg(a)}; + return {pattern_arg(a), bits, lanes}; } template std::ostream &operator<<(std::ostream &s, const IsFloat &op) { - s << "is_float(" << op.a << ")"; + s << "is_float(" << op.a; + if (op.bits > 0) { + s << ", " << op.bits; + } + if (op.lanes > 0) { + s << ", " << op.lanes; + } + s << ")"; return s; } @@ -2311,6 +2471,7 @@ struct IsInt { struct pattern_tag {}; A a; int bits; + int lanes; constexpr static uint32_t binds = bindings::mask; @@ -2325,7 +2486,7 @@ struct IsInt { void make_folded_const(halide_scalar_value_t &val, halide_type_t &ty, MatcherState &state) const { // a is almost certainly a very simple pattern (e.g. a wild), so just inline the make method. 
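// (Sketch of how the new lanes parameter is used, mirroring the bits
// check below: a rule such as
//     rewrite(h_add(widening_mul(x, y), lanes),
//             v_intrin("dot_product", x, y),
//             is_int(x, 16, lanes * 2) && is_int(y, 16, lanes * 2))
// now fires only when x and y are int16 vectors with exactly twice the
// output lane count; plain is_int(x, 16) would accept any lane count.)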
Type t = a.make(state, {}).type(); - val.u.u64 = t.is_int() && (bits == 0 || t.bits() == bits); + val.u.u64 = t.is_int() && (bits == 0 || t.bits() == bits) && (lanes == 0 || t.lanes() == lanes); ty.code = halide_type_uint; ty.bits = 1; ty.lanes = t.lanes(); @@ -2333,9 +2494,9 @@ struct IsInt { }; template -HALIDE_ALWAYS_INLINE auto is_int(A &&a, int bits = 0) noexcept -> IsInt { +HALIDE_ALWAYS_INLINE auto is_int(A &&a, int bits = 0, int lanes = 0) noexcept -> IsInt { assert_is_lvalue_if_expr(); - return {pattern_arg(a), bits}; + return {pattern_arg(a), bits, lanes}; } template @@ -2344,6 +2505,9 @@ std::ostream &operator<<(std::ostream &s, const IsInt &op) { if (op.bits > 0) { s << ", " << op.bits; } + if (op.lanes > 0) { + s << ", " << op.lanes; + } s << ")"; return s; } @@ -2353,6 +2517,7 @@ struct IsUInt { struct pattern_tag {}; A a; int bits; + int lanes; constexpr static uint32_t binds = bindings::mask; @@ -2367,7 +2532,7 @@ struct IsUInt { void make_folded_const(halide_scalar_value_t &val, halide_type_t &ty, MatcherState &state) const { // a is almost certainly a very simple pattern (e.g. a wild), so just inline the make method. Type t = a.make(state, {}).type(); - val.u.u64 = t.is_uint() && (bits == 0 || t.bits() == bits); + val.u.u64 = t.is_uint() && (bits == 0 || t.bits() == bits) && (lanes == 0 || t.lanes() == lanes); ty.code = halide_type_uint; ty.bits = 1; ty.lanes = t.lanes(); @@ -2375,9 +2540,9 @@ struct IsUInt { }; template -HALIDE_ALWAYS_INLINE auto is_uint(A &&a, int bits = 0) noexcept -> IsUInt { +HALIDE_ALWAYS_INLINE auto is_uint(A &&a, int bits = 0, int lanes = 0) noexcept -> IsUInt { assert_is_lvalue_if_expr(); - return {pattern_arg(a), bits}; + return {pattern_arg(a), bits, lanes}; } template @@ -2386,6 +2551,9 @@ std::ostream &operator<<(std::ostream &s, const IsUInt &op) { if (op.bits > 0) { s << ", " << op.bits; } + if (op.lanes > 0) { + s << ", " << op.lanes; + } s << ")"; return s; } From ac5b6f2b1032c2e30012236b16b0615fc67626a1 Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Mon, 25 Jul 2022 17:39:13 -0400 Subject: [PATCH 03/55] implement optimize_x86_instructions --- Makefile | 6 +- src/CMakeLists.txt | 2 + src/CodeGen_X86.cpp | 355 +------------------------- src/IRMatch.h | 33 +++ src/Lower.cpp | 8 + src/X86Optimize.cpp | 588 ++++++++++++++++++++++++++++++++++++++++++++ src/X86Optimize.h | 22 ++ 7 files changed, 664 insertions(+), 350 deletions(-) create mode 100644 src/X86Optimize.cpp create mode 100644 src/X86Optimize.h diff --git a/Makefile b/Makefile index 61d78a7cc2dd..dabec3536bab 100644 --- a/Makefile +++ b/Makefile @@ -573,7 +573,8 @@ SOURCE_FILES = \ Var.cpp \ VectorizeLoops.cpp \ WasmExecutor.cpp \ - WrapCalls.cpp + WrapCalls.cpp \ + X86Optimize.cpp # The externally-visible header files that go into making Halide.h. # Don't include anything here that includes llvm headers. 
@@ -738,7 +739,8 @@ HEADER_FILES = \ Util.h \ Var.h \ VectorizeLoops.h \ - WrapCalls.h + WrapCalls.h \ + X86Optimize.h OBJECTS = $(SOURCE_FILES:%.cpp=$(BUILD_DIR)/%.o) HEADERS = $(HEADER_FILES:%.h=$(SRC_DIR)/%.h) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index e0015a65551a..64d1a9f4316e 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -165,6 +165,7 @@ set(HEADER_FILES VectorizeLoops.h WasmExecutor.h WrapCalls.h + X86Optimize.h ) set(SOURCE_FILES @@ -342,6 +343,7 @@ set(SOURCE_FILES VectorizeLoops.cpp WasmExecutor.cpp WrapCalls.cpp + X86Optimize.cpp ) ## diff --git a/src/CodeGen_X86.cpp b/src/CodeGen_X86.cpp index 7529c31688f4..0e99407726b8 100644 --- a/src/CodeGen_X86.cpp +++ b/src/CodeGen_X86.cpp @@ -7,6 +7,7 @@ #include "LLVM_Headers.h" #include "Simplify.h" #include "Util.h" +#include "X86Optimize.h" namespace Halide { namespace Internal { @@ -21,30 +22,6 @@ using namespace llvm; namespace { -// Populate feature flags in a target according to those implied by -// existing flags, so that instruction patterns can just check for the -// oldest feature flag that supports an instruction. -Target complete_x86_target(Target t) { - if (t.has_feature(Target::AVX512_SapphireRapids)) { - t.set_feature(Target::AVX512_Cannonlake); - } - if (t.has_feature(Target::AVX512_Cannonlake)) { - t.set_feature(Target::AVX512_Skylake); - } - if (t.has_feature(Target::AVX512_Cannonlake) || - t.has_feature(Target::AVX512_Skylake) || - t.has_feature(Target::AVX512_KNL)) { - t.set_feature(Target::AVX2); - } - if (t.has_feature(Target::AVX2)) { - t.set_feature(Target::AVX); - } - if (t.has_feature(Target::AVX)) { - t.set_feature(Target::SSE41); - } - return t; -} - /** A code generator that emits x86 code from a given Halide stmt. */ class CodeGen_X86 : public CodeGen_Posix { public: @@ -69,10 +46,7 @@ class CodeGen_X86 : public CodeGen_Posix { /** Nodes for which we want to emit specific sse/avx intrinsics */ // @{ - void visit(const Add *) override; - void visit(const Sub *) override; void visit(const Cast *) override; - void visit(const Call *) override; void visit(const GT *) override; void visit(const LT *) override; void visit(const LE *) override; @@ -83,7 +57,7 @@ class CodeGen_X86 : public CodeGen_Posix { void visit(const Allocate *) override; void visit(const Load *) override; void visit(const Store *) override; - void codegen_vector_reduce(const VectorReduce *, const Expr &init) override; + void visit(const VectorIntrinsic *) override; // @} private: @@ -265,85 +239,6 @@ void CodeGen_X86::init_module() { } } -// i32(i16_a)*i32(i16_b) +/- i32(i16_c)*i32(i16_d) can be done by -// interleaving a, c, and b, d, and then using dot_product. -bool should_use_dot_product(const Expr &a, const Expr &b, vector &result) { - Type t = a.type(); - internal_assert(b.type() == t); - - if (!(t.is_int() && t.bits() == 32 && t.lanes() >= 4)) { - return false; - } - - const Call *ma = Call::as_intrinsic(a, {Call::widening_mul}); - const Call *mb = Call::as_intrinsic(b, {Call::widening_mul}); - // dot_product can't handle mixed type widening muls. - if (ma && ma->args[0].type() != ma->args[1].type()) { - return false; - } - if (mb && mb->args[0].type() != mb->args[1].type()) { - return false; - } - // If the operands are widening shifts, we might be able to treat these as - // multiplies. 
- const Call *sa = Call::as_intrinsic(a, {Call::widening_shift_left}); - const Call *sb = Call::as_intrinsic(b, {Call::widening_shift_left}); - if (sa && !is_const(sa->args[1])) { - sa = nullptr; - } - if (sb && !is_const(sb->args[1])) { - sb = nullptr; - } - if ((ma || sa) && (mb || sb)) { - Expr a0 = ma ? ma->args[0] : sa->args[0]; - Expr a1 = ma ? ma->args[1] : lossless_cast(sa->args[0].type(), simplify(make_const(sa->type, 1) << sa->args[1])); - Expr b0 = mb ? mb->args[0] : sb->args[0]; - Expr b1 = mb ? mb->args[1] : lossless_cast(sb->args[0].type(), simplify(make_const(sb->type, 1) << sb->args[1])); - if (a1.defined() && b1.defined()) { - std::vector args = {a0, a1, b0, b1}; - result.swap(args); - return true; - } - } - return false; -} - -void CodeGen_X86::visit(const Add *op) { - vector matches; - if (should_use_dot_product(op->a, op->b, matches)) { - Expr ac = Shuffle::make_interleave({matches[0], matches[2]}); - Expr bd = Shuffle::make_interleave({matches[1], matches[3]}); - value = call_overloaded_intrin(op->type, "dot_product", {ac, bd}); - if (value) { - return; - } - } - CodeGen_Posix::visit(op); -} - -void CodeGen_X86::visit(const Sub *op) { - vector matches; - if (should_use_dot_product(op->a, op->b, matches)) { - // Negate one of the factors in the second expression - Expr negative_2 = lossless_negate(matches[2]); - Expr negative_3 = lossless_negate(matches[3]); - if (negative_2.defined() || negative_3.defined()) { - if (negative_2.defined()) { - matches[2] = negative_2; - } else { - matches[3] = negative_3; - } - Expr ac = Shuffle::make_interleave({matches[0], matches[2]}); - Expr bd = Shuffle::make_interleave({matches[1], matches[3]}); - value = call_overloaded_intrin(op->type, "dot_product", {ac, bd}); - if (value) { - return; - } - } - } - CodeGen_Posix::visit(op); -} - void CodeGen_X86::visit(const GT *op) { Type t = op->a.type(); @@ -450,43 +345,12 @@ void CodeGen_X86::visit(const Select *op) { } void CodeGen_X86::visit(const Cast *op) { - if (!op->type.is_vector()) { // We only have peephole optimizations for vectors in here. CodeGen_Posix::visit(op); return; } - struct Pattern { - string intrin; - Expr pattern; - }; - - // clang-format off - static Pattern patterns[] = { - // This isn't rounding_multiply_quantzied(i16, i16, 15) because it doesn't - // saturate the result. - {"pmulhrs", i16(rounding_shift_right(widening_mul(wild_i16x_, wild_i16x_), 15))}, - - {"saturating_narrow", i16_sat(wild_i32x_)}, - {"saturating_narrow", u16_sat(wild_i32x_)}, - {"saturating_narrow", i8_sat(wild_i16x_)}, - {"saturating_narrow", u8_sat(wild_i16x_)}, - - {"f32_to_bf16", bf16(wild_f32x_)}, - }; - // clang-format on - - vector matches; - for (const Pattern &p : patterns) { - if (expr_match(p.pattern, op, matches)) { - value = call_overloaded_intrin(op->type, p.intrin, matches); - if (value) { - return; - } - } - } - if (const Call *mul = Call::as_intrinsic(op->value, {Call::widening_mul})) { if (op->value.type().bits() < op->type.bits() && op->type.bits() <= 32) { // LLVM/x86 really doesn't like 8 -> 16 bit multiplication. If we're @@ -501,216 +365,6 @@ void CodeGen_X86::visit(const Cast *op) { CodeGen_Posix::visit(op); } -void CodeGen_X86::visit(const Call *op) { - if (!op->type.is_vector()) { - // We only have peephole optimizations for vectors in here. - CodeGen_Posix::visit(op); - return; - } - - // A 16-bit mul-shift-right of less than 16 can sometimes be rounded up to a - // full 16 to use pmulh(u)w by left-shifting one of the operands. 
This is - // handled here instead of in the lowering of mul_shift_right because it's - // unlikely to be a good idea on platforms other than x86, as it adds an - // extra shift in the fully-lowered case. - if ((op->type.element_of() == UInt(16) || - op->type.element_of() == Int(16)) && - op->is_intrinsic(Call::mul_shift_right)) { - internal_assert(op->args.size() == 3); - const uint64_t *shift = as_const_uint(op->args[2]); - if (shift && *shift < 16 && *shift >= 8) { - Type narrow = op->type.with_bits(8); - Expr narrow_a = lossless_cast(narrow, op->args[0]); - Expr narrow_b = narrow_a.defined() ? Expr() : lossless_cast(narrow, op->args[1]); - int shift_left = 16 - (int)(*shift); - if (narrow_a.defined()) { - codegen(mul_shift_right(op->args[0] << shift_left, op->args[1], 16)); - return; - } else if (narrow_b.defined()) { - codegen(mul_shift_right(op->args[0], op->args[1] << shift_left, 16)); - return; - } - } - } else if (op->type.is_int() && - op->type.bits() <= 16 && - op->is_intrinsic(Call::rounding_halving_add)) { - // We can redirect signed rounding halving add to unsigned rounding - // halving add by adding 128 / 32768 to the result if the sign of the - // args differs. - internal_assert(op->args.size() == 2); - Type t = op->type.with_code(halide_type_uint); - Expr a = cast(t, op->args[0]); - Expr b = cast(t, op->args[1]); - codegen(cast(op->type, rounding_halving_add(a, b) + ((a ^ b) & (1 << (t.bits() - 1))))); - return; - } else if (op->is_intrinsic(Call::absd)) { - internal_assert(op->args.size() == 2); - if (op->args[0].type().is_uint()) { - // On x86, there are many 3-instruction sequences to compute absd of - // unsigned integers. This one consists solely of instructions with - // throughput of 3 ops per cycle on Cannon Lake. - // - // Solution due to Wojciech Mula: - // http://0x80.pl/notesen/2018-03-11-sse-abs-unsigned.html - codegen(saturating_sub(op->args[0], op->args[1]) | saturating_sub(op->args[1], op->args[0])); - return; - } else if (op->args[0].type().is_int()) { - codegen(Max::make(op->args[0], op->args[1]) - Min::make(op->args[0], op->args[1])); - return; - } - } - - struct Pattern { - string intrin; - Expr pattern; - }; - - // clang-format off - static Pattern patterns[] = { - {"pmulh", mul_shift_right(wild_i16x_, wild_i16x_, 16)}, - {"pmulh", mul_shift_right(wild_u16x_, wild_u16x_, 16)}, - {"saturating_pmulhrs", rounding_mul_shift_right(wild_i16x_, wild_i16x_, 15)}, - }; - // clang-format on - - vector matches; - for (const auto &pattern : patterns) { - if (expr_match(pattern.pattern, op, matches)) { - value = call_overloaded_intrin(op->type, pattern.intrin, matches); - if (value) { - return; - } - } - } - - CodeGen_Posix::visit(op); -} - -void CodeGen_X86::codegen_vector_reduce(const VectorReduce *op, const Expr &init) { - if (op->op != VectorReduce::Add && op->op != VectorReduce::SaturatingAdd) { - CodeGen_Posix::codegen_vector_reduce(op, init); - return; - } - const int factor = op->value.type().lanes() / op->type.lanes(); - - struct Pattern { - VectorReduce::Operator reduce_op; - int factor; - Expr pattern; - const char *intrin; - Type narrow_type; - uint32_t flags = 0; - enum { - CombineInit = 1 << 0, - SwapOperands = 1 << 1, - SingleArg = 1 << 2, - }; - }; - // clang-format off - // These patterns are roughly sorted "best to worst", in case there are two - // patterns that match the expression. 
- static const Pattern patterns[] = { - // 4-way dot products - {VectorReduce::Add, 4, i32(widening_mul(wild_u8x_, wild_i8x_)), "dot_product", {}, Pattern::CombineInit}, - {VectorReduce::Add, 4, i32(widening_mul(wild_i8x_, wild_u8x_)), "dot_product", {}, Pattern::CombineInit | Pattern::SwapOperands}, - {VectorReduce::SaturatingAdd, 4, i32(widening_mul(wild_u8x_, wild_i8x_)), "saturating_dot_product", {}, Pattern::CombineInit}, - {VectorReduce::SaturatingAdd, 4, i32(widening_mul(wild_i8x_, wild_u8x_)), "saturating_dot_product", {}, Pattern::CombineInit | Pattern::SwapOperands}, - - // 2-way dot products - {VectorReduce::Add, 2, i32(widening_mul(wild_i8x_, wild_i8x_)), "dot_product", Int(16)}, - {VectorReduce::Add, 2, i32(widening_mul(wild_i8x_, wild_u8x_)), "dot_product", Int(16)}, - {VectorReduce::Add, 2, i32(widening_mul(wild_u8x_, wild_i8x_)), "dot_product", Int(16)}, - {VectorReduce::Add, 2, i32(widening_mul(wild_u8x_, wild_u8x_)), "dot_product", Int(16)}, - {VectorReduce::SaturatingAdd, 2, i32(widening_mul(wild_u8x_, wild_i8x_)), "saturating_dot_product", {}, Pattern::CombineInit}, - {VectorReduce::SaturatingAdd, 2, i32(widening_mul(wild_i8x_, wild_u8x_)), "saturating_dot_product", {}, Pattern::CombineInit | Pattern::SwapOperands}, - {VectorReduce::SaturatingAdd, 2, widening_mul(wild_u8x_, wild_i8x_), "saturating_dot_product"}, - {VectorReduce::SaturatingAdd, 2, widening_mul(wild_i8x_, wild_u8x_), "saturating_dot_product", {}, Pattern::SwapOperands}, - - {VectorReduce::Add, 2, i32(widening_mul(wild_i16x_, wild_i16x_)), "dot_product", {}, Pattern::CombineInit}, - {VectorReduce::Add, 2, i32(widening_mul(wild_i16x_, wild_i16x_)), "dot_product", Int(16)}, - {VectorReduce::SaturatingAdd, 2, i32(widening_mul(wild_i16x_, wild_i16x_)), "saturating_dot_product", {}, Pattern::CombineInit}, - - {VectorReduce::Add, 2, wild_f32x_ * wild_f32x_, "dot_product", BFloat(16), Pattern::CombineInit}, - - // One could do a horizontal widening addition with - // other dot_products against a vector of ones. Currently disabled - // because I haven't found other cases where it's clearly better. 
- - {VectorReduce::Add, 2, u16(wild_u8x_), "horizontal_widening_add", {}, Pattern::SingleArg}, - {VectorReduce::Add, 2, i16(wild_u8x_), "horizontal_widening_add", {}, Pattern::SingleArg}, - {VectorReduce::Add, 2, i16(wild_i8x_), "horizontal_widening_add", {}, Pattern::SingleArg}, - }; - // clang-format on - - std::vector matches; - for (const Pattern &p : patterns) { - if (op->op != p.reduce_op || p.factor != factor) { - continue; - } - if (expr_match(p.pattern, op->value, matches)) { - if (p.flags & Pattern::SingleArg) { - Expr a = matches[0]; - - if (p.narrow_type.bits() > 0) { - a = lossless_cast(p.narrow_type.with_lanes(a.type().lanes()), a); - } - if (!a.defined()) { - continue; - } - - if (init.defined() && (p.flags & Pattern::CombineInit)) { - value = call_overloaded_intrin(op->type, p.intrin, {init, a}); - if (value) { - return; - } - } else { - value = call_overloaded_intrin(op->type, p.intrin, {a}); - if (value) { - if (init.defined()) { - Value *x = value; - Value *y = codegen(init); - value = builder->CreateAdd(x, y); - } - return; - } - } - } else { - Expr a = matches[0]; - Expr b = matches[1]; - if (p.flags & Pattern::SwapOperands) { - std::swap(a, b); - } - if (p.narrow_type.bits() > 0) { - a = lossless_cast(p.narrow_type.with_lanes(a.type().lanes()), a); - b = lossless_cast(p.narrow_type.with_lanes(b.type().lanes()), b); - } - if (!a.defined() || !b.defined()) { - continue; - } - - if (init.defined() && (p.flags & Pattern::CombineInit)) { - value = call_overloaded_intrin(op->type, p.intrin, {init, a, b}); - if (value) { - return; - } - } else { - value = call_overloaded_intrin(op->type, p.intrin, {a, b}); - if (value) { - if (init.defined()) { - Value *x = value; - Value *y = codegen(init); - value = builder->CreateAdd(x, y); - } - return; - } - } - } - } - } - - CodeGen_Posix::codegen_vector_reduce(op, init); -} - void CodeGen_X86::visit(const Allocate *op) { ScopedBinding bind(mem_type, op->name, op->memory_type); CodeGen_Posix::visit(op); @@ -743,6 +397,11 @@ void CodeGen_X86::visit(const Store *op) { CodeGen_Posix::visit(op); } +void CodeGen_X86::visit(const VectorIntrinsic *op) { + value = call_overloaded_intrin(op->type, op->name, op->args); + internal_assert(value) << "CodeGen_X86 failed on " << Expr(op) << "\n"; +} + string CodeGen_X86::mcpu_target() const { // Perform an ad-hoc guess for the -mcpu given features. // WARNING: this is used to drive -mcpu, *NOT* -mtune! 
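// (Worked example, illustrative only: on an SSE2-capable target the new
// lowering pass rewrites
//     h_add(widening_mul(a_i16x16, b_i16x16), 8)        // i32x8 result
// into
//     (int32x8)vector_intrinsic("dot_product", a, b)
// and the CodeGen_X86::visit(const VectorIntrinsic *) override above then
// resolves "dot_product" through call_overloaded_intrin, presumably
// against the pmaddwd-family entries registered in init_module().)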
diff --git a/src/IRMatch.h b/src/IRMatch.h index 089521c5bfed..cc4a4b490664 100644 --- a/src/IRMatch.h +++ b/src/IRMatch.h @@ -2198,6 +2198,39 @@ HALIDE_ALWAYS_INLINE auto cast(halide_type_t t, A &&a) noexcept -> CastOp +struct TypeHint { + struct pattern_tag {}; + Type type; + A a; + + constexpr static uint32_t binds = bindings::mask; + + constexpr static IRNodeType min_node_type = IRNodeType::Cast; + constexpr static IRNodeType max_node_type = IRNodeType::Cast; + constexpr static bool canonical = A::canonical; + + HALIDE_ALWAYS_INLINE + Expr make(MatcherState &state, halide_type_t type_hint) const { + return a.make(state, type); + } + + constexpr static bool foldable = false; +}; + +template +std::ostream &operator<<(std::ostream &s, const TypeHint &op) { + s << "typed(" << op.type << ", " << op.a << ")"; + return s; +} + +template +HALIDE_ALWAYS_INLINE auto typed(halide_type_t t, A &&a) noexcept -> TypeHint { + assert_is_lvalue_if_expr(); + return {t, pattern_arg(a)}; +} + template struct Fold { struct pattern_tag {}; diff --git a/src/Lower.cpp b/src/Lower.cpp index 38ad867686e6..20d98a20562a 100644 --- a/src/Lower.cpp +++ b/src/Lower.cpp @@ -75,6 +75,7 @@ #include "UnsafePromises.h" #include "VectorizeLoops.h" #include "WrapCalls.h" +#include "X86Optimize.h" namespace Halide { namespace Internal { @@ -443,6 +444,13 @@ void lower_impl(const vector &output_funcs, debug(1) << "Skipping GPU offload...\n"; } + if (t.arch == Target::X86) { + debug(1) << "Performing x86-specific vector instruction selection...\n"; + s = optimize_x86_instructions(s, t); + debug(2) << "Lowering after performing x86-specific vector instruction selection:\n" + << s << "\n\n"; + } + // TODO: This needs to happen before lowering parallel tasks, because global // images used inside parallel loops are rewritten from loads from images to // loads from closure parameters. Closure parameters are missing the Buffer<> diff --git a/src/X86Optimize.cpp b/src/X86Optimize.cpp new file mode 100644 index 000000000000..3a1d3df3b6f8 --- /dev/null +++ b/src/X86Optimize.cpp @@ -0,0 +1,588 @@ +#include "X86Optimize.h" + +#include "CSE.h" +// FIXME: move lower_int_uint_div out of CodeGen_Internal to remove this dependency. +#include "CodeGen_Internal.h" +#include "FindIntrinsics.h" +#include "IR.h" +#include "IRMatch.h" +#include "IRMutator.h" +#include "IROperator.h" +#include "Simplify.h" + +namespace Halide { +namespace Internal { + +// Populate feature flags in a target according to those implied by +// existing flags, so that instruction patterns can just check for the +// oldest feature flag that supports an instruction. +Target complete_x86_target(Target t) { + if (t.has_feature(Target::AVX512_SapphireRapids)) { + t.set_feature(Target::AVX512_Cannonlake); + } + if (t.has_feature(Target::AVX512_Cannonlake)) { + t.set_feature(Target::AVX512_Skylake); + } + if (t.has_feature(Target::AVX512_Cannonlake) || + t.has_feature(Target::AVX512_Skylake) || + t.has_feature(Target::AVX512_KNL)) { + t.set_feature(Target::AVX2); + } + if (t.has_feature(Target::AVX2)) { + t.set_feature(Target::AVX); + } + if (t.has_feature(Target::AVX)) { + t.set_feature(Target::SSE41); + } + return t; +} + +#if defined(WITH_X86) + +namespace { + +// i32(i16_a)*i32(i16_b) +/- i32(i16_c)*i32(i16_d) can be done by +// interleaving a, c, and b, d, and then using dot_product. 
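// For instance (illustrative): with int16 vectors a, b, c, d,
//     i32(a) * i32(b) + i32(c) * i32(d)
// can be computed as
//     dot_product(interleave(a, c), interleave(b, d))
// because pmaddwd multiplies adjacent lane pairs and sums each pair into
// a single i32 lane. The subtract case is handled by negating one factor
// first (see the Sub visitor below).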
+bool should_use_dot_product(const Expr &a, const Expr &b, std::vector &result) { + Type t = a.type(); + internal_assert(b.type() == t) << a << " and " << b << " don't match types\n"; + + if (!(t.is_int() && t.bits() == 32 && t.lanes() >= 4)) { + return false; + } + + const Call *ma = Call::as_intrinsic(a, {Call::widening_mul}); + const Call *mb = Call::as_intrinsic(b, {Call::widening_mul}); + // dot_product can't handle mixed type widening muls. + if (ma && ma->args[0].type() != ma->args[1].type()) { + return false; + } + if (mb && mb->args[0].type() != mb->args[1].type()) { + return false; + } + // If the operands are widening shifts, we might be able to treat these as + // multiplies. + const Call *sa = Call::as_intrinsic(a, {Call::widening_shift_left}); + const Call *sb = Call::as_intrinsic(b, {Call::widening_shift_left}); + if (sa && !is_const(sa->args[1])) { + sa = nullptr; + } + if (sb && !is_const(sb->args[1])) { + sb = nullptr; + } + if ((ma || sa) && (mb || sb)) { + Expr a0 = ma ? ma->args[0] : sa->args[0]; + Expr a1 = ma ? ma->args[1] : lossless_cast(sa->args[0].type(), simplify(make_const(sa->type, 1) << sa->args[1])); + Expr b0 = mb ? mb->args[0] : sb->args[0]; + Expr b1 = mb ? mb->args[1] : lossless_cast(sb->args[0].type(), simplify(make_const(sb->type, 1) << sb->args[1])); + if (a1.defined() && b1.defined()) { + std::vector args = {a0, a1, b0, b1}; + result.swap(args); + return true; + } + } + return false; +} + +// // Templated saturating casts for use in rewrite rules. +// template +// auto saturating_cast() + +/** A code generator that replaces Halide IR with VectorIntrinsics specific to x86. */ +class Optimize_X86 : public IRMutator { +public: + /** Create an x86 code generator. Processor features can be + * enabled using the appropriate flags in the target struct. */ + Optimize_X86(const Target &t) : target(t) { + } + +protected: + + bool should_peephole_optimize(const Type &type) { + // We only have peephole optimizations for vectors here. + // FIXME: should we only optimize vectors that are multiples of the native vector width? + // when we do, we fail simd_op_check tests on weird vector sizes. + return type.is_vector(); + } + + Expr visit(const Div *op) override { + if (!should_peephole_optimize(op->type) || !op->type.is_int_or_uint()) { + return IRMutator::visit(op); + } + // Lower division here in order to do pattern-matching on intrinsics. + return mutate(lower_int_uint_div(op->a, op->b)); + } + + /** Nodes for which we want to emit specific sse/avx intrinsics */ + Expr visit(const Add *op) override { + if (!should_peephole_optimize(op->type)) { + return IRMutator::visit(op); + } + + std::vector matches; + // TODO(rootjalex): is it possible to rewrite should_use_dot_product + // as a series of rewrite-rules? lossless_cast is the hardest part. + const int lanes = op->type.lanes(); + + // FIXME: should we check for accumulating dot_products first? + // can there even be overlap between these? + auto rewrite = IRMatcher::rewriter(IRMatcher::add(op->a, op->b), op->type); + if ( + // Only AVX512_SapphireRapids has accumulating dot products. + target.has_feature(Target::AVX512_SapphireRapids) && + // FIXME: add the float16 -> float32 versions as well. 
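// (Context, hedged: the accumulating "dot_product" forms below appear
// intended to map to the AVX512-VNNI-style instructions, e.g. vpdpbusd
// for u8*i8 products accumulated into i32 lanes -- hence the
// SapphireRapids gate. A matching input looks roughly like
//     acc_i32x16 + h_add(i32x64(widening_mul(a_u8x64, b_i8x64)), 16)
// with placeholder variable names.)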
+ (op->type.element_of() == Int(32)) && + + // Accumulating pmaddubsw + (rewrite( + x + h_add(cast(Int(32, lanes * 4), widening_mul(y, z)), lanes), + v_intrin("dot_product", x, y, z), + is_uint(y, 8) && is_int(z, 8)) || + + rewrite( + x + h_add(cast(Int(32, lanes * 4), widening_mul(y, z)), lanes), + v_intrin("dot_product", x, z, y), + is_int(y, 8) && is_uint(z, 8)) || + + rewrite( + h_add(cast(Int(32, lanes * 4), widening_mul(x, y)), lanes) + z, + v_intrin("dot_product", z, x, y), + is_uint(x, 8) && is_int(y, 8)) || + + rewrite( + h_add(cast(Int(32, lanes * 4), widening_mul(x, y)), lanes) + z, + v_intrin("dot_product", z, y, x), + is_int(x, 8) && is_uint(y, 8)) || + + // Accumulating pmaddwd. + rewrite( + x + h_add(widening_mul(y, z), lanes), + v_intrin("dot_product", x, y, z), + is_int(y, 16, lanes * 2) && is_int(z, 16, lanes * 2)) || + + rewrite( + h_add(widening_mul(x, y), lanes) + z, + v_intrin("dot_product", z, x, y), + is_int(x, 16, lanes * 2) && is_int(y, 16, lanes * 2)) || + + false)) { + return mutate(rewrite.result); + } + + if ((op->type.lanes() % 4 == 0) && should_use_dot_product(op->a, op->b, matches)) { + Expr ac = Shuffle::make_interleave({matches[0], matches[2]}); + Expr bd = Shuffle::make_interleave({matches[1], matches[3]}); + // We have dot_products for every x86 arch (because SSE2 has it), + // so this is `always` safe (as long as the output type lanes has + // a factor of 4). + return mutate(VectorIntrinsic::make(op->type, "dot_product", {ac, bd})); + } + + return IRMutator::visit(op); + } + + Expr visit(const Sub *op) override { + if (!should_peephole_optimize(op->type)) { + return IRMutator::visit(op); + } + + std::vector matches; + // TODO(rootjalex): same issue as the Add case, lossless_cast and + // lossless_negate are hard to use in rewrite rules. + + if ((op->type.lanes() % 4 == 0) && should_use_dot_product(op->a, op->b, matches)) { + // Negate one of the factors in the second expression + Expr negative_2 = lossless_negate(matches[2]); + Expr negative_3 = lossless_negate(matches[3]); + if (negative_2.defined() || negative_3.defined()) { + if (negative_2.defined()) { + matches[2] = negative_2; + } else { + matches[3] = negative_3; + } + Expr ac = Shuffle::make_interleave({matches[0], matches[2]}); + Expr bd = Shuffle::make_interleave({matches[1], matches[3]}); + // Always safe, see comment in Add case above. + return mutate(VectorIntrinsic::make(op->type, "dot_product", {ac, bd})); + } + } + + return IRMutator::visit(op); + } + + Expr visit(const Cast *op) override { + if (!should_peephole_optimize(op->type)) { + return IRMutator::visit(op); + } + + const int lanes = op->type.lanes(); + + auto rewrite = IRMatcher::rewriter(IRMatcher::cast(op->type, op->value), op->type); + + // TODO: saturating casts should be intrinsics, and supported in IRMatch.h. + const Expr i32_i16max = cast(Int(32, lanes), Int(16).max()); + const Expr i32_i16min = cast(Int(32, lanes), Int(16).min()); + const Expr i16_i8max = cast(Int(16, lanes), Int(8).max()); + const Expr i16_i8min = cast(Int(16, lanes), Int(8).min()); + const Expr i16_u8max = cast(Int(16, lanes), UInt(8).max()); + const Expr i16_u8min = cast(Int(16, lanes), UInt(8).min()); + const Expr i32_u16max = cast(Int(32, lanes), UInt(16).max()); + const Expr i32_u16min = cast(Int(32, lanes), UInt(16).min()); + + if ( + // pmulhrs is supported via AVX2 and SSE41, so SSE41 is the LCD. 
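// (In scalar terms the pmulhrs pattern below is, roughly:
//     int16_t pmulhrs(int16_t a, int16_t b) {
//         return (int16_t)((((int32_t)a * b) + (1 << 14)) >> 15);
//     }
// i.e. a Q15 multiply with round-to-nearest; as noted in the old
// CodeGen_X86 pattern list, it does not saturate the result.)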
(target.has_feature(Target::SSE41) && + rewrite( + cast(Int(16, lanes), rounding_shift_right(widening_mul(x, y), 15)), + v_intrin("pmulhrs", x, y), + is_int(x, 16) && is_int(y, 16))) || + + // saturating_narrow is always supported (via SSE2) for: + // int32 -> int16, int16 -> int8, int16 -> uint8 + rewrite( + cast(Int(16, lanes), max(min(x, i32_i16max), i32_i16min)), + v_intrin("saturating_narrow", x), + is_int(x, 32)) || + + rewrite( + cast(Int(8, lanes), max(min(x, i16_i8max), i16_i8min)), + v_intrin("saturating_narrow", x), + is_int(x, 16)) || + + rewrite( + cast(UInt(8, lanes), max(min(x, i16_u8max), i16_u8min)), + v_intrin("saturating_narrow", x), + is_int(x, 16)) || + + // int32 -> uint16 is supported via SSE41 + (target.has_feature(Target::SSE41) && + rewrite( + cast(UInt(16, lanes), max(min(x, i32_u16max), i32_u16min)), + v_intrin("saturating_narrow", x), + is_int(x, 32))) || + + // f32_to_bf16 is supported only via Target::AVX512_SapphireRapids + (target.has_feature(Target::AVX512_SapphireRapids) && + rewrite( + cast(BFloat(16, lanes), x), + v_intrin("f32_to_bf16", x), + is_float(x, 32))) || + + false) { + return mutate(rewrite.result); + } + + // TODO: should we handle CodeGen_X86's weird 8 -> 16 bit issue here? + + return IRMutator::visit(op); + } + + Expr visit(const Call *op) override { + if (!should_peephole_optimize(op->type)) { + return IRMutator::visit(op); + } + + // TODO: This optimization is hard to do via a rewrite-rule because of lossless_cast. + + // A 16-bit mul-shift-right of less than 16 can sometimes be rounded up to a + // full 16 to use pmulh(u)w by left-shifting one of the operands. This is + // handled here instead of in the lowering of mul_shift_right because it's + // unlikely to be a good idea on platforms other than x86, as it adds an + // extra shift in the fully-lowered case. + if ((op->type.element_of() == UInt(16) || + op->type.element_of() == Int(16)) && + op->is_intrinsic(Call::mul_shift_right)) { + internal_assert(op->args.size() == 3); + const uint64_t *shift = as_const_uint(op->args[2]); + if (shift && *shift < 16 && *shift >= 8) { + Type narrow = op->type.with_bits(8); + Expr narrow_a = lossless_cast(narrow, op->args[0]); + Expr narrow_b = narrow_a.defined() ? Expr() : lossless_cast(narrow, op->args[1]); + int shift_left = 16 - (int)(*shift); + if (narrow_a.defined()) { + return mutate(mul_shift_right(op->args[0] << shift_left, op->args[1], 16)); + } else if (narrow_b.defined()) { + return mutate(mul_shift_right(op->args[0], op->args[1] << shift_left, 16)); + } + } + } + + const int lanes = op->type.lanes(); + const int bits = op->type.bits(); + + auto rewrite = IRMatcher::rewriter(op, op->type); + + Type unsigned_type = op->type.with_code(halide_type_uint); + auto x_uint = cast(unsigned_type, x); + auto y_uint = cast(unsigned_type, y); + + if ( + // We can redirect signed rounding halving add to unsigned rounding + // halving add by adding 128 / 32768 to the result if the sign of the + // args differs. + ((op->type.is_int() && bits <= 16) && + rewrite( + rounding_halving_add(x, y), + cast(op->type, rounding_halving_add(x_uint, y_uint) + ((x_uint ^ y_uint) & (1 << (bits - 1)))))) || + + // On x86, there are many 3-instruction sequences to compute absd of + // unsigned integers. This one consists solely of instructions with + // throughput of 3 ops per cycle on Cannon Lake. 
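// Why this works (sketch): for unsigned x and y, saturating_sub(x, y) is
// max(x - y, 0), so one of the two subtractions is zero and the other is
// |x - y|; OR-ing them therefore yields absd. E.g. x = 3, y = 10:
//     saturating_sub(3, 10) = 0, saturating_sub(10, 3) = 7, 0 | 7 = 7.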
+ // + // Solution due to Wojciech Mula: + // http://0x80.pl/notesen/2018-03-11-sse-abs-unsigned.html + (op->type.is_uint() && + rewrite( + absd(x, y), + saturating_sub(x, y) | saturating_sub(y, x))) || + + // Current best way to lower absd on x86. + (op->type.is_int() && + rewrite( + absd(x, y), + max(x, y) - min(x, y))) || + + // pmulh is always supported (via SSE2). + ((op->type.is_int_or_uint() && bits == 16) && + rewrite( + mul_shift_right(x, y, 16), + v_intrin("pmulh", x, y))) || + + // saturating_pmulhrs is supported via SSE41 + ((target.has_feature(Target::SSE41) && + op->type.is_int() && bits == 16) && + rewrite( + rounding_mul_shift_right(x, y, 15), + v_intrin("saturating_pmulhrs", x, y))) || + + // TODO(rootjalex): The following intrinsics are + // simply one-to-one mappings, should they even + // be handled here? + + // int(8 | 16 | 32) -> uint is supported via SSE41 + // float is always supported (via SSE2). + (((target.has_feature(Target::SSE41) && bits <= 32) || + op->type.is_float()) && + rewrite( + abs(x), + v_intrin("abs", x))) || + + // saturating ops for 8 and 16 bits are always supported (via SSE2). + ((bits == 8 || bits == 16) && + (rewrite( + saturating_add(x, y), + v_intrin("saturating_add", x, y)) || + rewrite( + saturating_sub(x, y), + v_intrin("saturating_sub", x, y)))) || + + // pavg ops for 8 and 16 bits are always supported (via SSE2). + ((op->type.is_uint() && (bits == 8 || bits == 16)) && + rewrite( + rounding_halving_add(x, y), + v_intrin("rounding_halving_add", x, y))) || + + // int16 -> int32 widening_mul has a (v)pmaddwd implementation. + // always supported (via SSE2). + ((op->type.is_int() && (bits == 32)) && + rewrite( + widening_mul(x, y), + v_intrin("widening_mul", x, y), + is_int(x, 16) && is_int(y, 16))) || + + (target.has_feature(Target::AVX512_SapphireRapids) && + (op->type.is_int() && (bits == 32)) && + // SapphireRapids accumulating dot products. + (rewrite( + saturating_add(x, h_satadd(cast(Int(32, lanes * 4), widening_mul(y, z)) , lanes)), + v_intrin("saturating_dot_product", x, y, z), + is_uint(y, 8) && is_int(z, 8)) || + + rewrite( + saturating_add(x, h_satadd(cast(Int(32, lanes * 4), widening_mul(y, z)) , lanes)), + v_intrin("saturating_dot_product", x, z, y), + is_int(y, 8) && is_uint(z, 8)) || + + rewrite( + saturating_add(x, h_satadd(cast(Int(32, lanes * 2), widening_mul(y, z)) , lanes)), + v_intrin("saturating_dot_product", x, y, z), + is_uint(y, 8) && is_int(z, 8)) || + + rewrite( + saturating_add(x, h_satadd(cast(Int(32, lanes * 2), widening_mul(y, z)) , lanes)), + v_intrin("saturating_dot_product", x, z, y), + is_int(y, 8) && is_uint(z, 8)) || + + rewrite( + saturating_add(x, h_satadd(widening_mul(y, z) , lanes)), + v_intrin("saturating_dot_product", x, z, y), + is_int(y, 16, lanes * 2) && is_int(z, 16, lanes * 2)) || + + false)) || + + false) { + return mutate(rewrite.result); + } + + // Fixed-point intrinsics should be lowered here. + // This is safe because this mutator is top-down. + if (op->is_intrinsic({ + Call::halving_add, + Call::halving_sub, + Call::mul_shift_right, + Call::rounding_halving_add, + Call::rounding_mul_shift_right, + Call::rounding_shift_left, + Call::rounding_shift_right, + Call::saturating_add, + Call::saturating_sub, + Call::sorted_avg, + Call::widening_add, + Call::widening_mul, + Call::widening_shift_left, + Call::widening_shift_right, + Call::widening_sub, + })) { + // TODO: Should we have a base-class that does this + the VectorReduce lowering needed below? 
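// (Sketch of the effect: a fixed-point intrinsic that no rule above
// claimed is expanded into plain IR here, so later mutations and codegen
// see ordinary arithmetic. For example, on a path with no pavgw mapping,
//     rounding_halving_add(x, y)
// lowers to something equivalent to (x >> 1) + (y >> 1) + ((x | y) & 1).)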
+ return mutate(lower_intrinsic(op)); + } + + return IRMutator::visit(op); + } + + Expr visit(const VectorReduce *op) override { + // FIXME: We need to split up VectorReduce nodes in the same way that + // CodeGen_LLVM::codegen_vector_reduce does, in order to do all + // matching here. + if ((op->op != VectorReduce::Add && op->op != VectorReduce::SaturatingAdd) || + !should_peephole_optimize(op->type)) { + return IRMutator::visit(op); + } + + const int lanes = op->type.lanes(); + const int value_lanes = op->value.type().lanes(); + const int factor = value_lanes / lanes; + Expr value = op->value; + + switch (op->op) { + case VectorReduce::Add: { + auto rewrite = IRMatcher::rewriter(IRMatcher::h_add(value, lanes), op->type); + auto x_is_int_or_uint = is_int(x) || is_uint(x); + auto y_is_int_or_uint = is_int(y) || is_uint(y); + if ( + // 2-way dot-products, int16 -> int32 is always supported (via SSE2). + ((factor == 2) && + (rewrite( + h_add(cast(Int(32, value_lanes), widening_mul(x, y)), lanes), + v_intrin("dot_product", cast(Int(16, value_lanes), x), cast(Int(16, value_lanes), y)), + x_is_int_or_uint && y_is_int_or_uint) || + + // Horizontal widening add via pmaddwd + rewrite( + h_add(cast(Int(32, value_lanes), x), lanes), + v_intrin("dot_product", x, make_const(Int(16, value_lanes), 1)), + is_int(x, 16)) || + + (rewrite( + h_add(widening_mul(x, y), lanes), + v_intrin("dot_product", x, y), + is_int(x, 16) && is_int(y, 16))) || + + // pmaddub supported via SSE41 + (target.has_feature(Target::SSE41) && + // Horizontal widening adds using 2-way saturating dot products. + (rewrite( + h_add(cast(UInt(16, value_lanes), x), lanes), + cast(UInt(16, lanes), typed(Int(16, lanes), v_intrin("saturating_dot_product", x, make_const(Int(8, value_lanes), 1)))), + is_uint(x, 8)) || + + rewrite( + h_add(cast(Int(16, value_lanes), x), lanes), + v_intrin("saturating_dot_product", x, make_const(Int(8, value_lanes), 1)), + is_uint(x, 8)) || + + rewrite( + h_add(cast(Int(16, value_lanes), x), lanes), + v_intrin("saturating_dot_product", make_const(UInt(8, value_lanes), 1), x), + is_int(x, 8)) || + + // SSE41 and AVX2 support horizontal_add via phadd intrinsics. + rewrite( + h_add(x, lanes), + v_intrin("horizontal_add", x), + is_int(x, 16, lanes * 2) || is_uint(x, 16, lanes * 2) || + is_int(x, 32, lanes * 2) || is_uint(x, 32, lanes * 2)) || + + // TODO: add in Andrew's psadbw pattern. + + false)) || + + false))) { + return mutate(rewrite.result); + } + break; + } + case VectorReduce::SaturatingAdd: { + auto rewrite = IRMatcher::rewriter(IRMatcher::h_satadd(value, lanes), op->type); + if ( + // Saturating dot products are supported via SSE41 and AVX2. + ((factor == 2) && target.has_feature(Target::SSE41) && + (rewrite( + h_satadd(widening_mul(x, y), lanes), + v_intrin("saturating_dot_product", x, y), + is_uint(x, 8) && is_int(y, 8)) || + + rewrite( + h_satadd(widening_mul(x, y), lanes), + v_intrin("saturating_dot_product", y, x), + is_int(x, 8) && is_uint(y, 8)) || + + false))) { + return mutate(rewrite.result); + } + break; + } + default: + break; + } + + // FIXME: We need to split up VectorReduce nodes in the same way that + // CodeGen_LLVM::codegen_vector_reduce does, in order to do all + // matching here. 
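// (Illustration of the gap: CodeGen_LLVM::codegen_vector_reduce splits,
// say, a factor-8 reduction h_add(v_i32x64, 8) into a tree of 2-way
// stages, each of which could match the factor == 2 patterns above;
// until the same splitting happens here, only reductions that already
// arrive with factor 2 (or 4 for the SapphireRapids rules) are caught.)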
+ + return IRMutator::visit(op); + } + +private: + const Target ⌖ + + IRMatcher::Wild<0> x; + IRMatcher::Wild<1> y; + IRMatcher::Wild<2> z; +}; + +} + + + +Stmt optimize_x86_instructions(Stmt s, const Target &t) { + s = Optimize_X86(complete_x86_target(t)).mutate(s); + // Some of the rules above can introduce repeated sub-terms, so run CSE again. + s = common_subexpression_elimination(s); + return s; +} + +#else // WITH_X86 + +Stmt optimize_x86_instructions(Stmt s, const Target &t) { + user_error << "x86 not enabled for this build of Halide.\n"; + return Stmt(); +} + + +#endif // WITH_X86 + +} // namespace Internal +} // namespace Halide + + diff --git a/src/X86Optimize.h b/src/X86Optimize.h new file mode 100644 index 000000000000..bcf375cae1d5 --- /dev/null +++ b/src/X86Optimize.h @@ -0,0 +1,22 @@ +#ifndef HALIDE_IR_X86_OPTIMIZE_H +#define HALIDE_IR_X86_OPTIMIZE_H + +/** \file + * Tools for optimizing IR for x86. + */ + +#include "Expr.h" +#include "Target.h" + +namespace Halide { +namespace Internal { + +/** Perform vector instruction selection, inserting VectorIntrinsic nodes. */ +Stmt optimize_x86_instructions(Stmt s, const Target &t); + +Target complete_x86_target(Target t); + +} // namespace Internal +} // namespace Halide + +#endif From 09193f493395bdca9263f59e4ee9f5093dfbd238 Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Mon, 25 Jul 2022 17:43:09 -0400 Subject: [PATCH 04/55] fix typo --- src/StmtToHtml.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/StmtToHtml.cpp b/src/StmtToHtml.cpp index 1f5b1d20ccff..ceddcabec83d 100644 --- a/src/StmtToHtml.cpp +++ b/src/StmtToHtml.cpp @@ -713,7 +713,7 @@ class StmtToHtml : public IRVisitor { } void visit(const VectorIntrinsic *op) override { - stream << open_span("VectoIntrinsic"); + stream << open_span("VectorIntrinsic"); stream << open_span("Type") << op->type << close_span(); print_list(symbol("vector_intrinsic") + "(\"" + op->name + "\"", op->args, ")"); stream << close_span(); From 24f74a9a2732b80c44c52e323c69a795aabcaf87 Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Mon, 25 Jul 2022 17:49:22 -0400 Subject: [PATCH 05/55] clang-format --- src/CodeGen_LLVM.cpp | 3 +- src/IRMatch.h | 6 +- src/X86Optimize.cpp | 284 +++++++++++++++++++++---------------------- 3 files changed, 143 insertions(+), 150 deletions(-) diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp index 4990ba6e78c1..14db16dba353 100644 --- a/src/CodeGen_LLVM.cpp +++ b/src/CodeGen_LLVM.cpp @@ -4001,7 +4001,8 @@ void CodeGen_LLVM::visit(const Shuffle *op) { } void CodeGen_LLVM::visit(const VectorIntrinsic *op) { - internal_error << "CodeGen_LLVM received VectorIntrinsic node, should be handled by architecture-specific CodeGen class:\n" << Expr(op) << "\n"; + internal_error << "CodeGen_LLVM received VectorIntrinsic node, should be handled by architecture-specific CodeGen class:\n" + << Expr(op) << "\n"; } void CodeGen_LLVM::visit(const VectorReduce *op) { diff --git a/src/IRMatch.h b/src/IRMatch.h index cc4a4b490664..457744f87a21 100644 --- a/src/IRMatch.h +++ b/src/IRMatch.h @@ -1586,7 +1586,7 @@ auto bitwise_xor(A &&a, B &&b) noexcept -> Intrin -HALIDE_ALWAYS_INLINE auto operator^(A &&a, B &&b) noexcept -> auto { +HALIDE_ALWAYS_INLINE auto operator^(A &&a, B &&b) noexcept -> auto{ assert_is_lvalue_if_expr(); assert_is_lvalue_if_expr(); return bitwise_xor(a, b); @@ -1596,7 +1596,7 @@ auto bitwise_and(A &&a, B &&b) noexcept -> Intrin -HALIDE_ALWAYS_INLINE auto operator&(A &&a, B &&b) noexcept -> auto { +HALIDE_ALWAYS_INLINE auto 
operator&(A &&a, B &&b) noexcept -> auto{ assert_is_lvalue_if_expr(); assert_is_lvalue_if_expr(); return bitwise_and(a, b); @@ -1606,7 +1606,7 @@ auto bitwise_or(A &&a, B &&b) noexcept -> Intrin -HALIDE_ALWAYS_INLINE auto operator|(A &&a, B &&b) noexcept -> auto { +HALIDE_ALWAYS_INLINE auto operator|(A &&a, B &&b) noexcept -> auto{ assert_is_lvalue_if_expr(); assert_is_lvalue_if_expr(); return bitwise_or(a, b); diff --git a/src/X86Optimize.cpp b/src/X86Optimize.cpp index 3a1d3df3b6f8..16bad9be4413 100644 --- a/src/X86Optimize.cpp +++ b/src/X86Optimize.cpp @@ -84,20 +84,16 @@ bool should_use_dot_product(const Expr &a, const Expr &b, std::vector &res return false; } -// // Templated saturating casts for use in rewrite rules. -// template -// auto saturating_cast() - -/** A code generator that replaces Halide IR with VectorIntrinsics specific to x86. */ +/** A top-down code optimizer that replaces Halide IR with VectorIntrinsics specific to x86. */ class Optimize_X86 : public IRMutator { public: - /** Create an x86 code generator. Processor features can be + /** Create an x86 code optimizer. Processor features can be * enabled using the appropriate flags in the target struct. */ - Optimize_X86(const Target &t) : target(t) { + Optimize_X86(const Target &t) + : target(t) { } protected: - bool should_peephole_optimize(const Type &type) { // We only have peephole optimizations for vectors here. // FIXME: should we only optimize vectors that are multiples of the native vector width? @@ -135,35 +131,35 @@ class Optimize_X86 : public IRMutator { // Accumulating pmaddubsw (rewrite( - x + h_add(cast(Int(32, lanes * 4), widening_mul(y, z)), lanes), - v_intrin("dot_product", x, y, z), - is_uint(y, 8) && is_int(z, 8)) || + x + h_add(cast(Int(32, lanes * 4), widening_mul(y, z)), lanes), + v_intrin("dot_product", x, y, z), + is_uint(y, 8) && is_int(z, 8)) || rewrite( - x + h_add(cast(Int(32, lanes * 4), widening_mul(y, z)), lanes), - v_intrin("dot_product", x, z, y), - is_int(y, 8) && is_uint(z, 8)) || + x + h_add(cast(Int(32, lanes * 4), widening_mul(y, z)), lanes), + v_intrin("dot_product", x, z, y), + is_int(y, 8) && is_uint(z, 8)) || rewrite( - h_add(cast(Int(32, lanes * 4), widening_mul(x, y)), lanes) + z, - v_intrin("dot_product", z, x, y), - is_uint(x, 8) && is_int(y, 8)) || + h_add(cast(Int(32, lanes * 4), widening_mul(x, y)), lanes) + z, + v_intrin("dot_product", z, x, y), + is_uint(x, 8) && is_int(y, 8)) || rewrite( - h_add(cast(Int(32, lanes * 4), widening_mul(x, y)), lanes) + z, - v_intrin("dot_product", z, y, x), - is_int(x, 8) && is_uint(y, 8)) || + h_add(cast(Int(32, lanes * 4), widening_mul(x, y)), lanes) + z, + v_intrin("dot_product", z, y, x), + is_int(x, 8) && is_uint(y, 8)) || // Accumulating pmaddwd. rewrite( - x + h_add(widening_mul(y, z), lanes), - v_intrin("dot_product", x, y, z), - is_int(y, 16, lanes * 2) && is_int(z, 16, lanes * 2)) || - + x + h_add(widening_mul(y, z), lanes), + v_intrin("dot_product", x, y, z), + is_int(y, 16, lanes * 2) && is_int(z, 16, lanes * 2)) || + rewrite( - h_add(widening_mul(x, y), lanes) + z, - v_intrin("dot_product", z, x, y), - is_int(x, 16, lanes * 2) && is_int(y, 16, lanes * 2)) || + h_add(widening_mul(x, y), lanes) + z, + v_intrin("dot_product", z, x, y), + is_int(x, 16, lanes * 2) && is_int(y, 16, lanes * 2)) || false)) { return mutate(rewrite.result); @@ -233,9 +229,9 @@ class Optimize_X86 : public IRMutator { // pmulhrs is supported via AVX2 and SSE41, so SSE41 is the LCD. 
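// pmulhrs ("packed multiply high with round and scale") is exactly the
// cast(Int(16, lanes), rounding_shift_right(widening_mul(x, y), 15))
// pattern matched below: take the 32-bit product, shift right by 14, add
// one, shift right once more. A scalar sketch of the per-lane semantics
// (illustrative only):
#include <cstdint>

int16_t pmulhrs_model(int16_t a, int16_t b) {
    int32_t prod = int32_t(a) * int32_t(b);
    int32_t rounded = ((prod >> 14) + 1) >> 1;  // == (prod + (1 << 14)) >> 15
    // Only a == b == -32768 produces 32768, which does not fit in int16
    // and wraps to -32768, matching the hardware behavior.
    return int16_t(rounded);
}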
(target.has_feature(Target::SSE41) && rewrite( - cast(Int(16, lanes), rounding_shift_right(widening_mul(x, y), 15)), - v_intrin("pmulhrs", x, y), - is_int(x, 16) && is_int(y, 16))) || + cast(Int(16, lanes), rounding_shift_right(widening_mul(x, y), 15)), + v_intrin("pmulhrs", x, y), + is_int(x, 16) && is_int(y, 16))) || // saturating_narrow is always supported (via SSE2) for: // int32 -> int16, int16 -> int8, int16 -> uint8 @@ -257,16 +253,16 @@ class Optimize_X86 : public IRMutator { // int32 -> uint16 is supported via SSE41 (target.has_feature(Target::SSE41) && rewrite( - cast(UInt(16, lanes), max(min(x, i32_u16min), i32_u16min)), - v_intrin("saturating_narrow", x), - is_int(x, 32))) || + cast(UInt(16, lanes), max(min(x, i32_u16min), i32_u16min)), + v_intrin("saturating_narrow", x), + is_int(x, 32))) || // f32_to_bf16 is supported only via Target::AVX512_SapphireRapids (target.has_feature(Target::AVX512_SapphireRapids) && rewrite( - cast(BFloat(16, lanes), x), - v_intrin("f32_to_bf16", x), - is_float(x, 32))) || + cast(BFloat(16, lanes), x), + v_intrin("f32_to_bf16", x), + is_float(x, 32))) || false) { return mutate(rewrite.result); @@ -290,7 +286,7 @@ class Optimize_X86 : public IRMutator { // unlikely to be a good idea on platforms other than x86, as it adds an // extra shift in the fully-lowered case. if ((op->type.element_of() == UInt(16) || - op->type.element_of() == Int(16)) && + op->type.element_of() == Int(16)) && op->is_intrinsic(Call::mul_shift_right)) { internal_assert(op->args.size() == 3); const uint64_t *shift = as_const_uint(op->args[2]); @@ -322,8 +318,9 @@ class Optimize_X86 : public IRMutator { // args differs. ((op->type.is_int() && bits <= 16) && rewrite( - rounding_halving_add(x, y), - cast(op->type, rounding_halving_add(x_uint, y_uint) + ((x_uint ^ y_uint) & (1 << (bits - 1)))))) || + rounding_halving_add(x, y), + cast(op->type, rounding_halving_add(x_uint, y_uint) + + ((x_uint ^ y_uint) & (1 << (bits - 1)))))) || // On x86, there are many 3-instruction sequences to compute absd of // unsigned integers. This one consists solely of instructions with @@ -333,27 +330,27 @@ class Optimize_X86 : public IRMutator { // http://0x80.pl/notesen/2018-03-11-sse-abs-unsigned.html (op->type.is_uint() && rewrite( - absd(x, y), - saturating_sub(x, y) | saturating_sub(y, x))) || + absd(x, y), + saturating_sub(x, y) | saturating_sub(y, x))) || // Current best way to lower absd on x86. (op->type.is_int() && rewrite( - absd(x, y), - max(x, y) - min(x, y))) || + absd(x, y), + max(x, y) - min(x, y))) || // pmulh is always supported (via SSE2). ((op->type.is_int_or_uint() && bits == 16) && rewrite( - mul_shift_right(x, y, 16), - v_intrin("pmulh", x, y))) || + mul_shift_right(x, y, 16), + v_intrin("pmulh", x, y))) || // saturating_pmulhrs is supported via SSE41 ((target.has_feature(Target::SSE41) && op->type.is_int() && bits == 16) && rewrite( - rounding_mul_shift_right(x, y, 15), - v_intrin("saturating_pmulhrs", x, y))) || + rounding_mul_shift_right(x, y, 15), + v_intrin("saturating_pmulhrs", x, y))) || // TODO(rootjalex): The following intrinsics are // simply one-to-one mappings, should they even @@ -364,61 +361,61 @@ class Optimize_X86 : public IRMutator { (((target.has_feature(Target::SSE41) && bits <= 32) || op->type.is_float()) && rewrite( - abs(x), - v_intrin("abs", x))) || + abs(x), + v_intrin("abs", x))) || // saturating ops for 8 and 16 bits are always supported (via SSE2). 
((bits == 8 || bits == 16) && (rewrite( - saturating_add(x, y), - v_intrin("saturating_add", x, y)) || - rewrite( - saturating_sub(x, y), - v_intrin("saturating_sub", x, y)))) || + saturating_add(x, y), + v_intrin("saturating_add", x, y)) || + rewrite( + saturating_sub(x, y), + v_intrin("saturating_sub", x, y)))) || // pavg ops for 8 and 16 bits are always supported (via SSE2). - ((op->type.is_uint() && (bits == 8 || bits == 16)) && + ((op->type.is_uint() && (bits == 8 || bits == 16)) && rewrite( - rounding_halving_add(x, y), - v_intrin("rounding_halving_add", x, y))) || + rounding_halving_add(x, y), + v_intrin("rounding_halving_add", x, y))) || // int16 -> int32 widening_mul has a (v)pmaddwd implementation. // always supported (via SSE2). ((op->type.is_int() && (bits == 32)) && rewrite( - widening_mul(x, y), - v_intrin("widening_mul", x, y), - is_int(x, 16) && is_int(y, 16))) || + widening_mul(x, y), + v_intrin("widening_mul", x, y), + is_int(x, 16) && is_int(y, 16))) || (target.has_feature(Target::AVX512_SapphireRapids) && (op->type.is_int() && (bits == 32)) && // SapphireRapids accumulating dot products. (rewrite( - saturating_add(x, h_satadd(cast(Int(32, lanes * 4), widening_mul(y, z)) , lanes)), - v_intrin("saturating_dot_product", x, y, z), - is_uint(y, 8) && is_int(z, 8)) || + saturating_add(x, h_satadd(cast(Int(32, lanes * 4), widening_mul(y, z)), lanes)), + v_intrin("saturating_dot_product", x, y, z), + is_uint(y, 8) && is_int(z, 8)) || rewrite( - saturating_add(x, h_satadd(cast(Int(32, lanes * 4), widening_mul(y, z)) , lanes)), - v_intrin("saturating_dot_product", x, z, y), - is_int(y, 8) && is_uint(z, 8)) || - + saturating_add(x, h_satadd(cast(Int(32, lanes * 4), widening_mul(y, z)), lanes)), + v_intrin("saturating_dot_product", x, z, y), + is_int(y, 8) && is_uint(z, 8)) || + rewrite( - saturating_add(x, h_satadd(cast(Int(32, lanes * 2), widening_mul(y, z)) , lanes)), - v_intrin("saturating_dot_product", x, y, z), - is_uint(y, 8) && is_int(z, 8)) || + saturating_add(x, h_satadd(cast(Int(32, lanes * 2), widening_mul(y, z)), lanes)), + v_intrin("saturating_dot_product", x, y, z), + is_uint(y, 8) && is_int(z, 8)) || rewrite( - saturating_add(x, h_satadd(cast(Int(32, lanes * 2), widening_mul(y, z)) , lanes)), - v_intrin("saturating_dot_product", x, z, y), - is_int(y, 8) && is_uint(z, 8)) || - + saturating_add(x, h_satadd(cast(Int(32, lanes * 2), widening_mul(y, z)), lanes)), + v_intrin("saturating_dot_product", x, z, y), + is_int(y, 8) && is_uint(z, 8)) || + rewrite( - saturating_add(x, h_satadd(widening_mul(y, z) , lanes)), - v_intrin("saturating_dot_product", x, z, y), - is_int(y, 16, lanes * 2) && is_int(z, 16, lanes * 2)) || - - false)) || + saturating_add(x, h_satadd(widening_mul(y, z), lanes)), + v_intrin("saturating_dot_product", x, z, y), + is_int(y, 16, lanes * 2) && is_int(z, 16, lanes * 2)) || + + false)) || false) { return mutate(rewrite.result); @@ -427,21 +424,21 @@ class Optimize_X86 : public IRMutator { // Fixed-point intrinsics should be lowered here. // This is safe because this mutator is top-down. 
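// The unsigned rounding_halving_add rule above maps directly onto
// pavgb/pavgw; the signed rewrite earlier in this visitor reduces the
// signed case to the unsigned one by reinterpreting the bits and adding a
// sign-bit correction. A sketch verifying that identity exhaustively for
// int8 (illustrative only):
#include <cassert>
#include <cstdint>

int main() {
    for (int a = -128; a <= 127; a++) {
        for (int b = -128; b <= 127; b++) {
            int8_t ref = int8_t((a + b + 1) >> 1);  // signed rounding halving add
            uint8_t ua = uint8_t(a), ub = uint8_t(b);
            uint8_t pavg = uint8_t((ua + ub + 1) >> 1);  // unsigned pavg semantics
            // Correction term ((x_uint ^ y_uint) & (1 << (bits - 1))) from the rule:
            uint8_t fixed = uint8_t(pavg + ((ua ^ ub) & 0x80));
            assert(int8_t(fixed) == ref);
        }
    }
    return 0;
}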
if (op->is_intrinsic({ - Call::halving_add, - Call::halving_sub, - Call::mul_shift_right, - Call::rounding_halving_add, - Call::rounding_mul_shift_right, - Call::rounding_shift_left, - Call::rounding_shift_right, - Call::saturating_add, - Call::saturating_sub, - Call::sorted_avg, - Call::widening_add, - Call::widening_mul, - Call::widening_shift_left, - Call::widening_shift_right, - Call::widening_sub, + Call::halving_add, + Call::halving_sub, + Call::mul_shift_right, + Call::rounding_halving_add, + Call::rounding_mul_shift_right, + Call::rounding_shift_left, + Call::rounding_shift_right, + Call::saturating_add, + Call::saturating_sub, + Call::sorted_avg, + Call::widening_add, + Call::widening_mul, + Call::widening_shift_left, + Call::widening_shift_right, + Call::widening_sub, })) { // TODO: Should we have a base-class that does this + the VectorReduce lowering needed below? return mutate(lower_intrinsic(op)); @@ -473,51 +470,51 @@ class Optimize_X86 : public IRMutator { // 2-way dot-products, int16 -> int32 is always supported (via SSE2). ((factor == 2) && (rewrite( - h_add(cast(Int(32, value_lanes), widening_mul(x, y)), lanes), - v_intrin("dot_product", cast(Int(16, value_lanes), x), cast(Int(16, value_lanes), y)), - x_is_int_or_uint && y_is_int_or_uint) || - - // Horizontal widening add via pmaddwd - rewrite( - h_add(cast(Int(32, value_lanes), x), lanes), - v_intrin("dot_product", x, make_const(Int(16, value_lanes), 1)), - is_int(x, 16)) || + h_add(cast(Int(32, value_lanes), widening_mul(x, y)), lanes), + v_intrin("dot_product", cast(Int(16, value_lanes), x), cast(Int(16, value_lanes), y)), + x_is_int_or_uint && y_is_int_or_uint) || + + // Horizontal widening add via pmaddwd + rewrite( + h_add(cast(Int(32, value_lanes), x), lanes), + v_intrin("dot_product", x, make_const(Int(16, value_lanes), 1)), + is_int(x, 16)) || - (rewrite( - h_add(widening_mul(x, y), lanes), - v_intrin("dot_product", x, y), - is_int(x, 16) && is_int(y, 16))) || - - // pmaddub supported via SSE41 - (target.has_feature(Target::SSE41) && - // Horizontal widening adds using 2-way saturating dot products. (rewrite( - h_add(cast(UInt(16, value_lanes), x), lanes), - cast(UInt(16, lanes), typed(Int(16, lanes), v_intrin("saturating_dot_product", x, make_const(Int(8, value_lanes), 1)))), - is_uint(x, 8)) || - - rewrite( - h_add(cast(Int(16, value_lanes), x), lanes), - v_intrin("saturating_dot_product", x, make_const(Int(8, value_lanes), 1)), - is_uint(x, 8)) || - - rewrite( - h_add(cast(Int(16, value_lanes), x), lanes), - v_intrin("saturating_dot_product", make_const(UInt(8, value_lanes), 1), x), - is_int(x, 8)) || - - // SSE41 and AVX2 support horizontal_add via phadd intrinsics. - rewrite( - h_add(x, lanes), - v_intrin("horizontal_add", x), - is_int(x, 16, lanes * 2) || is_uint(x, 16, lanes * 2) || - is_int(x, 32, lanes * 2) || is_uint(x, 32, lanes * 2)) || + h_add(widening_mul(x, y), lanes), + v_intrin("dot_product", x, y), + is_int(x, 16) && is_int(y, 16))) || + + // pmaddub supported via SSE41 + (target.has_feature(Target::SSE41) && + // Horizontal widening adds using 2-way saturating dot products. 
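// The horizontal widening adds in this section reuse the dot-product
// instructions against a constant vector of ones: pmaddwd(x, {1, 1, ...})
// sums each adjacent pair of lanes, which is exactly h_add of the widened
// input. Scalar model of the identity being exploited (illustrative only):
#include <cstdint>
#include <vector>

std::vector<int32_t> h_widening_add_pairs(const std::vector<int16_t> &x) {
    std::vector<int32_t> out(x.size() / 2);
    for (size_t i = 0; i < out.size(); i++) {
        // dot product against ones: x[2i]*1 + x[2i+1]*1
        out[i] = int32_t(x[2 * i]) + int32_t(x[2 * i + 1]);
    }
    return out;
}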
+ (rewrite( + h_add(cast(UInt(16, value_lanes), x), lanes), + cast(UInt(16, lanes), typed(Int(16, lanes), v_intrin("saturating_dot_product", x, make_const(Int(8, value_lanes), 1)))), + is_uint(x, 8)) || + + rewrite( + h_add(cast(Int(16, value_lanes), x), lanes), + v_intrin("saturating_dot_product", x, make_const(Int(8, value_lanes), 1)), + is_uint(x, 8)) || + + rewrite( + h_add(cast(Int(16, value_lanes), x), lanes), + v_intrin("saturating_dot_product", make_const(UInt(8, value_lanes), 1), x), + is_int(x, 8)) || + + // SSE41 and AVX2 support horizontal_add via phadd intrinsics. + rewrite( + h_add(x, lanes), + v_intrin("horizontal_add", x), + is_int(x, 16, lanes * 2) || is_uint(x, 16, lanes * 2) || + is_int(x, 32, lanes * 2) || is_uint(x, 32, lanes * 2)) || // TODO: add in Andrew's psadbw pattern. - false)) || + false)) || - false))) { + false))) { return mutate(rewrite.result); } break; @@ -528,16 +525,16 @@ class Optimize_X86 : public IRMutator { // Saturating dot products are supported via SSE41 and AVX2. ((factor == 2) && target.has_feature(Target::SSE41) && (rewrite( - h_satadd(widening_mul(x, y), lanes), - v_intrin("saturating_dot_product", x, y), - is_uint(x, 8) && is_int(y, 8)) || + h_satadd(widening_mul(x, y), lanes), + v_intrin("saturating_dot_product", x, y), + is_uint(x, 8) && is_int(y, 8)) || rewrite( - h_satadd(widening_mul(x, y), lanes), - v_intrin("saturating_dot_product", y, x), - is_int(x, 8) && is_uint(y, 8)) || + h_satadd(widening_mul(x, y), lanes), + v_intrin("saturating_dot_product", y, x), + is_int(x, 8) && is_uint(y, 8)) || - false))) { + false))) { return mutate(rewrite.result); } break; @@ -561,9 +558,7 @@ class Optimize_X86 : public IRMutator { IRMatcher::Wild<2> z; }; -} - - +} // namespace Stmt optimize_x86_instructions(Stmt s, const Target &t) { s = Optimize_X86(complete_x86_target(t)).mutate(s); @@ -579,10 +574,7 @@ Stmt optimize_x86_instructions(Stmt s, const Target &t) { return Stmt(); } - #endif // WITH_X86 } // namespace Internal } // namespace Halide - - From 58ff01ba76aa782080a7419b1b92a3f01b8ff5ba Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Mon, 25 Jul 2022 18:09:16 -0400 Subject: [PATCH 06/55] add VectorIntrinsic comment --- src/IR.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/IR.h b/src/IR.h index d732d1a43ea7..ad1ad2437123 100644 --- a/src/IR.h +++ b/src/IR.h @@ -886,6 +886,9 @@ struct Atomic : public StmtNode { static const IRNodeType _node_type = IRNodeType::Atomic; }; +/** Represent a target-specific vector instruction. + * Intrinsic may not be element-wise operation, i.e. + * dot_products. 
*/ struct VectorIntrinsic : public ExprNode { std::string name; std::vector args; From 0d30b56cf720d76a602e6652e84fdf62c1b12aa6 Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Mon, 25 Jul 2022 18:11:22 -0400 Subject: [PATCH 07/55] format --- src/IRMatch.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/IRMatch.h b/src/IRMatch.h index 457744f87a21..cc4a4b490664 100644 --- a/src/IRMatch.h +++ b/src/IRMatch.h @@ -1586,7 +1586,7 @@ auto bitwise_xor(A &&a, B &&b) noexcept -> Intrin -HALIDE_ALWAYS_INLINE auto operator^(A &&a, B &&b) noexcept -> auto{ +HALIDE_ALWAYS_INLINE auto operator^(A &&a, B &&b) noexcept -> auto { assert_is_lvalue_if_expr(); assert_is_lvalue_if_expr(); return bitwise_xor(a, b); @@ -1596,7 +1596,7 @@ auto bitwise_and(A &&a, B &&b) noexcept -> Intrin -HALIDE_ALWAYS_INLINE auto operator&(A &&a, B &&b) noexcept -> auto{ +HALIDE_ALWAYS_INLINE auto operator&(A &&a, B &&b) noexcept -> auto { assert_is_lvalue_if_expr(); assert_is_lvalue_if_expr(); return bitwise_and(a, b); @@ -1606,7 +1606,7 @@ auto bitwise_or(A &&a, B &&b) noexcept -> Intrin -HALIDE_ALWAYS_INLINE auto operator|(A &&a, B &&b) noexcept -> auto{ +HALIDE_ALWAYS_INLINE auto operator|(A &&a, B &&b) noexcept -> auto { assert_is_lvalue_if_expr(); assert_is_lvalue_if_expr(); return bitwise_or(a, b); From e9029a261d9e6f7bd2f544879b6e9583d9d51548 Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Mon, 25 Jul 2022 23:31:28 -0400 Subject: [PATCH 08/55] add missing horizontal_add x86Intrinsics --- src/CodeGen_X86.cpp | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/CodeGen_X86.cpp b/src/CodeGen_X86.cpp index 0e99407726b8..5d902e866115 100644 --- a/src/CodeGen_X86.cpp +++ b/src/CodeGen_X86.cpp @@ -159,6 +159,16 @@ const x86Intrinsic intrinsic_defs[] = { // LLVM does not provide an unmasked 128bit cvtneps2bf16 intrinsic, so provide a wrapper around the masked version. {"vcvtneps2bf16x4", BFloat(16, 4), "f32_to_bf16", {Float(32, 4)}, Target::AVX512_SapphireRapids}, + // Horizontal adds that use (v)phadd(w | d). + {"phaddw_sse3", UInt(16, 8), "horizontal_add", {UInt(16, 16)}, Target::SSE41}, + {"phaddw_sse3", Int(16, 8), "horizontal_add", {Int(16, 16)}, Target::SSE41}, + {"phaddw_avx2", UInt(16, 16), "horizontal_add", {UInt(16, 32)}, Target::AVX2}, + {"phaddw_avx2", Int(16, 16), "horizontal_add", {Int(16, 32)}, Target::AVX2}, + {"phaddd_sse3", UInt(32, 4), "horizontal_add", {UInt(32, 8)}, Target::SSE41}, + {"phaddd_sse3", Int(32, 4), "horizontal_add", {Int(32, 8)}, Target::SSE41}, + {"phaddd_avx2", UInt(32, 8), "horizontal_add", {UInt(32, 16)}, Target::AVX2}, + {"phaddd_avx2", Int(32, 8), "horizontal_add", {Int(32, 16)}, Target::AVX2}, + // 2-way dot products {"llvm.x86.avx2.pmadd.ub.sw", Int(16, 16), "saturating_dot_product", {UInt(8, 32), Int(8, 32)}, Target::AVX2}, {"llvm.x86.ssse3.pmadd.ub.sw.128", Int(16, 8), "saturating_dot_product", {UInt(8, 16), Int(8, 16)}, Target::SSE41}, From 9d2deb482c9d1abdf2c32ba48c6df8e52ff73ca8 Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Mon, 25 Jul 2022 23:33:50 -0400 Subject: [PATCH 09/55] fix bfloat16 abs issue --- src/X86Optimize.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/X86Optimize.cpp b/src/X86Optimize.cpp index 16bad9be4413..691df83b000f 100644 --- a/src/X86Optimize.cpp +++ b/src/X86Optimize.cpp @@ -357,9 +357,9 @@ class Optimize_X86 : public IRMutator { // be handled here? // int(8 | 16 | 32) -> uint is supported via SSE41 - // float is always supported (via SSE2). 
+ // float32 is always supported (via SSE2). (((target.has_feature(Target::SSE41) && bits <= 32) || - op->type.is_float()) && + (op->type.is_float() && bits == 32) && rewrite( abs(x), v_intrin("abs", x))) || From 1a51b8331e34ab9932699797886b4e1ae2e5311a Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Mon, 25 Jul 2022 23:36:07 -0400 Subject: [PATCH 10/55] fix unhandled bitwise_or in IRMatch.h --- src/IRMatch.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/IRMatch.h b/src/IRMatch.h index cc4a4b490664..44082d8a76c7 100644 --- a/src/IRMatch.h +++ b/src/IRMatch.h @@ -1449,6 +1449,8 @@ struct Intrin { return arg0 ^ arg1; } else if (intrin == Call::bitwise_and) { return arg0 & arg1; + } else if (intrin == Call::bitwise_or) { + return arg0 | arg1; } Expr arg2 = std::get(args).make(state, type_hint); From 614b7ea76dd42166790459a18c3075b181007c25 Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Mon, 25 Jul 2022 23:47:23 -0400 Subject: [PATCH 11/55] missing paren --- src/X86Optimize.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/X86Optimize.cpp b/src/X86Optimize.cpp index 691df83b000f..381b2daf4f3e 100644 --- a/src/X86Optimize.cpp +++ b/src/X86Optimize.cpp @@ -359,7 +359,7 @@ class Optimize_X86 : public IRMutator { // int(8 | 16 | 32) -> uint is supported via SSE41 // float32 is always supported (via SSE2). (((target.has_feature(Target::SSE41) && bits <= 32) || - (op->type.is_float() && bits == 32) && + (op->type.is_float() && bits == 32)) && rewrite( abs(x), v_intrin("abs", x))) || From a5b7e72b88f8c5d06f6d09bb3aa92c50cb0d3c64 Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Tue, 26 Jul 2022 00:06:29 -0400 Subject: [PATCH 12/55] fix buildbot failures (I hope?) --- src/CodeGen_C.cpp | 5 +++++ src/CodeGen_C.h | 1 + src/Deinterleave.cpp | 5 +++++ src/Derivative.cpp | 3 +++ src/ModulusRemainder.cpp | 6 ++++++ src/Monotonic.cpp | 5 +++++ 6 files changed, 25 insertions(+) diff --git a/src/CodeGen_C.cpp b/src/CodeGen_C.cpp index c5b64f0610f5..1477bcd71316 100644 --- a/src/CodeGen_C.cpp +++ b/src/CodeGen_C.cpp @@ -2829,6 +2829,11 @@ Expr CodeGen_C::scalarize_vector_reduce(const VectorReduce *op) { return Shuffle::make_concat(lanes); } +void CodeGen_C::visit(const VectorIntrinsic *op) { + internal_error << "CodeGen_C should never receive a VectorIntrinsic, received:\n" + << Expr(op) << "\n"; +} + void CodeGen_C::visit(const VectorReduce *op) { stream << get_indent() << "// Vector reduce: " << op->op << "\n"; diff --git a/src/CodeGen_C.h b/src/CodeGen_C.h index 9c06d4bb5630..2427aeef32dc 100644 --- a/src/CodeGen_C.h +++ b/src/CodeGen_C.h @@ -235,6 +235,7 @@ class CodeGen_C : public IRPrinter { void visit(const Fork *) override; void visit(const Acquire *) override; void visit(const Atomic *) override; + void visit(const VectorIntrinsic *) override; void visit(const VectorReduce *) override; void visit_binop(Type t, const Expr &a, const Expr &b, const char *op); diff --git a/src/Deinterleave.cpp b/src/Deinterleave.cpp index e368d851d615..30a5c012a7ba 100644 --- a/src/Deinterleave.cpp +++ b/src/Deinterleave.cpp @@ -195,6 +195,11 @@ class Deinterleaver : public IRGraphMutator { using IRMutator::visit; + Expr visit(const VectorIntrinsic *op) override { + internal_error << "Deinterleaver should never receive VectorIntrinsic node, received:\n" + << Expr(op) << "\n"; + } + Expr visit(const VectorReduce *op) override { std::vector input_lanes; int factor = op->value.type().lanes() / op->type.lanes(); diff --git a/src/Derivative.cpp b/src/Derivative.cpp index 
c536eeea92ae..851faad0cd4b 100644 --- a/src/Derivative.cpp +++ b/src/Derivative.cpp @@ -88,6 +88,9 @@ class ReverseAccumulationVisitor : public IRVisitor { void visit(const Shuffle *op) override { internal_error << "Encounter unexpected expression \"Shuffle\" when differentiating."; } + void visit(const VectorIntrinsic *op) override { + internal_error << "Encounter unexpected expression \"VectorIntrinsic\" when differentiating."; + } void visit(const VectorReduce *op) override { internal_error << "Encounter unexpected expression \"VectorReduce\" when differentiating."; } diff --git a/src/ModulusRemainder.cpp b/src/ModulusRemainder.cpp index 34a598e4c7e3..cb2b54957948 100644 --- a/src/ModulusRemainder.cpp +++ b/src/ModulusRemainder.cpp @@ -74,6 +74,7 @@ class ComputeModulusRemainder : public IRVisitor { void visit(const Free *) override; void visit(const Evaluate *) override; void visit(const Shuffle *) override; + void visit(const VectorIntrinsic *) override; void visit(const VectorReduce *) override; void visit(const Prefetch *) override; void visit(const Atomic *) override; @@ -213,6 +214,11 @@ void ComputeModulusRemainder::visit(const Shuffle *op) { result = ModulusRemainder{}; } +void ComputeModulusRemainder::visit(const VectorIntrinsic *op) { + internal_error << "modulus_remainder of VectorIntrinsic:\n" << Expr(op) << "\n"; + result = ModulusRemainder{}; +} + void ComputeModulusRemainder::visit(const VectorReduce *op) { internal_assert(op->type.is_scalar()) << "modulus_remainder of vector\n"; result = ModulusRemainder{}; diff --git a/src/Monotonic.cpp b/src/Monotonic.cpp index ae8978b2cb57..2c51c7fa960b 100644 --- a/src/Monotonic.cpp +++ b/src/Monotonic.cpp @@ -534,6 +534,11 @@ class DerivativeBounds : public IRVisitor { result = ConstantInterval::single_point(0); } + void visit(const VectorIntrinsic *op) override { + // TODO(rootjalex): Should this be an error? 
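// Returning ConstantInterval::everything() is the conservative answer for
// an opaque node: with no information about the instruction's behavior,
// the only sound derivative bound is the unbounded interval. A minimal
// sketch of the idea with a hypothetical interval type (not the Halide
// class):
#include <optional>

struct IntervalSketch {
    std::optional<long> lower, upper;  // empty optional = unbounded
    static IntervalSketch everything() {
        return {};  // no bound in either direction
    }
};

// An analysis that meets a node it cannot reason about must widen to this.
IntervalSketch bounds_of_unknown_node() {
    return IntervalSketch::everything();
}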
+ result = ConstantInterval::everything(); + } + void visit(const VectorReduce *op) override { op->value.accept(this); switch (op->op) { From c58f85e04b6129709e58b5f08d3cc8ca296fa633 Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Tue, 26 Jul 2022 00:08:16 -0400 Subject: [PATCH 13/55] clang-format --- src/ModulusRemainder.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/ModulusRemainder.cpp b/src/ModulusRemainder.cpp index cb2b54957948..be0226b44e56 100644 --- a/src/ModulusRemainder.cpp +++ b/src/ModulusRemainder.cpp @@ -215,7 +215,8 @@ void ComputeModulusRemainder::visit(const Shuffle *op) { } void ComputeModulusRemainder::visit(const VectorIntrinsic *op) { - internal_error << "modulus_remainder of VectorIntrinsic:\n" << Expr(op) << "\n"; + internal_error << "modulus_remainder of VectorIntrinsic:\n" + << Expr(op) << "\n"; result = ModulusRemainder{}; } From c9efd3221e7c62cb4ac69d6651c7345938007875 Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Tue, 26 Jul 2022 00:18:55 -0400 Subject: [PATCH 14/55] add empty Expr return to Deinterleaver::visic(const VectorIntrinsic*) --- src/Deinterleave.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Deinterleave.cpp b/src/Deinterleave.cpp index 30a5c012a7ba..0a68517f7f41 100644 --- a/src/Deinterleave.cpp +++ b/src/Deinterleave.cpp @@ -198,6 +198,7 @@ class Deinterleaver : public IRGraphMutator { Expr visit(const VectorIntrinsic *op) override { internal_error << "Deinterleaver should never receive VectorIntrinsic node, received:\n" << Expr(op) << "\n"; + return Expr(); } Expr visit(const VectorReduce *op) override { From 0e94961a68a7d3173591d3da11ade43aa3a9e92c Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Tue, 26 Jul 2022 01:00:49 -0400 Subject: [PATCH 15/55] fix horizontal_add references --- src/runtime/x86_avx2.ll | 32 ++++++++++++++++++++++++++++++++ src/runtime/x86_sse41.ll | 17 +++++++++++++++++ 2 files changed, 49 insertions(+) diff --git a/src/runtime/x86_avx2.ll b/src/runtime/x86_avx2.ll index 1a80f5b583d3..370dd5a84a24 100644 --- a/src/runtime/x86_avx2.ll +++ b/src/runtime/x86_avx2.ll @@ -72,3 +72,35 @@ define weak_odr <16 x i16> @hadd_pmadd_i8_avx2(<32 x i8> %a) nounwind alwaysinli ret <16 x i16> %1 } declare <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8>, <32 x i8>) nounwind readnone + +define weak_odr <16 x i16> @phaddw_avx2(<32 x i16> %a) nounwind alwaysinline { + %1 = shufflevector <32 x i16> %a, <32 x i16> undef, <16 x i32> + %2 = shufflevector <32 x i16> %a, <32 x i16> undef, <16 x i32> + %3 = tail call <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16> %1, <16 x i16> %2) + ret <16 x i16> %3 + } + declare <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16>, <16 x i16>) nounwind readnone + + define weak_odr <8 x i32> @phaddd_avx2(<16 x i32> %a) nounwind alwaysinline { + %1 = shufflevector <16 x i32> %a, <16 x i32> undef, <8 x i32> + %2 = shufflevector <16 x i32> %a, <16 x i32> undef, <8 x i32> + %3 = tail call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> %1, <8 x i32> %2) + ret <8 x i32> %3 + } + declare <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32>, <8 x i32>) nounwind readnone + + define weak_odr <8 x i32> @hadd_pmadd_i16_avx2(<16 x i16> %a) nounwind alwaysinline { + %1 = tail call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> %a, <16 x i16> ) + ret <8 x i32> %1 + } + + define weak_odr <8 x i32> @wmul_pmaddwd_avx2(<8 x i16> %a, <8 x i16> %b) nounwind alwaysinline { + %1 = zext <8 x i16> %a to <8 x i32> + %2 = zext <8 x i16> %b to <8 x i32> + %3 = bitcast <8 x i32> %1 to <16 x i16> + %4 = bitcast <8 x 
i32> %2 to <16 x i16> + %res = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> %3, <16 x i16> %4) + ret <8 x i32> %res + } + declare <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16>, <16 x i16>) nounwind readnone + \ No newline at end of file diff --git a/src/runtime/x86_sse41.ll b/src/runtime/x86_sse41.ll index f109ee37ec23..b49faca965d9 100644 --- a/src/runtime/x86_sse41.ll +++ b/src/runtime/x86_sse41.ll @@ -92,3 +92,20 @@ define weak_odr <8 x i16> @hadd_pmadd_i8_sse3(<16 x i8> %a) nounwind alwaysinlin ret <8 x i16> %1 } declare <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8>, <16 x i8>) nounwind readnone + +define weak_odr <8 x i16> @phaddw_sse3(<16 x i16> %a) nounwind alwaysinline { + %1 = shufflevector <16 x i16> %a, <16 x i16> undef, <8 x i32> + %2 = shufflevector <16 x i16> %a, <16 x i16> undef, <8 x i32> + %3 = tail call <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16> %1, <8 x i16> %2) + ret <8 x i16> %3 + } + declare <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16>, <8 x i16>) nounwind readnone + + define weak_odr <4 x i32> @phaddd_sse3(<8 x i32> %a) nounwind alwaysinline { + %1 = shufflevector <8 x i32> %a, <8 x i32> undef, <4 x i32> + %2 = shufflevector <8 x i32> %a, <8 x i32> undef, <4 x i32> + %3 = tail call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> %1, <4 x i32> %2) + ret <4 x i32> %3 + } + declare <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32>, <4 x i32>) nounwind readnone + \ No newline at end of file From fb538e38015d434ffc6fd704bdada4c5cd5551bf Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Tue, 26 Jul 2022 01:03:27 -0400 Subject: [PATCH 16/55] fix bfloat16 abs issue (again) --- src/X86Optimize.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/X86Optimize.cpp b/src/X86Optimize.cpp index 381b2daf4f3e..0572ac3b2c75 100644 --- a/src/X86Optimize.cpp +++ b/src/X86Optimize.cpp @@ -358,7 +358,7 @@ class Optimize_X86 : public IRMutator { // int(8 | 16 | 32) -> uint is supported via SSE41 // float32 is always supported (via SSE2). - (((target.has_feature(Target::SSE41) && bits <= 32) || + (((target.has_feature(Target::SSE41) && op->type.is_int() && bits <= 32) || (op->type.is_float() && bits == 32)) && rewrite( abs(x), From c2a61752e4d01539fc71d8c03610b99884068dc0 Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Tue, 26 Jul 2022 01:16:01 -0400 Subject: [PATCH 17/55] fix instruction selection location --- src/CodeGen_X86.cpp | 39 +++++++++++++++++++++++++++++++++++++++ src/Lower.cpp | 7 ------- 2 files changed, 39 insertions(+), 7 deletions(-) diff --git a/src/CodeGen_X86.cpp b/src/CodeGen_X86.cpp index 5d902e866115..73959adf78ac 100644 --- a/src/CodeGen_X86.cpp +++ b/src/CodeGen_X86.cpp @@ -30,6 +30,9 @@ class CodeGen_X86 : public CodeGen_Posix { CodeGen_X86(Target); protected: + void compile_func(const LoweredFunc &f, + const std::string &simple_name, const std::string &extern_name) override; + string mcpu_target() const override; string mcpu_tune() const override; string mattrs() const override; @@ -249,6 +252,42 @@ void CodeGen_X86::init_module() { } } +// FIXME: This is nearly identical to CodeGen_LLVM, should re-factor this somehow. +// Only difference is the call to `optimize_x86_instructions()` +void CodeGen_X86::compile_func(const LoweredFunc &f, const std::string &simple_name, + const std::string &extern_name) { + // Generate the function declaration and argument unpacking code. 
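// The phaddw/phaddd wrappers above expose "horizontal_add": each output
// lane is the sum of two adjacent input lanes, with the double-width input
// split across the two operands of the underlying (v)phadd intrinsic (the
// shufflevector masks in the .ll handle the 128-bit-lane ordering of the
// AVX2 form). Scalar model of the exposed semantics (illustrative only):
#include <cstdint>
#include <vector>

std::vector<int16_t> horizontal_add_model(const std::vector<int16_t> &a) {
    std::vector<int16_t> out(a.size() / 2);
    for (size_t i = 0; i < out.size(); i++) {
        // Wraps on overflow, like phaddw.
        out[i] = int16_t(a[2 * i] + a[2 * i + 1]);
    }
    return out;
}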
+ begin_func(f.linkage, simple_name, extern_name, f.args); + + // If building with MSAN, ensure that calls to halide_msan_annotate_buffer_is_initialized() + // happen for every output buffer if the function succeeds. + if (f.linkage != LinkageType::Internal && + target.has_feature(Target::MSAN)) { + llvm::Function *annotate_buffer_fn = + module->getFunction("halide_msan_annotate_buffer_is_initialized_as_destructor"); + internal_assert(annotate_buffer_fn) + << "Could not find halide_msan_annotate_buffer_is_initialized_as_destructor in module\n"; + annotate_buffer_fn->addParamAttr(0, Attribute::NoAlias); + for (const auto &arg : f.args) { + if (arg.kind == Argument::OutputBuffer) { + register_destructor(annotate_buffer_fn, sym_get(arg.name + ".buffer"), OnSuccess); + } + } + } + + // Generate the function body. + debug(1) << "Generating llvm bitcode for function " << f.name << "...\n"; + debug(1) << "X86: Optimizing vector instructions...\n"; + Stmt body = optimize_x86_instructions(f.body, target); + debug(2) << "X86: Lowering after vector instructions:\n" + << body << "\n\n"; + + body.accept(this); + + // Clean up and return. + end_func(f.args); +} + void CodeGen_X86::visit(const GT *op) { Type t = op->a.type(); diff --git a/src/Lower.cpp b/src/Lower.cpp index 20d98a20562a..f25f209ecea4 100644 --- a/src/Lower.cpp +++ b/src/Lower.cpp @@ -444,13 +444,6 @@ void lower_impl(const vector &output_funcs, debug(1) << "Skipping GPU offload...\n"; } - if (t.arch == Target::X86) { - debug(1) << "Performing x86-specific vector instruction selection...\n"; - s = optimize_x86_instructions(s, t); - debug(2) << "Lowering after performing x86-specific vector instruction selection:\n" - << s << "\n\n"; - } - // TODO: This needs to happen before lowering parallel tasks, because global // images used inside parallel loops are rewritten from loads from images to // loads from closure parameters. Closure parameters are missing the Buffer<> From 78edb81d539bb92db18047b97cdb0b7eba6b6b3d Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Tue, 26 Jul 2022 01:18:30 -0400 Subject: [PATCH 18/55] clang format --- src/CodeGen_X86.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/CodeGen_X86.cpp b/src/CodeGen_X86.cpp index 73959adf78ac..07269145e2d2 100644 --- a/src/CodeGen_X86.cpp +++ b/src/CodeGen_X86.cpp @@ -255,7 +255,7 @@ void CodeGen_X86::init_module() { // FIXME: This is nearly identical to CodeGen_LLVM, should re-factor this somehow. // Only difference is the call to `optimize_x86_instructions()` void CodeGen_X86::compile_func(const LoweredFunc &f, const std::string &simple_name, - const std::string &extern_name) { + const std::string &extern_name) { // Generate the function declaration and argument unpacking code. 
begin_func(f.linkage, simple_name, extern_name, f.args); From 53c560b72f4b37aa35010afadb406c8d46ffbe58 Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Tue, 26 Jul 2022 10:57:05 -0400 Subject: [PATCH 19/55] fix virtual function hidden error --- src/X86Optimize.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/X86Optimize.cpp b/src/X86Optimize.cpp index 0572ac3b2c75..ed58b43dc6e3 100644 --- a/src/X86Optimize.cpp +++ b/src/X86Optimize.cpp @@ -101,6 +101,8 @@ class Optimize_X86 : public IRMutator { return type.is_vector(); } + using IRMutator::visit; + Expr visit(const Div *op) override { if (!should_peephole_optimize(op->type) || !op->type.is_int_or_uint()) { return IRMutator::visit(op); From 2cfc0c185dde714aabc3abcd1f84901c9b037db0 Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Tue, 26 Jul 2022 11:06:59 -0400 Subject: [PATCH 20/55] fix absd codegen bug --- src/X86Optimize.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/X86Optimize.cpp b/src/X86Optimize.cpp index ed58b43dc6e3..c040befb4194 100644 --- a/src/X86Optimize.cpp +++ b/src/X86Optimize.cpp @@ -330,16 +330,16 @@ class Optimize_X86 : public IRMutator { // // Solution due to Wojciech Mula: // http://0x80.pl/notesen/2018-03-11-sse-abs-unsigned.html - (op->type.is_uint() && - rewrite( - absd(x, y), - saturating_sub(x, y) | saturating_sub(y, x))) || + rewrite( + absd(x, y), + saturating_sub(x, y) | saturating_sub(y, x), + is_uint(x) && is_uint(y)) || // Current best way to lower absd on x86. - (op->type.is_int() && - rewrite( - absd(x, y), - max(x, y) - min(x, y))) || + rewrite( + absd(x, y), + max(x, y) - min(x, y), + is_int(x) && is_int(y)) || // pmulh is always supported (via SSE2). ((op->type.is_int_or_uint() && bits == 16) && From 0675e8605157c39b2698bb566b2affea43029a44 Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Tue, 26 Jul 2022 12:43:36 -0400 Subject: [PATCH 21/55] attempt to fix x86 vector-reduction splitting --- src/CodeGen_LLVM.cpp | 30 +++++++++++++------------- src/CodeGen_LLVM.h | 9 ++++++++ src/CodeGen_X86.cpp | 26 ++++++++++++++++++++++- src/X86Optimize.cpp | 50 ++++++++++++++------------------------------ src/X86Optimize.h | 5 ++--- 5 files changed, 67 insertions(+), 53 deletions(-) diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp index 14db16dba353..c13ef1bdd0ac 100644 --- a/src/CodeGen_LLVM.cpp +++ b/src/CodeGen_LLVM.cpp @@ -4009,7 +4009,7 @@ void CodeGen_LLVM::visit(const VectorReduce *op) { codegen_vector_reduce(op, Expr()); } -void CodeGen_LLVM::codegen_vector_reduce(const VectorReduce *op, const Expr &init) { +Expr CodeGen_LLVM::split_vector_reduce(const VectorReduce *op, const Expr &init) const { Expr val = op->value; const int output_lanes = op->type.lanes(); const int native_lanes = native_vector_bits() / op->type.bits(); @@ -4049,8 +4049,7 @@ void CodeGen_LLVM::codegen_vector_reduce(const VectorReduce *op, const Expr &ini equiv = max(equiv, init); } equiv = cast(op->type, equiv); - equiv.accept(this); - return; + return equiv; } if (op->type.is_bool() && op->op == VectorReduce::And) { @@ -4061,8 +4060,7 @@ void CodeGen_LLVM::codegen_vector_reduce(const VectorReduce *op, const Expr &ini if (init.defined()) { equiv = min(equiv, init); } - equiv.accept(this); - return; + return equiv; } if (elt == Float(16) && upgrade_type_for_arithmetic(elt) != elt) { @@ -4072,8 +4070,7 @@ void CodeGen_LLVM::codegen_vector_reduce(const VectorReduce *op, const Expr &ini equiv = binop(equiv, init); } equiv = cast(op->type, equiv); - equiv.accept(this); - 
return; + return equiv; } if (output_lanes == 1) { @@ -4172,8 +4169,7 @@ void CodeGen_LLVM::codegen_vector_reduce(const VectorReduce *op, const Expr &ini if (initial_value.defined()) { equiv = binop(initial_value, equiv); } - equiv.accept(this); - return; + return equiv; } } @@ -4196,8 +4192,7 @@ void CodeGen_LLVM::codegen_vector_reduce(const VectorReduce *op, const Expr &ini equiv = binop(equiv, init); } equiv = common_subexpression_elimination(equiv); - equiv.accept(this); - return; + return equiv; } if (factor > 2 && ((factor & 1) == 0)) { @@ -4229,8 +4224,7 @@ void CodeGen_LLVM::codegen_vector_reduce(const VectorReduce *op, const Expr &ini equiv = binop(equiv, init); } equiv = common_subexpression_elimination(equiv); - codegen(equiv); - return; + return equiv; } // Extract each slice and combine @@ -4244,8 +4238,14 @@ void CodeGen_LLVM::codegen_vector_reduce(const VectorReduce *op, const Expr &ini } } equiv = common_subexpression_elimination(equiv); - codegen(equiv); -} // namespace Internal + return equiv; +} + +void CodeGen_LLVM::codegen_vector_reduce(const VectorReduce *op, const Expr &init) { + Expr equiv = split_vector_reduce(op, init); + equiv.accept(this); + return; +} void CodeGen_LLVM::visit(const Atomic *op) { if (!op->mutex_name.empty()) { diff --git a/src/CodeGen_LLVM.h b/src/CodeGen_LLVM.h index 7c5078428431..1a479f7829e7 100644 --- a/src/CodeGen_LLVM.h +++ b/src/CodeGen_LLVM.h @@ -507,6 +507,15 @@ class CodeGen_LLVM : public IRVisitor { * across backends. */ virtual void codegen_vector_reduce(const VectorReduce *op, const Expr &init); + // TODO: this probably shouldn't be public, or should be moved where the rest of + // the public methods are. +public: + /** Split up a VectorReduce node if possible, or generate LLVM + intrinsics for full reductions. This is used in + `codegen_vector_reduce`. **/ + virtual Expr split_vector_reduce(const VectorReduce *op, const Expr &init) const; + +protected: /** Are we inside an atomic node that uses mutex locks? This is used for detecting deadlocks from nested atomics & illegal vectorization. */ bool inside_atomic_mutex_node; diff --git a/src/CodeGen_X86.cpp b/src/CodeGen_X86.cpp index 07269145e2d2..ba2c44579d59 100644 --- a/src/CodeGen_X86.cpp +++ b/src/CodeGen_X86.cpp @@ -22,6 +22,30 @@ using namespace llvm; namespace { +// Populate feature flags in a target according to those implied by +// existing flags, so that instruction patterns can just check for the +// oldest feature flag that supports an instruction. +Target complete_x86_target(Target t) { + if (t.has_feature(Target::AVX512_SapphireRapids)) { + t.set_feature(Target::AVX512_Cannonlake); + } + if (t.has_feature(Target::AVX512_Cannonlake)) { + t.set_feature(Target::AVX512_Skylake); + } + if (t.has_feature(Target::AVX512_Cannonlake) || + t.has_feature(Target::AVX512_Skylake) || + t.has_feature(Target::AVX512_KNL)) { + t.set_feature(Target::AVX2); + } + if (t.has_feature(Target::AVX2)) { + t.set_feature(Target::AVX); + } + if (t.has_feature(Target::AVX)) { + t.set_feature(Target::SSE41); + } + return t; +} + /** A code generator that emits x86 code from a given Halide stmt. */ class CodeGen_X86 : public CodeGen_Posix { public: @@ -278,7 +302,7 @@ void CodeGen_X86::compile_func(const LoweredFunc &f, const std::string &simple_n // Generate the function body. 
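// split_vector_reduce peels a wide reduction down to widths the pattern
// matchers (and the ISA) can express: an even reduction factor is split
// into a pairwise combine followed by a reduction of half the factor. A
// simplified scalar model of that shape, assuming a power-of-two factor
// (illustrative only; the real code also handles odd factors, initial
// values, and non-add operators):
#include <cstdint>
#include <utility>
#include <vector>

std::vector<int32_t> reduce_add(std::vector<int32_t> v, int factor) {
    while (factor > 1) {
        std::vector<int32_t> halved(v.size() / 2);
        for (size_t i = 0; i < halved.size(); i++) {
            halved[i] = v[2 * i] + v[2 * i + 1];  // combine one adjacent pair
        }
        v = std::move(halved);
        factor /= 2;
    }
    return v;
}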
debug(1) << "Generating llvm bitcode for function " << f.name << "...\n"; debug(1) << "X86: Optimizing vector instructions...\n"; - Stmt body = optimize_x86_instructions(f.body, target); + Stmt body = optimize_x86_instructions(f.body, target, this); debug(2) << "X86: Lowering after vector instructions:\n" << body << "\n\n"; diff --git a/src/X86Optimize.cpp b/src/X86Optimize.cpp index c040befb4194..f4c2b758db55 100644 --- a/src/X86Optimize.cpp +++ b/src/X86Optimize.cpp @@ -13,30 +13,6 @@ namespace Halide { namespace Internal { -// Populate feature flags in a target according to those implied by -// existing flags, so that instruction patterns can just check for the -// oldest feature flag that supports an instruction. -Target complete_x86_target(Target t) { - if (t.has_feature(Target::AVX512_SapphireRapids)) { - t.set_feature(Target::AVX512_Cannonlake); - } - if (t.has_feature(Target::AVX512_Cannonlake)) { - t.set_feature(Target::AVX512_Skylake); - } - if (t.has_feature(Target::AVX512_Cannonlake) || - t.has_feature(Target::AVX512_Skylake) || - t.has_feature(Target::AVX512_KNL)) { - t.set_feature(Target::AVX2); - } - if (t.has_feature(Target::AVX2)) { - t.set_feature(Target::AVX); - } - if (t.has_feature(Target::AVX)) { - t.set_feature(Target::SSE41); - } - return t; -} - #if defined(WITH_X86) namespace { @@ -89,8 +65,8 @@ class Optimize_X86 : public IRMutator { public: /** Create an x86 code optimizer. Processor features can be * enabled using the appropriate flags in the target struct. */ - Optimize_X86(const Target &t) - : target(t) { + Optimize_X86(const Target &t, const CodeGen_LLVM *c) + : target(t), codegen(c) { } protected: @@ -545,15 +521,21 @@ class Optimize_X86 : public IRMutator { break; } - // FIXME: We need to split up VectorReduce nodes in the same way that - // CodeGen_LLVM::codegen_vector_reduce does, in order to do all - // matching here. + return attempt_vector_split(op); + } - return IRMutator::visit(op); + Expr attempt_vector_split(const VectorReduce *op) { + Expr split = codegen->split_vector_reduce(op, Expr()); + if (split.defined() && !split.same_as(op)) { + return mutate(split); + } else { + return IRMutator::visit(op); + } } private: const Target ⌖ + const CodeGen_LLVM *codegen; IRMatcher::Wild<0> x; IRMatcher::Wild<1> y; @@ -562,11 +544,11 @@ class Optimize_X86 : public IRMutator { } // namespace -Stmt optimize_x86_instructions(Stmt s, const Target &t) { - s = Optimize_X86(complete_x86_target(t)).mutate(s); +Stmt optimize_x86_instructions(Stmt stmt, const Target &target, const CodeGen_LLVM *codegen) { + stmt = Optimize_X86(target, codegen).mutate(stmt); // Some of the rules above can introduce repeated sub-terms, so run CSE again. - s = common_subexpression_elimination(s); - return s; + stmt = common_subexpression_elimination(stmt); + return stmt; } #else // WITH_X86 diff --git a/src/X86Optimize.h b/src/X86Optimize.h index bcf375cae1d5..df37c7dd896f 100644 --- a/src/X86Optimize.h +++ b/src/X86Optimize.h @@ -5,6 +5,7 @@ * Tools for optimizing IR for x86. */ +#include "CodeGen_LLVM.h" #include "Expr.h" #include "Target.h" @@ -12,9 +13,7 @@ namespace Halide { namespace Internal { /** Perform vector instruction selection, inserting VectorIntrinsic nodes. 
*/ -Stmt optimize_x86_instructions(Stmt s, const Target &t); - -Target complete_x86_target(Target t); +Stmt optimize_x86_instructions(Stmt stmt, const Target &target, const CodeGen_LLVM *codegen); } // namespace Internal } // namespace Halide From 6471226694d45feae15ac7893cab29e9d77382f7 Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Tue, 26 Jul 2022 13:02:16 -0400 Subject: [PATCH 22/55] clang tidy --- src/CodeGen_LLVM.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp index c13ef1bdd0ac..f821aaf670d4 100644 --- a/src/CodeGen_LLVM.cpp +++ b/src/CodeGen_LLVM.cpp @@ -4244,7 +4244,6 @@ Expr CodeGen_LLVM::split_vector_reduce(const VectorReduce *op, const Expr &init) void CodeGen_LLVM::codegen_vector_reduce(const VectorReduce *op, const Expr &init) { Expr equiv = split_vector_reduce(op, init); equiv.accept(this); - return; } void CodeGen_LLVM::visit(const Atomic *op) { From fb8216607297130f2e84dc63bf7bdb548e3d73f7 Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Tue, 26 Jul 2022 15:14:13 -0400 Subject: [PATCH 23/55] fix MSVC templating bug --- src/IRMatch.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/IRMatch.h b/src/IRMatch.h index 44082d8a76c7..2d1394742713 100644 --- a/src/IRMatch.h +++ b/src/IRMatch.h @@ -1948,10 +1948,10 @@ struct VectorIntrinOp { r_args[0] = std::get<0>(args).make(state, {}); if constexpr (sizeof...(Args) > 1) { - r_args[1] = std::get<1>(args).make(state, {}); + r_args[1] = std::get(args).make(state, {}); } if constexpr (sizeof...(Args) > 2) { - r_args[2] = std::get<2>(args).make(state, {}); + r_args[2] = std::get(args).make(state, {}); } // for (int i = 0; i < sizeof...(Args); i++) { From f0926064940725259e6bede767d986f2302e78ba Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Wed, 27 Jul 2022 17:31:44 -0400 Subject: [PATCH 24/55] implement Andrew's requested changes --- src/Bounds.cpp | 4 +-- src/CodeGen_C.cpp | 4 +-- src/CodeGen_C.h | 2 +- src/CodeGen_LLVM.cpp | 4 +-- src/CodeGen_LLVM.h | 2 +- src/CodeGen_X86.cpp | 7 ++-- src/Deinterleave.cpp | 4 +-- src/Derivative.cpp | 4 +-- src/Expr.h | 2 +- src/IR.cpp | 45 +++++++++++++++++++----- src/IR.h | 42 ++++++++++++++++++---- src/IREquality.cpp | 8 ++--- src/IRMatch.cpp | 12 +++---- src/IRMatch.h | 30 ++++++++-------- src/IRMutator.cpp | 4 +-- src/IRMutator.h | 2 +- src/IRPrinter.cpp | 4 +-- src/IRPrinter.h | 2 +- src/IRVisitor.cpp | 4 +-- src/IRVisitor.h | 10 +++--- src/ModulusRemainder.cpp | 6 ++-- src/Monotonic.cpp | 2 +- src/Simplify_Exprs.cpp | 2 +- src/Simplify_Internal.h | 2 +- src/StmtToHtml.cpp | 6 ++-- src/X86Optimize.cpp | 76 +++++++++++++++++++++------------------- src/X86Optimize.h | 2 +- 27 files changed, 177 insertions(+), 115 deletions(-) diff --git a/src/Bounds.cpp b/src/Bounds.cpp index 107bd04185c4..632001485ba5 100644 --- a/src/Bounds.cpp +++ b/src/Bounds.cpp @@ -1110,9 +1110,9 @@ class Bounds : public IRVisitor { op->value.accept(this); } - void visit(const VectorIntrinsic *op) override { + void visit(const VectorInstruction *op) override { // TODO(rootjalex): we may need to implement bounds queries. 
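// The MSVC fix above guards each tuple access with `if constexpr`, so that
// std::get<N> is never even instantiated for argument packs that are too
// short. A self-contained sketch of the same pattern (illustrative only;
// assumes at least one argument, each convertible to std::string):
#include <string>
#include <tuple>
#include <vector>

template<typename... Args>
std::vector<std::string> unpack_up_to_three(const std::tuple<Args...> &args) {
    std::vector<std::string> out(sizeof...(Args));
    out[0] = std::get<0>(args);
    if constexpr (sizeof...(Args) > 1) {
        out[1] = std::get<1>(args);
    }
    if constexpr (sizeof...(Args) > 2) {
        out[2] = std::get<2>(args);
    }
    return out;
}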
- internal_error << "Unexpected VectorIntrinsic in bounds query: " << Expr(op) << "\n"; + internal_error << "Unexpected VectorInstruction in bounds query: " << Expr(op) << "\n"; } void visit(const Call *op) override { diff --git a/src/CodeGen_C.cpp b/src/CodeGen_C.cpp index 1477bcd71316..8bc1abda3502 100644 --- a/src/CodeGen_C.cpp +++ b/src/CodeGen_C.cpp @@ -2829,8 +2829,8 @@ Expr CodeGen_C::scalarize_vector_reduce(const VectorReduce *op) { return Shuffle::make_concat(lanes); } -void CodeGen_C::visit(const VectorIntrinsic *op) { - internal_error << "CodeGen_C should never receive a VectorIntrinsic, received:\n" +void CodeGen_C::visit(const VectorInstruction *op) { + internal_error << "CodeGen_C should never receive a VectorInstruction, received:\n" << Expr(op) << "\n"; } diff --git a/src/CodeGen_C.h b/src/CodeGen_C.h index 2427aeef32dc..256b35f55efe 100644 --- a/src/CodeGen_C.h +++ b/src/CodeGen_C.h @@ -235,7 +235,7 @@ class CodeGen_C : public IRPrinter { void visit(const Fork *) override; void visit(const Acquire *) override; void visit(const Atomic *) override; - void visit(const VectorIntrinsic *) override; + void visit(const VectorInstruction *) override; void visit(const VectorReduce *) override; void visit_binop(Type t, const Expr &a, const Expr &b, const char *op); diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp index f821aaf670d4..68b4da12ae87 100644 --- a/src/CodeGen_LLVM.cpp +++ b/src/CodeGen_LLVM.cpp @@ -4000,8 +4000,8 @@ void CodeGen_LLVM::visit(const Shuffle *op) { } } -void CodeGen_LLVM::visit(const VectorIntrinsic *op) { - internal_error << "CodeGen_LLVM received VectorIntrinsic node, should be handled by architecture-specific CodeGen class:\n" +void CodeGen_LLVM::visit(const VectorInstruction *op) { + internal_error << "CodeGen_LLVM received VectorInstruction node, should be handled by architecture-specific CodeGen class:\n" << Expr(op) << "\n"; } diff --git a/src/CodeGen_LLVM.h b/src/CodeGen_LLVM.h index 1a479f7829e7..4f951d5a9131 100644 --- a/src/CodeGen_LLVM.h +++ b/src/CodeGen_LLVM.h @@ -362,7 +362,7 @@ class CodeGen_LLVM : public IRVisitor { void visit(const IfThenElse *) override; void visit(const Evaluate *) override; void visit(const Shuffle *) override; - void visit(const VectorIntrinsic *) override; + void visit(const VectorInstruction *) override; void visit(const VectorReduce *) override; void visit(const Prefetch *) override; void visit(const Atomic *) override; diff --git a/src/CodeGen_X86.cpp b/src/CodeGen_X86.cpp index ba2c44579d59..8caa3ee756b0 100644 --- a/src/CodeGen_X86.cpp +++ b/src/CodeGen_X86.cpp @@ -84,7 +84,7 @@ class CodeGen_X86 : public CodeGen_Posix { void visit(const Allocate *) override; void visit(const Load *) override; void visit(const Store *) override; - void visit(const VectorIntrinsic *) override; + void visit(const VectorInstruction *) override; // @} private: @@ -470,8 +470,9 @@ void CodeGen_X86::visit(const Store *op) { CodeGen_Posix::visit(op); } -void CodeGen_X86::visit(const VectorIntrinsic *op) { - value = call_overloaded_intrin(op->type, op->name, op->args); +void CodeGen_X86::visit(const VectorInstruction *op) { + const std::string name = op->get_instruction_name(); + value = call_overloaded_intrin(op->type, name, op->args); internal_assert(value) << "CodeGen_X86 failed on " << Expr(op) << "\n"; } diff --git a/src/Deinterleave.cpp b/src/Deinterleave.cpp index 0a68517f7f41..c2182d4f5192 100644 --- a/src/Deinterleave.cpp +++ b/src/Deinterleave.cpp @@ -195,8 +195,8 @@ class Deinterleaver : public IRGraphMutator { using 
IRMutator::visit; - Expr visit(const VectorIntrinsic *op) override { - internal_error << "Deinterleaver should never receive VectorIntrinsic node, received:\n" + Expr visit(const VectorInstruction *op) override { + internal_error << "Deinterleaver should never receive VectorInstruction node, received:\n" << Expr(op) << "\n"; return Expr(); } diff --git a/src/Derivative.cpp b/src/Derivative.cpp index 851faad0cd4b..cade4b010980 100644 --- a/src/Derivative.cpp +++ b/src/Derivative.cpp @@ -88,8 +88,8 @@ class ReverseAccumulationVisitor : public IRVisitor { void visit(const Shuffle *op) override { internal_error << "Encounter unexpected expression \"Shuffle\" when differentiating."; } - void visit(const VectorIntrinsic *op) override { - internal_error << "Encounter unexpected expression \"VectorIntrinsic\" when differentiating."; + void visit(const VectorInstruction *op) override { + internal_error << "Encounter unexpected expression \"VectorInstruction\" when differentiating."; } void visit(const VectorReduce *op) override { internal_error << "Encounter unexpected expression \"VectorReduce\" when differentiating."; diff --git a/src/Expr.h b/src/Expr.h index aaab7dac23be..efb7526e0eb5 100644 --- a/src/Expr.h +++ b/src/Expr.h @@ -57,7 +57,7 @@ enum class IRNodeType { Call, Let, Shuffle, - VectorIntrinsic, + VectorInstruction, VectorReduce, // Stmts LetStmt, diff --git a/src/IR.cpp b/src/IR.cpp index 776a8da806ee..07832b93b6ca 100644 --- a/src/IR.cpp +++ b/src/IR.cpp @@ -901,17 +901,46 @@ Stmt Atomic::make(const std::string &producer_name, return node; } -Expr VectorIntrinsic::make(Type type, const std::string &name, const std::vector &args) { - user_assert(!name.empty()) << "VectorIntrinsic without a name\n"; +namespace { + +const char *const instruction_op_names[] = { + // Shared: + "abs", + "dot_product", + "rounding_halving_add", + "saturating_add", + "saturating_narrow", + "saturating_sub", + "widening_mul", + + // x86-specific + "f32_to_bf16", + "horizontal_add", + "pmulh", + "pmulhrs", + "saturating_dot_product", +}; + +static_assert(sizeof(instruction_op_names) / sizeof(instruction_op_names[0]) == VectorInstruction::InstructionOpCount, + "instruction_op_names needs attention"); + +} // namespace + +Expr VectorInstruction::make(Type type, InstructionOp op, const std::vector &args) { user_assert(!args.empty()) << "VectorInrinsic without arguments\n"; - VectorIntrinsic *node = new VectorIntrinsic; + VectorInstruction *node = new VectorInstruction; node->type = type; - node->name = name; + node->op = op; node->args = args; return node; } +const char *VectorInstruction::get_instruction_name() const { + return instruction_op_names[op]; +} + + Expr VectorReduce::make(VectorReduce::Operator op, Expr vec, int lanes) { @@ -1092,8 +1121,8 @@ void ExprNode::accept(IRVisitor *v) const { v->visit((const Shuffle *)this); } template<> -void ExprNode::accept(IRVisitor *v) const { - v->visit((const VectorIntrinsic *)this); +void ExprNode::accept(IRVisitor *v) const { + v->visit((const VectorInstruction *)this); } template<> void ExprNode::accept(IRVisitor *v) const { @@ -1285,8 +1314,8 @@ Expr ExprNode::mutate_expr(IRMutator *v) const { return v->visit((const Shuffle *)this); } template<> -Expr ExprNode::mutate_expr(IRMutator *v) const { - return v->visit((const VectorIntrinsic *)this); +Expr ExprNode::mutate_expr(IRMutator *v) const { + return v->visit((const VectorInstruction *)this); } template<> Expr ExprNode::mutate_expr(IRMutator *v) const { diff --git a/src/IR.h b/src/IR.h index 
ad1ad2437123..e0ea25ce5768 100644 --- a/src/IR.h +++ b/src/IR.h @@ -886,16 +886,44 @@ struct Atomic : public StmtNode<Atomic> { static const IRNodeType _node_type = IRNodeType::Atomic; }; -/** Represent a target-specific vector instruction. - * Intrinsic may not be element-wise operation, i.e. - * dot_products. */ -struct VectorIntrinsic : public ExprNode<VectorIntrinsic> { - std::string name; +/** Represent a length-agnostic and target-specific + * vector instruction. The instruction need not be an + * element-wise operation, e.g. dot_product. Should only be + * generated and consumed during CodeGen. */ +struct VectorInstruction : public ExprNode<VectorInstruction> { + // Enums for vector instructions. The name is recovered via get_instruction_name(). + // Specific enum values are *not* guaranteed to be stable across time. + // Please keep this list sorted by target architecture (with a shared section). + // This list will become more complete as we add Optimize passes for more backends. + // If you add an instruction here, update `instruction_op_names` in IR.cpp. + enum InstructionOp { + // Shared: + abs, + dot_product, + rounding_halving_add, + saturating_add, + saturating_narrow, + saturating_sub, + widening_mul, + + // x86-specific + f32_to_bf16, + horizontal_add, + pmulh, + pmulhrs, + saturating_dot_product, + + InstructionOpCount // Sentinel: keep last. + }; + + InstructionOp op; std::vector<Expr> args; - static Expr make(Type type, const std::string &name, const std::vector<Expr> &args); + static Expr make(Type type, InstructionOp op, const std::vector<Expr> &args); + + static const IRNodeType _node_type = IRNodeType::VectorInstruction; - static const IRNodeType _node_type = IRNodeType::VectorIntrinsic; + const char *get_instruction_name() const; }; /** Horizontally reduce a vector to a scalar or narrower vector using diff --git a/src/IREquality.cpp b/src/IREquality.cpp index 15a9bc01cbbb..edcfc3d067dc 100644 --- a/src/IREquality.cpp +++ b/src/IREquality.cpp @@ -98,7 +98,7 @@ class IRComparer : public IRVisitor { void visit(const Shuffle *) override; void visit(const Prefetch *) override; void visit(const Atomic *) override; - void visit(const VectorIntrinsic *) override; + void visit(const VectorInstruction *) override; void visit(const VectorReduce *) override; }; @@ -630,10 +630,10 @@ void IRComparer::visit(const Atomic *op) { compare_stmt(s->body, op->body); } -void IRComparer::visit(const VectorIntrinsic *op) { - const VectorIntrinsic *e = expr.as<VectorIntrinsic>(); +void IRComparer::visit(const VectorInstruction *op) { + const VectorInstruction *e = expr.as<VectorInstruction>(); - compare_names(e->name, op->name); + compare_scalar(e->op, op->op); compare_expr_vector(e->args, op->args); } diff --git a/src/IRMatch.cpp b/src/IRMatch.cpp index f7bb7d457ff1..4cfc409163af 100644 --- a/src/IRMatch.cpp +++ b/src/IRMatch.cpp @@ -296,11 +296,11 @@ class IRMatch : public IRVisitor { } } - void visit(const VectorIntrinsic *op) override { - const VectorIntrinsic *e = expr.as<VectorIntrinsic>(); + void visit(const VectorInstruction *op) override { + const VectorInstruction *e = expr.as<VectorInstruction>(); if (result && e && types_match(op->type, e->type) && - e->name == op->name && + e->op == op->op && e->args.size() == op->args.size()) { for (size_t i = 0; result && (i < e->args.size()); i++) { // FIXME: should we early-out? 
Here and in Call* @@ -521,9 +521,9 @@ bool equal_helper(const BaseExprNode &a, const BaseExprNode &b) noexcept { case IRNodeType::Shuffle: return (equal_helper(((const Shuffle &)a).vectors, ((const Shuffle &)b).vectors) && equal_helper(((const Shuffle &)a).indices, ((const Shuffle &)b).indices)); - case IRNodeType::VectorIntrinsic: - return (((const VectorIntrinsic &)a).name == ((const VectorIntrinsic &)b).name && - equal_helper(((const VectorIntrinsic &)a).args, ((const VectorIntrinsic &)b).args)); + case IRNodeType::VectorInstruction: + return (((const VectorInstruction &)a).op == ((const VectorInstruction &)b).op && + equal_helper(((const VectorInstruction &)a).args, ((const VectorInstruction &)b).args)); case IRNodeType::VectorReduce: // As with Cast above, we use equal instead of equal_helper // here, because while we know a.type == b.type, we don't know diff --git a/src/IRMatch.h b/src/IRMatch.h index 2d1394742713..982a8dcaeace 100644 --- a/src/IRMatch.h +++ b/src/IRMatch.h @@ -1886,36 +1886,36 @@ HALIDE_ALWAYS_INLINE auto ramp(A &&a, B &&b, C &&c) noexcept -> RampOp struct VectorIntrinOp { struct pattern_tag {}; - const std::string &intrin_name; + const VectorInstruction::InstructionOp op; std::tuple args; static constexpr uint32_t binds = bitwise_or_reduce((bindings::mask)...); - constexpr static IRNodeType min_node_type = IRNodeType::VectorIntrinsic; - constexpr static IRNodeType max_node_type = IRNodeType::VectorIntrinsic; + constexpr static IRNodeType min_node_type = IRNodeType::VectorInstruction; + constexpr static IRNodeType max_node_type = IRNodeType::VectorInstruction; constexpr static bool canonical = and_reduce((Args::canonical)...); template::type> - HALIDE_ALWAYS_INLINE bool match_args(int, const VectorIntrinsic &v, MatcherState &state) const noexcept { + HALIDE_ALWAYS_INLINE bool match_args(int, const VectorInstruction &v, MatcherState &state) const noexcept { using T = decltype(std::get(args)); return (std::get(args).template match(*v.args[i].get(), state) && match_args::mask>(0, v, state)); } template - HALIDE_ALWAYS_INLINE bool match_args(double, const VectorIntrinsic &v, MatcherState &state) const noexcept { + HALIDE_ALWAYS_INLINE bool match_args(double, const VectorInstruction &v, MatcherState &state) const noexcept { return true; } template HALIDE_ALWAYS_INLINE bool match(const BaseExprNode &e, MatcherState &state) const noexcept { - if (e.node_type != IRNodeType::VectorIntrinsic) { + if (e.node_type != IRNodeType::VectorInstruction) { return false; } - const VectorIntrinsic &v = (const VectorIntrinsic &)e; - return (v.name == intrin_name && match_args<0, bound>(0, v, state)); + const VectorInstruction &v = (const VectorInstruction &)e; + return (v.op == op && match_args<0, bound>(0, v, state)); } template(args).make(state, {}); // } - return VectorIntrinsic::make(type_hint, intrin_name, r_args); + return VectorInstruction::make(type_hint, op, r_args); } constexpr static bool foldable = false; HALIDE_ALWAYS_INLINE - VectorIntrinOp(const std::string &name, Args... args) noexcept - : intrin_name(name), args(args...) { + VectorIntrinOp(const VectorInstruction::InstructionOp _op, Args... args) noexcept + : op(_op), args(args...) 
{ static_assert(sizeof...(Args) > 0 && sizeof...(Args) <= 3, - "VectorIntrinsicOp must have non-zero arguments, and update make() if more than 3 arguments."); + "VectorInstructionOp must have non-zero arguments, and update make() if more than 3 arguments."); } }; @@ -1975,15 +1975,15 @@ template std::ostream &operator<<(std::ostream &s, const VectorIntrinOp &op) { // TODO(rootjalex): Should we print the type? s << "vector_intrin(\""; - s << op.intrin_name << "\", "; + s << op.op << "\", "; op.print_args(s); s << ")"; return s; } template -HALIDE_ALWAYS_INLINE auto v_intrin(const std::string &name, Args... args) noexcept -> VectorIntrinOp { - return {name, pattern_arg(args)...}; +HALIDE_ALWAYS_INLINE auto v_intrin(const VectorInstruction::InstructionOp op, Args... args) noexcept -> VectorIntrinOp { + return {op, pattern_arg(args)...}; } template diff --git a/src/IRMutator.cpp b/src/IRMutator.cpp index e075897d6694..b1703a6cccd1 100644 --- a/src/IRMutator.cpp +++ b/src/IRMutator.cpp @@ -327,12 +327,12 @@ Expr IRMutator::visit(const Shuffle *op) { return Shuffle::make(new_vectors, op->indices); } -Expr IRMutator::visit(const VectorIntrinsic *op) { +Expr IRMutator::visit(const VectorInstruction *op) { auto [new_args, changed] = mutate_with_changes(op->args); if (!changed) { return op; } - return VectorIntrinsic::make(op->type, op->name, new_args); + return VectorInstruction::make(op->type, op->op, new_args); } Expr IRMutator::visit(const VectorReduce *op) { diff --git a/src/IRMutator.h b/src/IRMutator.h index e460b036b80f..4729bb08344f 100644 --- a/src/IRMutator.h +++ b/src/IRMutator.h @@ -81,7 +81,7 @@ class IRMutator { virtual Expr visit(const Call *); virtual Expr visit(const Let *); virtual Expr visit(const Shuffle *); - virtual Expr visit(const VectorIntrinsic *); + virtual Expr visit(const VectorInstruction *); virtual Expr visit(const VectorReduce *); virtual Stmt visit(const LetStmt *); diff --git a/src/IRPrinter.cpp b/src/IRPrinter.cpp index f609f28763fd..324b399e5548 100644 --- a/src/IRPrinter.cpp +++ b/src/IRPrinter.cpp @@ -1073,11 +1073,11 @@ void IRPrinter::visit(const Shuffle *op) { } } -void IRPrinter::visit(const VectorIntrinsic *op) { +void IRPrinter::visit(const VectorInstruction *op) { stream << "(" << op->type << ")vector_intrinsic(\"" - << op->name + << op->get_instruction_name() << "\", "; print_list(op->args); stream << ")"; diff --git a/src/IRPrinter.h b/src/IRPrinter.h index 1e7cc048b805..e4e89efd5806 100644 --- a/src/IRPrinter.h +++ b/src/IRPrinter.h @@ -194,7 +194,7 @@ class IRPrinter : public IRVisitor { void visit(const IfThenElse *) override; void visit(const Evaluate *) override; void visit(const Shuffle *) override; - void visit(const VectorIntrinsic *) override; + void visit(const VectorInstruction *) override; void visit(const VectorReduce *) override; void visit(const Prefetch *) override; void visit(const Atomic *) override; diff --git a/src/IRVisitor.cpp b/src/IRVisitor.cpp index 3b1956a51d8a..97c55d8075ac 100644 --- a/src/IRVisitor.cpp +++ b/src/IRVisitor.cpp @@ -257,7 +257,7 @@ void IRVisitor::visit(const Shuffle *op) { } } -void IRVisitor::visit(const VectorIntrinsic *op) { +void IRVisitor::visit(const VectorInstruction *op) { for (const auto &arg : op->args) { arg.accept(this); } @@ -521,7 +521,7 @@ void IRGraphVisitor::visit(const Shuffle *op) { } } -void IRGraphVisitor::visit(const VectorIntrinsic *op) { +void IRGraphVisitor::visit(const VectorInstruction *op) { for (const auto &arg : op->args) { include(arg); } diff --git a/src/IRVisitor.h 
b/src/IRVisitor.h index c9c170dd851d..5df16880dfed 100644 --- a/src/IRVisitor.h +++ b/src/IRVisitor.h @@ -71,7 +71,7 @@ class IRVisitor { virtual void visit(const IfThenElse *); virtual void visit(const Evaluate *); virtual void visit(const Shuffle *); - virtual void visit(const VectorIntrinsic *); + virtual void visit(const VectorInstruction *); virtual void visit(const VectorReduce *); virtual void visit(const Prefetch *); virtual void visit(const Fork *); @@ -143,7 +143,7 @@ class IRGraphVisitor : public IRVisitor { void visit(const IfThenElse *) override; void visit(const Evaluate *) override; void visit(const Shuffle *) override; - void visit(const VectorIntrinsic *) override; + void visit(const VectorInstruction *) override; void visit(const VectorReduce *) override; void visit(const Prefetch *) override; void visit(const Acquire *) override; @@ -226,8 +226,8 @@ class VariadicVisitor { return ((T *)this)->visit((const Let *)node, std::forward(args)...); case IRNodeType::Shuffle: return ((T *)this)->visit((const Shuffle *)node, std::forward(args)...); - case IRNodeType::VectorIntrinsic: - return ((T *)this)->visit((const VectorIntrinsic *)node, std::forward(args)...); + case IRNodeType::VectorInstruction: + return ((T *)this)->visit((const VectorInstruction *)node, std::forward(args)...); case IRNodeType::VectorReduce: return ((T *)this)->visit((const VectorReduce *)node, std::forward(args)...); // Explicitly list the Stmt types rather than using a @@ -290,7 +290,7 @@ class VariadicVisitor { case IRNodeType::Call: case IRNodeType::Let: case IRNodeType::Shuffle: - case IRNodeType::VectorIntrinsic: + case IRNodeType::VectorInstruction: case IRNodeType::VectorReduce: internal_error << "Unreachable"; break; diff --git a/src/ModulusRemainder.cpp b/src/ModulusRemainder.cpp index be0226b44e56..fcce870a5a29 100644 --- a/src/ModulusRemainder.cpp +++ b/src/ModulusRemainder.cpp @@ -74,7 +74,7 @@ class ComputeModulusRemainder : public IRVisitor { void visit(const Free *) override; void visit(const Evaluate *) override; void visit(const Shuffle *) override; - void visit(const VectorIntrinsic *) override; + void visit(const VectorInstruction *) override; void visit(const VectorReduce *) override; void visit(const Prefetch *) override; void visit(const Atomic *) override; @@ -214,8 +214,8 @@ void ComputeModulusRemainder::visit(const Shuffle *op) { result = ModulusRemainder{}; } -void ComputeModulusRemainder::visit(const VectorIntrinsic *op) { - internal_error << "modulus_remainder of VectorIntrinsic:\n" +void ComputeModulusRemainder::visit(const VectorInstruction *op) { + internal_error << "modulus_remainder of VectorInstruction:\n" << Expr(op) << "\n"; result = ModulusRemainder{}; } diff --git a/src/Monotonic.cpp b/src/Monotonic.cpp index 2c51c7fa960b..b5d8cea0d928 100644 --- a/src/Monotonic.cpp +++ b/src/Monotonic.cpp @@ -534,7 +534,7 @@ class DerivativeBounds : public IRVisitor { result = ConstantInterval::single_point(0); } - void visit(const VectorIntrinsic *op) override { + void visit(const VectorInstruction *op) override { // TODO(rootjalex): Should this be an error? 
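// Reviewer's sketch, not in the original patch: erroring here would be the stricter choice, but returning ConstantInterval::everything() below is a sound fallback; it simply reports that nothing is known about the derivative of an opaque VectorInstruction.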
result = ConstantInterval::everything(); } diff --git a/src/Simplify_Exprs.cpp b/src/Simplify_Exprs.cpp index 5aefebfa611f..0496fb4fc353 100644 --- a/src/Simplify_Exprs.cpp +++ b/src/Simplify_Exprs.cpp @@ -59,7 +59,7 @@ Expr Simplify::visit(const Broadcast *op, ExprInfo *bounds) { } } -Expr Simplify::visit(const VectorIntrinsic *op, ExprInfo *bounds) { +Expr Simplify::visit(const VectorInstruction *op, ExprInfo *bounds) { clear_bounds_info(bounds); return op; } diff --git a/src/Simplify_Internal.h b/src/Simplify_Internal.h index 7dae150042a4..1b0258a1d150 100644 --- a/src/Simplify_Internal.h +++ b/src/Simplify_Internal.h @@ -333,7 +333,7 @@ class Simplify : public VariadicVisitor { Expr visit(const Load *op, ExprInfo *bounds); Expr visit(const Call *op, ExprInfo *bounds); Expr visit(const Shuffle *op, ExprInfo *bounds); - Expr visit(const VectorIntrinsic *op, ExprInfo *bounds); + Expr visit(const VectorInstruction *op, ExprInfo *bounds); Expr visit(const VectorReduce *op, ExprInfo *bounds); Expr visit(const Let *op, ExprInfo *bounds); Stmt visit(const LetStmt *op); diff --git a/src/StmtToHtml.cpp b/src/StmtToHtml.cpp index ceddcabec83d..8dd6afdc8a73 100644 --- a/src/StmtToHtml.cpp +++ b/src/StmtToHtml.cpp @@ -712,10 +712,10 @@ class StmtToHtml : public IRVisitor { stream << close_span(); } - void visit(const VectorIntrinsic *op) override { - stream << open_span("VectorIntrinsic"); + void visit(const VectorInstruction *op) override { + stream << open_span("VectorInstruction"); stream << open_span("Type") << op->type << close_span(); - print_list(symbol("vector_intrinsic") + "(\"" + op->name + "\"", op->args, ")"); + print_list(symbol("vector_intrinsic") + "(\"" + op->get_instruction_name() + "\"", op->args, ")"); stream << close_span(); } diff --git a/src/X86Optimize.cpp b/src/X86Optimize.cpp index f4c2b758db55..4972e5c775c4 100644 --- a/src/X86Optimize.cpp +++ b/src/X86Optimize.cpp @@ -60,7 +60,7 @@ bool should_use_dot_product(const Expr &a, const Expr &b, std::vector &res return false; } -/** A top-down code optimizer that replaces Halide IR with VectorIntrinsics specific to x86. */ +/** A top-down code optimizer that replaces Halide IR with VectorInstructions specific to x86. */ class Optimize_X86 : public IRMutator { public: /** Create an x86 code optimizer. Processor features can be @@ -110,33 +110,33 @@ class Optimize_X86 : public IRMutator { // Accumulating pmaddubsw (rewrite( x + h_add(cast(Int(32, lanes * 4), widening_mul(y, z)), lanes), - v_intrin("dot_product", x, y, z), + v_intrin(VectorInstruction::dot_product, x, y, z), is_uint(y, 8) && is_int(z, 8)) || rewrite( x + h_add(cast(Int(32, lanes * 4), widening_mul(y, z)), lanes), - v_intrin("dot_product", x, z, y), + v_intrin(VectorInstruction::dot_product, x, z, y), is_int(y, 8) && is_uint(z, 8)) || rewrite( h_add(cast(Int(32, lanes * 4), widening_mul(x, y)), lanes) + z, - v_intrin("dot_product", z, x, y), + v_intrin(VectorInstruction::dot_product, z, x, y), is_uint(x, 8) && is_int(y, 8)) || rewrite( h_add(cast(Int(32, lanes * 4), widening_mul(x, y)), lanes) + z, - v_intrin("dot_product", z, y, x), + v_intrin(VectorInstruction::dot_product, z, y, x), is_int(x, 8) && is_uint(y, 8)) || // Accumulating pmaddwd. 
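// Sketch of the semantics being matched (assuming i16 operands, as the is_int(..., 16) guards require): vpmaddwd computes out[i] = a[2*i]*b[2*i] + a[2*i+1]*b[2*i+1] with i16*i16 -> i32 products, so x + h_add(widening_mul(y, z), lanes) is exactly its accumulating form.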
rewrite( x + h_add(widening_mul(y, z), lanes), - v_intrin("dot_product", x, y, z), + v_intrin(VectorInstruction::dot_product, x, y, z), is_int(y, 16, lanes * 2) && is_int(z, 16, lanes * 2)) || rewrite( h_add(widening_mul(x, y), lanes) + z, - v_intrin("dot_product", z, x, y), + v_intrin(VectorInstruction::dot_product, z, x, y), is_int(x, 16, lanes * 2) && is_int(y, 16, lanes * 2)) || false)) { @@ -149,7 +149,7 @@ class Optimize_X86 : public IRMutator { // We have dot_products for every x86 arch (because SSE2 has it), // so this is `always` safe (as long as the output type lanes has // a factor of 4). - return mutate(VectorIntrinsic::make(op->type, "dot_product", {ac, bd})); + return mutate(VectorInstruction::make(op->type, VectorInstruction::dot_product, {ac, bd})); } return IRMutator::visit(op); @@ -177,7 +177,7 @@ class Optimize_X86 : public IRMutator { Expr ac = Shuffle::make_interleave({matches[0], matches[2]}); Expr bd = Shuffle::make_interleave({matches[1], matches[3]}); // Always safe, see comment in Add case above. - return mutate(VectorIntrinsic::make(op->type, "dot_product", {ac, bd})); + return mutate(VectorInstruction::make(op->type, VectorInstruction::dot_product, {ac, bd})); } } @@ -208,38 +208,38 @@ class Optimize_X86 : public IRMutator { (target.has_feature(Target::SSE41) && rewrite( cast(Int(16, lanes), rounding_shift_right(widening_mul(x, y), 15)), - v_intrin("pmulhrs", x, y), + v_intrin(VectorInstruction::pmulhrs, x, y), is_int(x, 16) && is_int(y, 16))) || // saturating_narrow is always supported (via SSE2) for: // int32 -> int16, int16 -> int8, int16 -> uint8 rewrite( cast(Int(16, lanes), max(min(x, i32_i16min), i32_i16min)), - v_intrin("saturating_narrow", x), + v_intrin(VectorInstruction::saturating_narrow, x), is_int(x, 32)) || rewrite( cast(Int(8, lanes), max(min(x, i16_i8min), i16_i8min)), - v_intrin("saturating_narrow", x), + v_intrin(VectorInstruction::saturating_narrow, x), is_int(x, 16)) || rewrite( cast(UInt(8, lanes), max(min(x, i16_u8min), i16_u8min)), - v_intrin("saturating_narrow", x), + v_intrin(VectorInstruction::saturating_narrow, x), is_int(x, 16)) || // int32 -> uint16 is supported via SSE41 (target.has_feature(Target::SSE41) && rewrite( cast(UInt(16, lanes), max(min(x, i32_u16min), i32_u16min)), - v_intrin("saturating_narrow", x), + v_intrin(VectorInstruction::saturating_narrow, x), is_int(x, 32))) || // f32_to_bf16 is supported only via Target::AVX512_SapphireRapids (target.has_feature(Target::AVX512_SapphireRapids) && rewrite( cast(BFloat(16, lanes), x), - v_intrin("f32_to_bf16", x), + v_intrin(VectorInstruction::f32_to_bf16, x), is_float(x, 32))) || false) { @@ -285,6 +285,7 @@ class Optimize_X86 : public IRMutator { const int bits = op->type.bits(); auto rewrite = IRMatcher::rewriter(op, op->type); + using IRMatcher::typed; Type unsigned_type = op->type.with_code(halide_type_uint); auto x_uint = cast(unsigned_type, x); @@ -321,14 +322,17 @@ class Optimize_X86 : public IRMutator { ((op->type.is_int_or_uint() && bits == 16) && rewrite( mul_shift_right(x, y, 16), - v_intrin("pmulh", x, y))) || + v_intrin(VectorInstruction::pmulh, x, y))) || // saturating_pmulhrs is supported via SSE41 ((target.has_feature(Target::SSE41) && op->type.is_int() && bits == 16) && rewrite( rounding_mul_shift_right(x, y, 15), - v_intrin("saturating_pmulhrs", x, y))) || + // saturating_pmulhrs + select((x == typed(Int(16, lanes), -32768)) && (y == typed(Int(16, lanes), -32768)), + typed(Int(16, lanes), 32767), + v_intrin(VectorInstruction::pmulhrs, x, y)))) || // 
TODO(rootjalex): The following intrinsics are // simply one-to-one mappings, should they even @@ -340,29 +344,29 @@ class Optimize_X86 : public IRMutator { (op->type.is_float() && bits == 32)) && rewrite( abs(x), - v_intrin("abs", x))) || + v_intrin(VectorInstruction::abs, x))) || // saturating ops for 8 and 16 bits are always supported (via SSE2). ((bits == 8 || bits == 16) && (rewrite( saturating_add(x, y), - v_intrin("saturating_add", x, y)) || + v_intrin(VectorInstruction::saturating_add, x, y)) || rewrite( saturating_sub(x, y), - v_intrin("saturating_sub", x, y)))) || + v_intrin(VectorInstruction::saturating_sub, x, y)))) || // pavg ops for 8 and 16 bits are always supported (via SSE2). ((op->type.is_uint() && (bits == 8 || bits == 16)) && rewrite( rounding_halving_add(x, y), - v_intrin("rounding_halving_add", x, y))) || + v_intrin(VectorInstruction::rounding_halving_add, x, y))) || // int16 -> int32 widening_mul has a (v)pmaddwd implementation. // always supported (via SSE2). ((op->type.is_int() && (bits == 32)) && rewrite( widening_mul(x, y), - v_intrin("widening_mul", x, y), + v_intrin(VectorInstruction::widening_mul, x, y), is_int(x, 16) && is_int(y, 16))) || (target.has_feature(Target::AVX512_SapphireRapids) && @@ -370,27 +374,27 @@ class Optimize_X86 : public IRMutator { // SapphireRapids accumulating dot products. (rewrite( saturating_add(x, h_satadd(cast(Int(32, lanes * 4), widening_mul(y, z)), lanes)), - v_intrin("saturating_dot_product", x, y, z), + v_intrin(VectorInstruction::saturating_dot_product, x, y, z), is_uint(y, 8) && is_int(z, 8)) || rewrite( saturating_add(x, h_satadd(cast(Int(32, lanes * 4), widening_mul(y, z)), lanes)), - v_intrin("saturating_dot_product", x, z, y), + v_intrin(VectorInstruction::saturating_dot_product, x, z, y), is_int(y, 8) && is_uint(z, 8)) || rewrite( saturating_add(x, h_satadd(cast(Int(32, lanes * 2), widening_mul(y, z)), lanes)), - v_intrin("saturating_dot_product", x, y, z), + v_intrin(VectorInstruction::saturating_dot_product, x, y, z), is_uint(y, 8) && is_int(z, 8)) || rewrite( saturating_add(x, h_satadd(cast(Int(32, lanes * 2), widening_mul(y, z)), lanes)), - v_intrin("saturating_dot_product", x, z, y), + v_intrin(VectorInstruction::saturating_dot_product, x, z, y), is_int(y, 8) && is_uint(z, 8)) || rewrite( saturating_add(x, h_satadd(widening_mul(y, z), lanes)), - v_intrin("saturating_dot_product", x, z, y), + v_intrin(VectorInstruction::saturating_dot_product, x, z, y), is_int(y, 16, lanes * 2) && is_int(z, 16, lanes * 2)) || false)) || @@ -449,18 +453,18 @@ class Optimize_X86 : public IRMutator { ((factor == 2) && (rewrite( h_add(cast(Int(32, value_lanes), widening_mul(x, y)), lanes), - v_intrin("dot_product", cast(Int(16, value_lanes), x), cast(Int(16, value_lanes), y)), + v_intrin(VectorInstruction::dot_product, cast(Int(16, value_lanes), x), cast(Int(16, value_lanes), y)), x_is_int_or_uint && y_is_int_or_uint) || // Horizontal widening add via pmaddwd rewrite( h_add(cast(Int(32, value_lanes), x), lanes), - v_intrin("dot_product", x, make_const(Int(16, value_lanes), 1)), + v_intrin(VectorInstruction::dot_product, x, make_const(Int(16, value_lanes), 1)), is_int(x, 16)) || (rewrite( h_add(widening_mul(x, y), lanes), - v_intrin("dot_product", x, y), + v_intrin(VectorInstruction::dot_product, x, y), is_int(x, 16) && is_int(y, 16))) || // pmaddub supported via SSE41 @@ -468,23 +472,23 @@ class Optimize_X86 : public IRMutator { // Horizontal widening adds using 2-way saturating dot products. 
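// Sketch of the trick below, not spelled out in the patch: a pairwise widening add is a 2-way dot product against a vector of ones, so pmaddubsw with a constant-1 operand implements h_add. The u8 case cannot actually saturate, since 255 + 255 = 510 stays well below the i16 bound of 32767, which is what makes the reinterpreting casts sound.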
(rewrite( h_add(cast(UInt(16, value_lanes), x), lanes), - cast(UInt(16, lanes), typed(Int(16, lanes), v_intrin("saturating_dot_product", x, make_const(Int(8, value_lanes), 1)))), + cast(UInt(16, lanes), typed(Int(16, lanes), v_intrin(VectorInstruction::saturating_dot_product, x, make_const(Int(8, value_lanes), 1)))), is_uint(x, 8)) || rewrite( h_add(cast(Int(16, value_lanes), x), lanes), - v_intrin("saturating_dot_product", x, make_const(Int(8, value_lanes), 1)), + v_intrin(VectorInstruction::saturating_dot_product, x, make_const(Int(8, value_lanes), 1)), is_uint(x, 8)) || rewrite( h_add(cast(Int(16, value_lanes), x), lanes), - v_intrin("saturating_dot_product", make_const(UInt(8, value_lanes), 1), x), + v_intrin(VectorInstruction::saturating_dot_product, make_const(UInt(8, value_lanes), 1), x), is_int(x, 8)) || // SSE41 and AVX2 support horizontal_add via phadd intrinsics. rewrite( h_add(x, lanes), - v_intrin("horizontal_add", x), + v_intrin(VectorInstruction::horizontal_add, x), is_int(x, 16, lanes * 2) || is_uint(x, 16, lanes * 2) || is_int(x, 32, lanes * 2) || is_uint(x, 32, lanes * 2)) || @@ -504,12 +508,12 @@ class Optimize_X86 : public IRMutator { ((factor == 2) && target.has_feature(Target::SSE41) && (rewrite( h_satadd(widening_mul(x, y), lanes), - v_intrin("saturating_dot_product", x, y), + v_intrin(VectorInstruction::saturating_dot_product, x, y), is_uint(x, 8) && is_int(y, 8)) || rewrite( h_satadd(widening_mul(x, y), lanes), - v_intrin("saturating_dot_product", y, x), + v_intrin(VectorInstruction::saturating_dot_product, y, x), is_int(x, 8) && is_uint(y, 8)) || false))) { diff --git a/src/X86Optimize.h b/src/X86Optimize.h index df37c7dd896f..9ab9d5f54269 100644 --- a/src/X86Optimize.h +++ b/src/X86Optimize.h @@ -12,7 +12,7 @@ namespace Halide { namespace Internal { -/** Perform vector instruction selection, inserting VectorIntrinsic nodes. */ +/** Perform vector instruction selection, inserting VectorInstruction nodes. 
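* A hypothetical call site, sketched here for orientation (the lowering-side hookup is not shown in this series): after vectorization, something like stmt = optimize_x86_instructions(stmt, target, codegen); would run so that CodeGen_X86 only ever receives pre-selected VectorInstruction nodes.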
*/ Stmt optimize_x86_instructions(Stmt stmt, const Target &target, const CodeGen_LLVM *codegen); } // namespace Internal From 339b6b71c4c2e4b6d87312b67fb8c5d393695e38 Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Wed, 27 Jul 2022 17:34:02 -0400 Subject: [PATCH 25/55] undef -> poison --- src/runtime/x86_avx2.ll | 9 ++++----- src/runtime/x86_sse41.ll | 9 ++++----- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/src/runtime/x86_avx2.ll b/src/runtime/x86_avx2.ll index 4f6f11718940..a340c240d734 100644 --- a/src/runtime/x86_avx2.ll +++ b/src/runtime/x86_avx2.ll @@ -74,16 +74,16 @@ define weak_odr <16 x i16> @hadd_pmadd_i8_avx2(<32 x i8> %a) nounwind alwaysinli declare <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8>, <32 x i8>) nounwind readnone define weak_odr <16 x i16> @phaddw_avx2(<32 x i16> %a) nounwind alwaysinline { - %1 = shufflevector <32 x i16> %a, <32 x i16> undef, <16 x i32> - %2 = shufflevector <32 x i16> %a, <32 x i16> undef, <16 x i32> + %1 = shufflevector <32 x i16> %a, <32 x i16> poison, <16 x i32> + %2 = shufflevector <32 x i16> %a, <32 x i16> poison, <16 x i32> %3 = tail call <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16> %1, <16 x i16> %2) ret <16 x i16> %3 } declare <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16>, <16 x i16>) nounwind readnone define weak_odr <8 x i32> @phaddd_avx2(<16 x i32> %a) nounwind alwaysinline { - %1 = shufflevector <16 x i32> %a, <16 x i32> undef, <8 x i32> - %2 = shufflevector <16 x i32> %a, <16 x i32> undef, <8 x i32> + %1 = shufflevector <16 x i32> %a, <16 x i32> poison, <8 x i32> + %2 = shufflevector <16 x i32> %a, <16 x i32> poison, <8 x i32> %3 = tail call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> %1, <8 x i32> %2) ret <8 x i32> %3 } @@ -103,4 +103,3 @@ define weak_odr <16 x i16> @phaddw_avx2(<32 x i16> %a) nounwind alwaysinline { ret <8 x i32> %res } declare <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16>, <16 x i16>) nounwind readnone - \ No newline at end of file diff --git a/src/runtime/x86_sse41.ll b/src/runtime/x86_sse41.ll index caf728957e59..59045d5e6337 100644 --- a/src/runtime/x86_sse41.ll +++ b/src/runtime/x86_sse41.ll @@ -94,18 +94,17 @@ define weak_odr <8 x i16> @hadd_pmadd_i8_sse3(<16 x i8> %a) nounwind alwaysinlin declare <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8>, <16 x i8>) nounwind readnone define weak_odr <8 x i16> @phaddw_sse3(<16 x i16> %a) nounwind alwaysinline { - %1 = shufflevector <16 x i16> %a, <16 x i16> undef, <8 x i32> - %2 = shufflevector <16 x i16> %a, <16 x i16> undef, <8 x i32> + %1 = shufflevector <16 x i16> %a, <16 x i16> poison, <8 x i32> + %2 = shufflevector <16 x i16> %a, <16 x i16> poison, <8 x i32> %3 = tail call <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16> %1, <8 x i16> %2) ret <8 x i16> %3 } declare <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16>, <8 x i16>) nounwind readnone define weak_odr <4 x i32> @phaddd_sse3(<8 x i32> %a) nounwind alwaysinline { - %1 = shufflevector <8 x i32> %a, <8 x i32> undef, <4 x i32> - %2 = shufflevector <8 x i32> %a, <8 x i32> undef, <4 x i32> + %1 = shufflevector <8 x i32> %a, <8 x i32> poison, <4 x i32> + %2 = shufflevector <8 x i32> %a, <8 x i32> poison, <4 x i32> %3 = tail call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> %1, <4 x i32> %2) ret <4 x i32> %3 } declare <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32>, <4 x i32>) nounwind readnone - \ No newline at end of file From 17c99240c44dd2b4cc03e63e61010325849f298b Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Wed, 27 Jul 2022 17:36:14 -0400 Subject: [PATCH 26/55] fully remove 
saturating_pmulhrs --- src/CodeGen_X86.cpp | 2 -- src/runtime/x86_avx2.ll | 10 ---------- src/runtime/x86_sse41.ll | 10 ---------- 3 files changed, 22 deletions(-) diff --git a/src/CodeGen_X86.cpp b/src/CodeGen_X86.cpp index 8caa3ee756b0..4d208e970a67 100644 --- a/src/CodeGen_X86.cpp +++ b/src/CodeGen_X86.cpp @@ -173,11 +173,9 @@ const x86Intrinsic intrinsic_defs[] = { {"llvm.x86.avx2.pmulh.w", Int(16, 16), "pmulh", {Int(16, 16), Int(16, 16)}, Target::AVX2}, {"llvm.x86.avx2.pmulhu.w", UInt(16, 16), "pmulh", {UInt(16, 16), UInt(16, 16)}, Target::AVX2}, {"llvm.x86.avx2.pmul.hr.sw", Int(16, 16), "pmulhrs", {Int(16, 16), Int(16, 16)}, Target::AVX2}, - {"saturating_pmulhrswx16", Int(16, 16), "saturating_pmulhrs", {Int(16, 16), Int(16, 16)}, Target::AVX2}, {"llvm.x86.sse2.pmulh.w", Int(16, 8), "pmulh", {Int(16, 8), Int(16, 8)}}, {"llvm.x86.sse2.pmulhu.w", UInt(16, 8), "pmulh", {UInt(16, 8), UInt(16, 8)}}, {"llvm.x86.ssse3.pmul.hr.sw.128", Int(16, 8), "pmulhrs", {Int(16, 8), Int(16, 8)}, Target::SSE41}, - {"saturating_pmulhrswx8", Int(16, 8), "saturating_pmulhrs", {Int(16, 8), Int(16, 8)}, Target::SSE41}, // Convert FP32 to BF16 {"vcvtne2ps2bf16x32", BFloat(16, 32), "f32_to_bf16", {Float(32, 32)}, Target::AVX512_SapphireRapids}, diff --git a/src/runtime/x86_avx2.ll b/src/runtime/x86_avx2.ll index a340c240d734..221d9560502d 100644 --- a/src/runtime/x86_avx2.ll +++ b/src/runtime/x86_avx2.ll @@ -52,16 +52,6 @@ define weak_odr <8 x i32> @abs_i32x8(<8 x i32> %arg) { ret <8 x i32> %3 } -define weak_odr <16 x i16> @saturating_pmulhrswx16(<16 x i16> %a, <16 x i16> %b) nounwind uwtable readnone alwaysinline { - %1 = tail call <16 x i16> @llvm.x86.avx2.pmul.hr.sw(<16 x i16> %a, <16 x i16> %b) - %2 = icmp eq <16 x i16> %a, - %3 = icmp eq <16 x i16> %b, - %4 = and <16 x i1> %2, %3 - %5 = select <16 x i1> %4, <16 x i16> , <16 x i16> %1 - ret <16 x i16> %5 -} -declare <16 x i16> @llvm.x86.avx2.pmul.hr.sw(<16 x i16>, <16 x i16>) nounwind readnone - define weak_odr <16 x i16> @hadd_pmadd_u8_avx2(<32 x i8> %a) nounwind alwaysinline { %1 = tail call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> %a, <32 x i8> ) ret <16 x i16> %1 diff --git a/src/runtime/x86_sse41.ll b/src/runtime/x86_sse41.ll index 59045d5e6337..6c7b2356de75 100644 --- a/src/runtime/x86_sse41.ll +++ b/src/runtime/x86_sse41.ll @@ -72,16 +72,6 @@ define weak_odr <4 x i32> @abs_i32x4(<4 x i32> %x) nounwind uwtable readnone alw ret <4 x i32> %3 } -define weak_odr <8 x i16> @saturating_pmulhrswx8(<8 x i16> %a, <8 x i16> %b) nounwind uwtable readnone alwaysinline { - %1 = tail call <8 x i16> @llvm.x86.ssse3.pmul.hr.sw.128(<8 x i16> %a, <8 x i16> %b) - %2 = icmp eq <8 x i16> %a, - %3 = icmp eq <8 x i16> %b, - %4 = and <8 x i1> %2, %3 - %5 = select <8 x i1> %4, <8 x i16> , <8 x i16> %1 - ret <8 x i16> %5 -} -declare <8 x i16> @llvm.x86.ssse3.pmul.hr.sw.128(<8 x i16>, <8 x i16>) nounwind readnone - define weak_odr <8 x i16> @hadd_pmadd_u8_sse3(<16 x i8> %a) nounwind alwaysinline { %1 = tail call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8> %a, <16 x i8> ) ret <8 x i16> %1 From 6c74a63554761dc84c813c38e90b2042b3bc8977 Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Wed, 27 Jul 2022 17:36:57 -0400 Subject: [PATCH 27/55] clang format --- src/IR.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/IR.cpp b/src/IR.cpp index 07832b93b6ca..6f6ed96e200c 100644 --- a/src/IR.cpp +++ b/src/IR.cpp @@ -940,7 +940,6 @@ const char *VectorInstruction::get_instruction_name() const { return instruction_op_names[op]; } - Expr 
VectorReduce::make(VectorReduce::Operator op, Expr vec, int lanes) { From 11690d793edba3719451115e921731f7c05899a2 Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Wed, 27 Jul 2022 23:42:12 -0400 Subject: [PATCH 28/55] disable UB for VectorInstruction node --- src/IRMutator.cpp | 12 +++++++----- src/IRVisitor.cpp | 14 ++++++++------ 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/src/IRMutator.cpp b/src/IRMutator.cpp index b1703a6cccd1..9ac27753ae09 100644 --- a/src/IRMutator.cpp +++ b/src/IRMutator.cpp @@ -328,11 +328,13 @@ Expr IRMutator::visit(const Shuffle *op) { } Expr IRMutator::visit(const VectorInstruction *op) { - auto [new_args, changed] = mutate_with_changes(op->args); - if (!changed) { - return op; - } - return VectorInstruction::make(op->type, op->op, new_args); + internal_error << "Always implement VectorInstruction visitor for IRMutator subclass\n"; + // auto [new_args, changed] = mutate_with_changes(op->args); + // if (!changed) { + // return op; + // } + // return VectorInstruction::make(op->type, op->op, new_args); + return Expr(); } Expr IRMutator::visit(const VectorReduce *op) { diff --git a/src/IRVisitor.cpp b/src/IRVisitor.cpp index 97c55d8075ac..bbca23a77a7b 100644 --- a/src/IRVisitor.cpp +++ b/src/IRVisitor.cpp @@ -258,9 +258,10 @@ void IRVisitor::visit(const Shuffle *op) { } void IRVisitor::visit(const VectorInstruction *op) { - for (const auto &arg : op->args) { - arg.accept(this); - } + internal_error << "Always implement VectorInstruction visitor for IRVisitor subclass\n"; + // for (const auto &arg : op->args) { + // arg.accept(this); + // } } void IRVisitor::visit(const VectorReduce *op) { @@ -522,9 +523,10 @@ void IRGraphVisitor::visit(const Shuffle *op) { } void IRGraphVisitor::visit(const VectorInstruction *op) { - for (const auto &arg : op->args) { - include(arg); - } + internal_error << "Always implement VectorInstruction visitor for IRGraphVisitor subclass\n"; + // for (const auto &arg : op->args) { + // include(arg); + // } } void IRGraphVisitor::visit(const VectorReduce *op) { From 3648ca6172dfffb337865de61ca59058acdb9395 Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Thu, 28 Jul 2022 00:06:39 -0400 Subject: [PATCH 29/55] implement a base class for instruction selection --- Makefile | 2 ++ src/CMakeLists.txt | 2 ++ src/InstructionSelector.cpp | 25 ++++++++++++++++++ src/InstructionSelector.h | 35 +++++++++++++++++++++++++ src/X86Optimize.cpp | 52 ++++++++++++------------------------- 5 files changed, 80 insertions(+), 36 deletions(-) create mode 100644 src/InstructionSelector.cpp create mode 100644 src/InstructionSelector.h diff --git a/Makefile b/Makefile index bd0381578b47..1eebde7970c4 100644 --- a/Makefile +++ b/Makefile @@ -476,6 +476,7 @@ SOURCE_FILES = \ InjectHostDevBufferCopies.cpp \ Inline.cpp \ InlineReductions.cpp \ + InstructionSelector.cpp \ IntegerDivisionTable.cpp \ Interval.cpp \ Introspection.cpp \ @@ -656,6 +657,7 @@ HEADER_FILES = \ InjectHostDevBufferCopies.h \ Inline.h \ InlineReductions.h \ + InstructionSelector.h \ IntegerDivisionTable.h \ Interval.h \ Introspection.h \ diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 64d1a9f4316e..e1c51f19e641 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -81,6 +81,7 @@ set(HEADER_FILES InjectHostDevBufferCopies.h Inline.h InlineReductions.h + InstructionSelector.h IntegerDivisionTable.h Interval.h Introspection.h @@ -245,6 +246,7 @@ set(SOURCE_FILES InjectHostDevBufferCopies.cpp Inline.cpp InlineReductions.cpp + InstructionSelector.cpp 
IntegerDivisionTable.cpp Interval.cpp Introspection.cpp diff --git a/src/InstructionSelector.cpp b/src/InstructionSelector.cpp new file mode 100644 index 000000000000..e408895eec6f --- /dev/null +++ b/src/InstructionSelector.cpp @@ -0,0 +1,25 @@ +#include "InstructionSelector.h" + +#include "CodeGen_Internal.h" + + +namespace Halide { +namespace Internal { + +InstructionSelector::InstructionSelector(const Target &t, const CodeGen_LLVM *c) : target(t), codegen(c) { +} + +Expr InstructionSelector::visit(const Div *op) { + if (!op->type.is_vector() || !op->type.is_int_or_uint()) { + return IRGraphMutator::visit(op); + } + // Lower division here in order to do pattern-matching on intrinsics. + return mutate(lower_int_uint_div(op->a, op->b)); +} + +Expr InstructionSelector::visit(const VectorReduce *op) { + return codegen->split_vector_reduce(op, Expr()); +} + +} // namespace Internal +} // namespace Halide diff --git a/src/InstructionSelector.h b/src/InstructionSelector.h new file mode 100644 index 000000000000..723ec6addd69 --- /dev/null +++ b/src/InstructionSelector.h @@ -0,0 +1,35 @@ +#ifndef HALIDE_INSTR_SELECTOR_H +#define HALIDE_INSTR_SELECTOR_H + +/** \file + * Defines a base class for VectorInstruction selection. + */ + +#include "CodeGen_LLVM.h" +#include "IR.h" +#include "IRMutator.h" +#include "Target.h" + +namespace Halide { +namespace Internal { + +/** A base class for vector instruction selection. + * The default implementation lowers int and uint + * division via `lower_int_uint_div` and splits + * VectorReduce nodes via CodeGen_LLVM::split_vector_reduce(). + */ +class InstructionSelector : public IRGraphMutator { +protected: + const Target &target; + const CodeGen_LLVM *codegen; + + Expr visit(const Div *) override; + Expr visit(const VectorReduce *) override; +public: + InstructionSelector(const Target &target, const CodeGen_LLVM *codegen); +}; + +} // namespace Internal +} // namespace Halide + +#endif diff --git a/src/X86Optimize.cpp b/src/X86Optimize.cpp index 4972e5c775c4..bf2c317f189b 100644 --- a/src/X86Optimize.cpp +++ b/src/X86Optimize.cpp @@ -1,9 +1,8 @@ #include "X86Optimize.h" #include "CSE.h" -// FIXME: move lower_int_uint_div out of CodeGen_Internal to remove this dependency. -#include "CodeGen_Internal.h" #include "FindIntrinsics.h" +#include "InstructionSelector.h" #include "IR.h" #include "IRMatch.h" #include "IRMutator.h" #include "IROperator.h" #include "Simplify.h" namespace Halide { namespace Internal { @@ -61,12 +60,12 @@ bool should_use_dot_product(const Expr &a, const Expr &b, std::vector &res } /** A top-down code optimizer that replaces Halide IR with VectorInstructions specific to x86. */ -class Optimize_X86 : public IRMutator { +class Optimize_X86 : public InstructionSelector { public: /** Create an x86 code optimizer. Processor features can be * enabled using the appropriate flags in the target struct. */ - Optimize_X86(const Target &t, const CodeGen_LLVM *c) - : target(t), codegen(c) { + Optimize_X86(const Target &target, const CodeGen_LLVM *codegen) + : InstructionSelector(target, codegen) { } protected: @@ -77,20 +76,12 @@ class Optimize_X86 : public InstructionSelector { return type.is_vector(); } - using IRMutator::visit; - - Expr visit(const Div *op) override { - if (!should_peephole_optimize(op->type) || !op->type.is_int_or_uint()) { - return IRMutator::visit(op); - } - // Lower division here in order to do pattern-matching on intrinsics. 
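// Reviewer's aside, not part of either file: the x86-specific Div visitor deleted below is essentially the same lowering that the new InstructionSelector::visit(const Div *) above now provides for every backend.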
- return mutate(lower_int_uint_div(op->a, op->b)); - } + using IRGraphMutator::visit; /** Nodes for which we want to emit specific sse/avx intrinsics */ Expr visit(const Add *op) override { if (!should_peephole_optimize(op->type)) { - return IRMutator::visit(op); + return IRGraphMutator::visit(op); } std::vector matches; @@ -152,12 +143,12 @@ class Optimize_X86 : public IRMutator { return mutate(VectorInstruction::make(op->type, VectorInstruction::dot_product, {ac, bd})); } - return IRMutator::visit(op); + return IRGraphMutator::visit(op); } Expr visit(const Sub *op) override { if (!should_peephole_optimize(op->type)) { - return IRMutator::visit(op); + return IRGraphMutator::visit(op); } std::vector matches; @@ -181,12 +172,12 @@ class Optimize_X86 : public IRMutator { } } - return IRMutator::visit(op); + return IRGraphMutator::visit(op); } Expr visit(const Cast *op) override { if (!should_peephole_optimize(op->type)) { - return IRMutator::visit(op); + return IRGraphMutator::visit(op); } const int lanes = op->type.lanes(); @@ -248,12 +239,12 @@ class Optimize_X86 : public IRMutator { // TODO: should we handle CodeGen_X86's weird 8 -> 16 bit issue here? - return IRMutator::visit(op); + return IRGraphMutator::visit(op); } Expr visit(const Call *op) override { if (!should_peephole_optimize(op->type)) { - return IRMutator::visit(op); + return IRGraphMutator::visit(op); } // TODO: This optimization is hard to do via a rewrite-rule because of lossless_cast. @@ -405,6 +396,7 @@ class Optimize_X86 : public IRMutator { // Fixed-point intrinsics should be lowered here. // This is safe because this mutator is top-down. + // FIXME: Should this be default behavior of the base InstructionSelector class? if (op->is_intrinsic({ Call::halving_add, Call::halving_sub, @@ -426,7 +418,7 @@ class Optimize_X86 : public IRMutator { return mutate(lower_intrinsic(op)); } - return IRMutator::visit(op); + return IRGraphMutator::visit(op); } Expr visit(const VectorReduce *op) override { @@ -435,7 +427,7 @@ class Optimize_X86 : public IRMutator { // matching here. 
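// Note: only Add and SaturatingAdd reductions receive x86-specific rewrites in this visitor; everything else now falls through to InstructionSelector::visit(const VectorReduce *), which defers to codegen->split_vector_reduce() for generic splitting.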
if ((op->op != VectorReduce::Add && op->op != VectorReduce::SaturatingAdd) || !should_peephole_optimize(op->type)) { - return IRMutator::visit(op); + return InstructionSelector::visit(op); } const int lanes = op->type.lanes(); @@ -525,22 +517,10 @@ class Optimize_X86 : public IRMutator { break; } - return attempt_vector_split(op); - } - - Expr attempt_vector_split(const VectorReduce *op) { - Expr split = codegen->split_vector_reduce(op, Expr()); - if (split.defined() && !split.same_as(op)) { - return mutate(split); - } else { - return IRMutator::visit(op); - } + return InstructionSelector::visit(op); } private: - const Target ⌖ - const CodeGen_LLVM *codegen; - IRMatcher::Wild<0> x; IRMatcher::Wild<1> y; IRMatcher::Wild<2> z; From c21bec5bbdcfb459cb31cc914c2189f0cee71139 Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Thu, 28 Jul 2022 00:24:05 -0400 Subject: [PATCH 30/55] clang format --- src/InstructionSelector.cpp | 4 ++-- src/InstructionSelector.h | 1 + src/X86Optimize.cpp | 44 ++++++++++++++++++------------------- 3 files changed, 25 insertions(+), 24 deletions(-) diff --git a/src/InstructionSelector.cpp b/src/InstructionSelector.cpp index e408895eec6f..569c04925638 100644 --- a/src/InstructionSelector.cpp +++ b/src/InstructionSelector.cpp @@ -2,11 +2,11 @@ #include "CodeGen_Internal.h" - namespace Halide { namespace Internal { -InstructionSelector::InstructionSelector(const Target &t, const CodeGen_LLVM *c) : target(t), codegen(c) { +InstructionSelector::InstructionSelector(const Target &t, const CodeGen_LLVM *c) + : target(t), codegen(c) { } Expr InstructionSelector::visit(const Div *op) { diff --git a/src/InstructionSelector.h b/src/InstructionSelector.h index 723ec6addd69..351a7c785a58 100644 --- a/src/InstructionSelector.h +++ b/src/InstructionSelector.h @@ -25,6 +25,7 @@ class InstructionSelector : public IRGraphMutator { Expr visit(const Div *) override; Expr visit(const VectorReduce *) override; + public: InstructionSelector(const Target &target, const CodeGen_LLVM *codegen); }; diff --git a/src/X86Optimize.cpp b/src/X86Optimize.cpp index 99918be40f30..541337fb14f9 100644 --- a/src/X86Optimize.cpp +++ b/src/X86Optimize.cpp @@ -2,11 +2,11 @@ #include "CSE.h" #include "FindIntrinsics.h" -#include "InstructionSelector.h" #include "IR.h" #include "IRMatch.h" #include "IRMutator.h" #include "IROperator.h" +#include "InstructionSelector.h" #include "Simplify.h" namespace Halide { @@ -490,40 +490,40 @@ class Optimize_X86 : public InstructionSelector { // psadbw is always supported via SSE2. ((factor == 8) && (rewrite( - h_add(cast(UInt(64, value_lanes), absd(x, y)), lanes), - v_intrin(VectorInstruction::sum_absd, x, y), - is_uint(x, 8) && is_uint(y, 8)) || - + h_add(cast(UInt(64, value_lanes), absd(x, y)), lanes), + v_intrin(VectorInstruction::sum_absd, x, y), + is_uint(x, 8) && is_uint(y, 8)) || + // Rewrite non-native sum-of-absolute-difference variants to the native // op. We support reducing to various types. We could consider supporting // multiple reduction factors too, but in general we don't handle non-native // reduction factors for VectorReduce nodes (yet?). 
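// Sketch of why the narrowing casts below are lossless (assuming u8 inputs, as the is_uint guards require): psadbw sums eight |x - y| differences into a u64 lane, and the largest possible sum is 8 * 255 = 2040, which fits in any accumulator of 16 bits or wider.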
rewrite( - h_add(cast(UInt(16, value_lanes), absd(x, y)), lanes), - cast(UInt(16, lanes), typed(UInt(64, lanes), v_intrin(VectorInstruction::sum_absd, x, y))), - is_uint(x, 8) && is_uint(y, 8)) || - + h_add(cast(UInt(16, value_lanes), absd(x, y)), lanes), + cast(UInt(16, lanes), typed(UInt(64, lanes), v_intrin(VectorInstruction::sum_absd, x, y))), + is_uint(x, 8) && is_uint(y, 8)) || + rewrite( - h_add(cast(UInt(32, value_lanes), absd(x, y)), lanes), - cast(UInt(32, lanes), typed(UInt(64, lanes), v_intrin(VectorInstruction::sum_absd, x, y))), - is_uint(x, 8) && is_uint(y, 8)) || + h_add(cast(UInt(32, value_lanes), absd(x, y)), lanes), + cast(UInt(32, lanes), typed(UInt(64, lanes), v_intrin(VectorInstruction::sum_absd, x, y))), + is_uint(x, 8) && is_uint(y, 8)) || rewrite( - h_add(cast(Int(16, value_lanes), absd(x, y)), lanes), - cast(Int(16, lanes), typed(UInt(64, lanes), v_intrin(VectorInstruction::sum_absd, x, y))), - is_uint(x, 8) && is_uint(y, 8)) || + h_add(cast(Int(16, value_lanes), absd(x, y)), lanes), + cast(Int(16, lanes), typed(UInt(64, lanes), v_intrin(VectorInstruction::sum_absd, x, y))), + is_uint(x, 8) && is_uint(y, 8)) || rewrite( - h_add(cast(Int(32, value_lanes), absd(x, y)), lanes), - cast(Int(32, lanes), typed(UInt(64, lanes), v_intrin(VectorInstruction::sum_absd, x, y))), - is_uint(x, 8) && is_uint(y, 8)) || + h_add(cast(Int(32, value_lanes), absd(x, y)), lanes), + cast(Int(32, lanes), typed(UInt(64, lanes), v_intrin(VectorInstruction::sum_absd, x, y))), + is_uint(x, 8) && is_uint(y, 8)) || rewrite( - h_add(cast(Int(64, value_lanes), absd(x, y)), lanes), - cast(Int(64, lanes), typed(UInt(64, lanes), v_intrin(VectorInstruction::sum_absd, x, y))), - is_uint(x, 8) && is_uint(y, 8)) || + h_add(cast(Int(64, value_lanes), absd(x, y)), lanes), + cast(Int(64, lanes), typed(UInt(64, lanes), v_intrin(VectorInstruction::sum_absd, x, y))), + is_uint(x, 8) && is_uint(y, 8)) || - false))) { + false))) { return mutate(rewrite.result); } break; From e6502f8a300f01616ec22c92deb858a8b45171e7 Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Thu, 28 Jul 2022 00:32:28 -0400 Subject: [PATCH 31/55] fix last remnants of vector intrinsic -> vector instruction renaming --- src/IRMatch.h | 8 ++--- src/IRPrinter.cpp | 2 +- src/StmtToHtml.cpp | 2 +- src/X86Optimize.cpp | 78 ++++++++++++++++++++++----------------------- 4 files changed, 45 insertions(+), 45 deletions(-) diff --git a/src/IRMatch.h b/src/IRMatch.h index 982a8dcaeace..60d54f8e391d 100644 --- a/src/IRMatch.h +++ b/src/IRMatch.h @@ -1884,7 +1884,7 @@ HALIDE_ALWAYS_INLINE auto ramp(A &&a, B &&b, C &&c) noexcept -> RampOp -struct VectorIntrinOp { +struct VectorInstrOp { struct pattern_tag {}; const VectorInstruction::InstructionOp op; std::tuple args; @@ -1964,7 +1964,7 @@ struct VectorIntrinOp { constexpr static bool foldable = false; HALIDE_ALWAYS_INLINE - VectorIntrinOp(const VectorInstruction::InstructionOp _op, Args... args) noexcept + VectorInstrOp(const VectorInstruction::InstructionOp _op, Args... args) noexcept : op(_op), args(args...) { static_assert(sizeof...(Args) > 0 && sizeof...(Args) <= 3, "VectorInstructionOp must have non-zero arguments, and update make() if more than 3 arguments."); @@ -1972,7 +1972,7 @@ struct VectorIntrinOp { }; template -std::ostream &operator<<(std::ostream &s, const VectorIntrinOp &op) { +std::ostream &operator<<(std::ostream &s, const VectorInstrOp &op) { // TODO(rootjalex): Should we print the type? 
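// Note, an observation rather than a fix: op.op is a plain enum, so the stream insertion below prints a bare integer between the quotes; mapping it back to a name would need a lookup helper, since instruction_op_names is file-local to IR.cpp.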
s << "vector_intrin(\""; s << op.op << "\", "; @@ -1982,7 +1982,7 @@ std::ostream &operator<<(std::ostream &s, const VectorIntrinOp &op) { } template -HALIDE_ALWAYS_INLINE auto v_intrin(const VectorInstruction::InstructionOp op, Args... args) noexcept -> VectorIntrinOp { +HALIDE_ALWAYS_INLINE auto v_instr(const VectorInstruction::InstructionOp op, Args... args) noexcept -> VectorInstrOp { return {op, pattern_arg(args)...}; } diff --git a/src/IRPrinter.cpp b/src/IRPrinter.cpp index 324b399e5548..78d0e087d7cb 100644 --- a/src/IRPrinter.cpp +++ b/src/IRPrinter.cpp @@ -1076,7 +1076,7 @@ void IRPrinter::visit(const Shuffle *op) { void IRPrinter::visit(const VectorInstruction *op) { stream << "(" << op->type - << ")vector_intrinsic(\"" + << ")vector_instruction(\"" << op->get_instruction_name() << "\", "; print_list(op->args); diff --git a/src/StmtToHtml.cpp b/src/StmtToHtml.cpp index 8dd6afdc8a73..36db8155c525 100644 --- a/src/StmtToHtml.cpp +++ b/src/StmtToHtml.cpp @@ -715,7 +715,7 @@ class StmtToHtml : public IRVisitor { void visit(const VectorInstruction *op) override { stream << open_span("VectorInstruction"); stream << open_span("Type") << op->type << close_span(); - print_list(symbol("vector_intrinsic") + "(\"" + op->get_instruction_name() + "\"", op->args, ")"); + print_list(symbol("vector_instruction") + "(\"" + op->get_instruction_name() + "\"", op->args, ")"); stream << close_span(); } diff --git a/src/X86Optimize.cpp b/src/X86Optimize.cpp index 541337fb14f9..b6e5812ab07f 100644 --- a/src/X86Optimize.cpp +++ b/src/X86Optimize.cpp @@ -101,33 +101,33 @@ class Optimize_X86 : public InstructionSelector { // Accumulating pmaddubsw (rewrite( x + h_add(cast(Int(32, lanes * 4), widening_mul(y, z)), lanes), - v_intrin(VectorInstruction::dot_product, x, y, z), + v_instr(VectorInstruction::dot_product, x, y, z), is_uint(y, 8) && is_int(z, 8)) || rewrite( x + h_add(cast(Int(32, lanes * 4), widening_mul(y, z)), lanes), - v_intrin(VectorInstruction::dot_product, x, z, y), + v_instr(VectorInstruction::dot_product, x, z, y), is_int(y, 8) && is_uint(z, 8)) || rewrite( h_add(cast(Int(32, lanes * 4), widening_mul(x, y)), lanes) + z, - v_intrin(VectorInstruction::dot_product, z, x, y), + v_instr(VectorInstruction::dot_product, z, x, y), is_uint(x, 8) && is_int(y, 8)) || rewrite( h_add(cast(Int(32, lanes * 4), widening_mul(x, y)), lanes) + z, - v_intrin(VectorInstruction::dot_product, z, y, x), + v_instr(VectorInstruction::dot_product, z, y, x), is_int(x, 8) && is_uint(y, 8)) || // Accumulating pmaddwd. 
rewrite( x + h_add(widening_mul(y, z), lanes), - v_intrin(VectorInstruction::dot_product, x, y, z), + v_instr(VectorInstruction::dot_product, x, y, z), is_int(y, 16, lanes * 2) && is_int(z, 16, lanes * 2)) || rewrite( h_add(widening_mul(x, y), lanes) + z, - v_intrin(VectorInstruction::dot_product, z, x, y), + v_instr(VectorInstruction::dot_product, z, x, y), is_int(x, 16, lanes * 2) && is_int(y, 16, lanes * 2)) || false)) { @@ -199,38 +199,38 @@ class Optimize_X86 : public InstructionSelector { (target.has_feature(Target::SSE41) && rewrite( cast(Int(16, lanes), rounding_shift_right(widening_mul(x, y), 15)), - v_intrin(VectorInstruction::pmulhrs, x, y), + v_instr(VectorInstruction::pmulhrs, x, y), is_int(x, 16) && is_int(y, 16))) || // saturating_narrow is always supported (via SSE2) for: // int32 -> int16, int16 -> int8, int16 -> uint8 rewrite( cast(Int(16, lanes), max(min(x, i32_i16min), i32_i16min)), - v_intrin(VectorInstruction::saturating_narrow, x), + v_instr(VectorInstruction::saturating_narrow, x), is_int(x, 32)) || rewrite( cast(Int(8, lanes), max(min(x, i16_i8min), i16_i8min)), - v_intrin(VectorInstruction::saturating_narrow, x), + v_instr(VectorInstruction::saturating_narrow, x), is_int(x, 16)) || rewrite( cast(UInt(8, lanes), max(min(x, i16_u8min), i16_u8min)), - v_intrin(VectorInstruction::saturating_narrow, x), + v_instr(VectorInstruction::saturating_narrow, x), is_int(x, 16)) || // int32 -> uint16 is supported via SSE41 (target.has_feature(Target::SSE41) && rewrite( cast(UInt(16, lanes), max(min(x, i32_u16min), i32_u16min)), - v_intrin(VectorInstruction::saturating_narrow, x), + v_instr(VectorInstruction::saturating_narrow, x), is_int(x, 32))) || // f32_to_bf16 is supported only via Target::AVX512_SapphireRapids (target.has_feature(Target::AVX512_SapphireRapids) && rewrite( cast(BFloat(16, lanes), x), - v_intrin(VectorInstruction::f32_to_bf16, x), + v_instr(VectorInstruction::f32_to_bf16, x), is_float(x, 32))) || false) { @@ -313,7 +313,7 @@ class Optimize_X86 : public InstructionSelector { ((op->type.is_int_or_uint() && bits == 16) && rewrite( mul_shift_right(x, y, 16), - v_intrin(VectorInstruction::pmulh, x, y))) || + v_instr(VectorInstruction::pmulh, x, y))) || // saturating_pmulhrs is supported via SSE41 ((target.has_feature(Target::SSE41) && @@ -323,7 +323,7 @@ class Optimize_X86 : public InstructionSelector { // saturating_pmulhrs select((x == typed(Int(16, lanes), -32768)) && (y == typed(Int(16, lanes), -32768)), typed(Int(16, lanes), 32767), - v_intrin(VectorInstruction::pmulhrs, x, y)))) || + v_instr(VectorInstruction::pmulhrs, x, y)))) || // TODO(rootjalex): The following intrinsics are // simply one-to-one mappings, should they even @@ -335,29 +335,29 @@ class Optimize_X86 : public InstructionSelector { (op->type.is_float() && bits == 32)) && rewrite( abs(x), - v_intrin(VectorInstruction::abs, x))) || + v_instr(VectorInstruction::abs, x))) || // saturating ops for 8 and 16 bits are always supported (via SSE2). ((bits == 8 || bits == 16) && (rewrite( saturating_add(x, y), - v_intrin(VectorInstruction::saturating_add, x, y)) || + v_instr(VectorInstruction::saturating_add, x, y)) || rewrite( saturating_sub(x, y), - v_intrin(VectorInstruction::saturating_sub, x, y)))) || + v_instr(VectorInstruction::saturating_sub, x, y)))) || // pavg ops for 8 and 16 bits are always supported (via SSE2). 
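// Note (unsigned operands only, per the is_uint guard below): pavgb/pavgw compute (a + b + 1) >> 1 in a widened intermediate, which is precisely rounding_halving_add with no overflow risk.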
((op->type.is_uint() && (bits == 8 || bits == 16)) && rewrite( rounding_halving_add(x, y), - v_intrin(VectorInstruction::rounding_halving_add, x, y))) || + v_instr(VectorInstruction::rounding_halving_add, x, y))) || // int16 -> int32 widening_mul has a (v)pmaddwd implementation. // always supported (via SSE2). ((op->type.is_int() && (bits == 32)) && rewrite( widening_mul(x, y), - v_intrin(VectorInstruction::widening_mul, x, y), + v_instr(VectorInstruction::widening_mul, x, y), is_int(x, 16) && is_int(y, 16))) || (target.has_feature(Target::AVX512_SapphireRapids) && @@ -365,27 +365,27 @@ class Optimize_X86 : public InstructionSelector { // SapphireRapids accumulating dot products. (rewrite( saturating_add(x, h_satadd(cast(Int(32, lanes * 4), widening_mul(y, z)), lanes)), - v_intrin(VectorInstruction::saturating_dot_product, x, y, z), + v_instr(VectorInstruction::saturating_dot_product, x, y, z), is_uint(y, 8) && is_int(z, 8)) || rewrite( saturating_add(x, h_satadd(cast(Int(32, lanes * 4), widening_mul(y, z)), lanes)), - v_intrin(VectorInstruction::saturating_dot_product, x, z, y), + v_instr(VectorInstruction::saturating_dot_product, x, z, y), is_int(y, 8) && is_uint(z, 8)) || rewrite( saturating_add(x, h_satadd(cast(Int(32, lanes * 2), widening_mul(y, z)), lanes)), - v_intrin(VectorInstruction::saturating_dot_product, x, y, z), + v_instr(VectorInstruction::saturating_dot_product, x, y, z), is_uint(y, 8) && is_int(z, 8)) || rewrite( saturating_add(x, h_satadd(cast(Int(32, lanes * 2), widening_mul(y, z)), lanes)), - v_intrin(VectorInstruction::saturating_dot_product, x, z, y), + v_instr(VectorInstruction::saturating_dot_product, x, z, y), is_int(y, 8) && is_uint(z, 8)) || rewrite( saturating_add(x, h_satadd(widening_mul(y, z), lanes)), - v_intrin(VectorInstruction::saturating_dot_product, x, z, y), + v_instr(VectorInstruction::saturating_dot_product, x, z, y), is_int(y, 16, lanes * 2) && is_int(z, 16, lanes * 2)) || false)) || @@ -445,18 +445,18 @@ class Optimize_X86 : public InstructionSelector { ((factor == 2) && (rewrite( h_add(cast(Int(32, value_lanes), widening_mul(x, y)), lanes), - v_intrin(VectorInstruction::dot_product, cast(Int(16, value_lanes), x), cast(Int(16, value_lanes), y)), + v_instr(VectorInstruction::dot_product, cast(Int(16, value_lanes), x), cast(Int(16, value_lanes), y)), x_is_int_or_uint && y_is_int_or_uint) || // Horizontal widening add via pmaddwd rewrite( h_add(cast(Int(32, value_lanes), x), lanes), - v_intrin(VectorInstruction::dot_product, x, make_const(Int(16, value_lanes), 1)), + v_instr(VectorInstruction::dot_product, x, make_const(Int(16, value_lanes), 1)), is_int(x, 16)) || (rewrite( h_add(widening_mul(x, y), lanes), - v_intrin(VectorInstruction::dot_product, x, y), + v_instr(VectorInstruction::dot_product, x, y), is_int(x, 16) && is_int(y, 16))) || // pmaddub supported via SSE41 @@ -464,23 +464,23 @@ class Optimize_X86 : public InstructionSelector { // Horizontal widening adds using 2-way saturating dot products. 
(rewrite( h_add(cast(UInt(16, value_lanes), x), lanes), - cast(UInt(16, lanes), typed(Int(16, lanes), v_intrin(VectorInstruction::saturating_dot_product, x, make_const(Int(8, value_lanes), 1)))), + cast(UInt(16, lanes), typed(Int(16, lanes), v_instr(VectorInstruction::saturating_dot_product, x, make_const(Int(8, value_lanes), 1)))), is_uint(x, 8)) || rewrite( h_add(cast(Int(16, value_lanes), x), lanes), - v_intrin(VectorInstruction::saturating_dot_product, x, make_const(Int(8, value_lanes), 1)), + v_instr(VectorInstruction::saturating_dot_product, x, make_const(Int(8, value_lanes), 1)), is_uint(x, 8)) || rewrite( h_add(cast(Int(16, value_lanes), x), lanes), - v_intrin(VectorInstruction::saturating_dot_product, make_const(UInt(8, value_lanes), 1), x), + v_instr(VectorInstruction::saturating_dot_product, make_const(UInt(8, value_lanes), 1), x), is_int(x, 8)) || // SSE41 and AVX2 support horizontal_add via phadd intrinsics. rewrite( h_add(x, lanes), - v_intrin(VectorInstruction::horizontal_add, x), + v_instr(VectorInstruction::horizontal_add, x), is_int(x, 16, lanes * 2) || is_uint(x, 16, lanes * 2) || is_int(x, 32, lanes * 2) || is_uint(x, 32, lanes * 2)) || @@ -491,7 +491,7 @@ class Optimize_X86 : public InstructionSelector { ((factor == 8) && (rewrite( h_add(cast(UInt(64, value_lanes), absd(x, y)), lanes), - v_intrin(VectorInstruction::sum_absd, x, y), + v_instr(VectorInstruction::sum_absd, x, y), is_uint(x, 8) && is_uint(y, 8)) || // Rewrite non-native sum-of-absolute-difference variants to the native @@ -500,27 +500,27 @@ class Optimize_X86 : public InstructionSelector { // reduction factors for VectorReduce nodes (yet?). rewrite( h_add(cast(UInt(16, value_lanes), absd(x, y)), lanes), - cast(UInt(16, lanes), typed(UInt(64, lanes), v_intrin(VectorInstruction::sum_absd, x, y))), + cast(UInt(16, lanes), typed(UInt(64, lanes), v_instr(VectorInstruction::sum_absd, x, y))), is_uint(x, 8) && is_uint(y, 8)) || rewrite( h_add(cast(UInt(32, value_lanes), absd(x, y)), lanes), - cast(UInt(32, lanes), typed(UInt(64, lanes), v_intrin(VectorInstruction::sum_absd, x, y))), + cast(UInt(32, lanes), typed(UInt(64, lanes), v_instr(VectorInstruction::sum_absd, x, y))), is_uint(x, 8) && is_uint(y, 8)) || rewrite( h_add(cast(Int(16, value_lanes), absd(x, y)), lanes), - cast(Int(16, lanes), typed(UInt(64, lanes), v_intrin(VectorInstruction::sum_absd, x, y))), + cast(Int(16, lanes), typed(UInt(64, lanes), v_instr(VectorInstruction::sum_absd, x, y))), is_uint(x, 8) && is_uint(y, 8)) || rewrite( h_add(cast(Int(32, value_lanes), absd(x, y)), lanes), - cast(Int(32, lanes), typed(UInt(64, lanes), v_intrin(VectorInstruction::sum_absd, x, y))), + cast(Int(32, lanes), typed(UInt(64, lanes), v_instr(VectorInstruction::sum_absd, x, y))), is_uint(x, 8) && is_uint(y, 8)) || rewrite( h_add(cast(Int(64, value_lanes), absd(x, y)), lanes), - cast(Int(64, lanes), typed(UInt(64, lanes), v_intrin(VectorInstruction::sum_absd, x, y))), + cast(Int(64, lanes), typed(UInt(64, lanes), v_instr(VectorInstruction::sum_absd, x, y))), is_uint(x, 8) && is_uint(y, 8)) || false))) { @@ -535,12 +535,12 @@ class Optimize_X86 : public InstructionSelector { ((factor == 2) && target.has_feature(Target::SSE41) && (rewrite( h_satadd(widening_mul(x, y), lanes), - v_intrin(VectorInstruction::saturating_dot_product, x, y), + v_instr(VectorInstruction::saturating_dot_product, x, y), is_uint(x, 8) && is_int(y, 8)) || rewrite( h_satadd(widening_mul(x, y), lanes), - v_intrin(VectorInstruction::saturating_dot_product, y, x), + 
v_instr(VectorInstruction::saturating_dot_product, y, x), is_int(x, 8) && is_uint(y, 8)) || false))) { From 6d2bfd145f5692b7e5d3025f677c10e9c1aca4b4 Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Thu, 28 Jul 2022 01:05:26 -0400 Subject: [PATCH 32/55] fix virtual func hidden error --- src/InstructionSelector.h | 1 + 1 file changed, 1 insertion(+) diff --git a/src/InstructionSelector.h b/src/InstructionSelector.h index 351a7c785a58..e0489da14979 100644 --- a/src/InstructionSelector.h +++ b/src/InstructionSelector.h @@ -23,6 +23,7 @@ class InstructionSelector : public IRGraphMutator { const Target &target; const CodeGen_LLVM *codegen; + using IRGraphMutator::visit; Expr visit(const Div *) override; Expr visit(const VectorReduce *) override; From fa2d4e246efcd22e8a18d9e90b2211bfe76f4eec Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Thu, 28 Jul 2022 01:09:31 -0400 Subject: [PATCH 33/55] remove 'implement VI visitor' error msg --- src/IRMutator.cpp | 14 +++++++------- src/IRVisitor.cpp | 16 ++++++++-------- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/src/IRMutator.cpp b/src/IRMutator.cpp index 9ac27753ae09..7b05b46ac22f 100644 --- a/src/IRMutator.cpp +++ b/src/IRMutator.cpp @@ -328,13 +328,13 @@ Expr IRMutator::visit(const Shuffle *op) { } Expr IRMutator::visit(const VectorInstruction *op) { - internal_error << "Always implement VectorInstruction visitor for IRMutator subclass\n"; - // auto [new_args, changed] = mutate_with_changes(op->args); - // if (!changed) { - // return op; - // } - // return VectorInstruction::make(op->type, op->op, new_args); - return Expr(); + // internal_error << "Always implement VectorInstruction visitor for IRMutator subclass\n"; + auto [new_args, changed] = mutate_with_changes(op->args); + if (!changed) { + return op; + } + return VectorInstruction::make(op->type, op->op, new_args); + // return Expr(); } Expr IRMutator::visit(const VectorReduce *op) { diff --git a/src/IRVisitor.cpp b/src/IRVisitor.cpp index bbca23a77a7b..2332d7bcc2e3 100644 --- a/src/IRVisitor.cpp +++ b/src/IRVisitor.cpp @@ -258,10 +258,10 @@ void IRVisitor::visit(const Shuffle *op) { } void IRVisitor::visit(const VectorInstruction *op) { - internal_error << "Always implement VectorInstruction visitor for IRVisitor subclass\n"; - // for (const auto &arg : op->args) { - // arg.accept(this); - // } + // internal_error << "Always implement VectorInstruction visitor for IRVisitor subclass\n"; + for (const auto &arg : op->args) { + arg.accept(this); + } } void IRVisitor::visit(const VectorReduce *op) { @@ -523,10 +523,10 @@ void IRGraphVisitor::visit(const Shuffle *op) { } void IRGraphVisitor::visit(const VectorInstruction *op) { - internal_error << "Always implement VectorInstruction visitor for IRGraphVisitor subclass\n"; - // for (const auto &arg : op->args) { - // include(arg); - // } + // internal_error << "Always implement VectorInstruction visitor for IRGraphVisitor subclass\n"; + for (const auto &arg : op->args) { + include(arg); + } } void IRGraphVisitor::visit(const VectorReduce *op) { From ec2cd4ebf2e66ac9f927184dfe1121e5402288ee Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Thu, 28 Jul 2022 12:41:49 -0400 Subject: [PATCH 34/55] address nits --- src/IRMatch.h | 10 +++++----- src/InstructionSelector.cpp | 10 +++++----- src/InstructionSelector.h | 4 ++-- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/src/IRMatch.h b/src/IRMatch.h index 60d54f8e391d..210688056066 100644 --- a/src/IRMatch.h +++ b/src/IRMatch.h @@ -1884,7 +1884,7 @@
HALIDE_ALWAYS_INLINE auto ramp(A &&a, B &&b, C &&c) noexcept -> RampOp -struct VectorInstrOp { +struct VectorInstructionOp { struct pattern_tag {}; const VectorInstruction::InstructionOp op; std::tuple args; @@ -1964,7 +1964,7 @@ struct VectorInstrOp { constexpr static bool foldable = false; HALIDE_ALWAYS_INLINE - VectorInstrOp(const VectorInstruction::InstructionOp _op, Args... args) noexcept + VectorInstructionOp(const VectorInstruction::InstructionOp _op, Args... args) noexcept : op(_op), args(args...) { static_assert(sizeof...(Args) > 0 && sizeof...(Args) <= 3, "VectorInstructionOp must have non-zero arguments, and update make() if more than 3 arguments."); @@ -1972,9 +1972,9 @@ struct VectorInstrOp { }; template -std::ostream &operator<<(std::ostream &s, const VectorInstrOp &op) { +std::ostream &operator<<(std::ostream &s, const VectorInstructionOp &op) { // TODO(rootjalex): Should we print the type? - s << "vector_intrin(\""; + s << "vector_instr(\""; s << op.op << "\", "; op.print_args(s); s << ")"; @@ -1982,7 +1982,7 @@ std::ostream &operator<<(std::ostream &s, const VectorInstrOp &op) { } template -HALIDE_ALWAYS_INLINE auto v_instr(const VectorInstruction::InstructionOp op, Args... args) noexcept -> VectorInstrOp { +HALIDE_ALWAYS_INLINE auto v_instr(const VectorInstruction::InstructionOp op, Args... args) noexcept -> VectorInstructionOp { return {op, pattern_arg(args)...}; } diff --git a/src/InstructionSelector.cpp b/src/InstructionSelector.cpp index 569c04925638..35bb63a640a4 100644 --- a/src/InstructionSelector.cpp +++ b/src/InstructionSelector.cpp @@ -10,15 +10,15 @@ InstructionSelector::InstructionSelector(const Target &t, const CodeGen_LLVM *c) } Expr InstructionSelector::visit(const Div *op) { - if (!op->type.is_vector() || !op->type.is_int_or_uint()) { - return IRGraphMutator::visit(op); + if (op->type.is_vector() && op->type.is_int_or_uint()) { + // Lower division here in order to do pattern-matching on intrinsics. + return mutate(lower_int_uint_div(op->a, op->b)); } - // Lower division here in order to do pattern-matching on intrinsics. - return mutate(lower_int_uint_div(op->a, op->b)); + return IRGraphMutator::visit(op); } Expr InstructionSelector::visit(const VectorReduce *op) { - return codegen->split_vector_reduce(op, Expr()); + return mutate(codegen->split_vector_reduce(op, Expr())); } } // namespace Internal diff --git a/src/InstructionSelector.h b/src/InstructionSelector.h index e0489da14979..d8e8d44e6761 100644 --- a/src/InstructionSelector.h +++ b/src/InstructionSelector.h @@ -1,5 +1,5 @@ -#ifndef HALIDE_INSTR_SELECTOR_H -#define HALIDE_INSTR_SELECTOR_H +#ifndef HALIDE_INSTRUCTION_SELECTOR_H +#define HALIDE_INSTRUCTION_SELECTOR_H /** \file * Defines a base class for VectorInstruction selection. 
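The patches up to this point complete a small instruction-selection DSL: IRMatcher wilds describe the expression tree to match, v_instr(...) builds the VectorInstruction node to emit, and an optional third argument to rewrite() guards the rule with type predicates. A condensed sketch of the shape of such a rule, using only names that appear in these diffs (the visitor boilerplate is abbreviated, not copied verbatim from the branch):

    // Sketch: one rule inside an InstructionSelector subclass. Assumes the
    // usual IRMatcher wild declarations, as in Simplify_Internal.h.
    IRMatcher::Wild<0> x;
    IRMatcher::Wild<1> y;

    Expr visit(const Cast *op) override {
        const int lanes = op->type.lanes();
        auto rewrite = IRMatcher::rewriter(IRMatcher::cast(op->type, op->value), op->type);
        if (rewrite(cast(Int(16, lanes), rounding_shift_right(widening_mul(x, y), 15)),
                    v_instr(VectorInstruction::pmulhrs, x, y),
                    is_int(x, 16) && is_int(y, 16))) {
            // Re-mutate the result so nested opportunities are also selected.
            return mutate(rewrite.result);
        }
        return IRGraphMutator::visit(op);
    }

Each rewrite() call returns true on a match, so the || chains in the diffs above try rules in order and the first hit wins; the trailing `false` terminates a chain.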
From 0e5cfcfe2520fdc8b58b992358edef57d9d7f5d9 Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Thu, 28 Jul 2022 13:39:17 -0400 Subject: [PATCH 35/55] temporary HVX/CSE fix --- src/X86Optimize.cpp | 15 ++++++++++----- src/X86Optimize.h | 2 +- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/src/X86Optimize.cpp b/src/X86Optimize.cpp index b6e5812ab07f..b5e77cf32c5b 100644 --- a/src/X86Optimize.cpp +++ b/src/X86Optimize.cpp @@ -563,16 +563,21 @@ class Optimize_X86 : public InstructionSelector { } // namespace -Stmt optimize_x86_instructions(Stmt stmt, const Target &target, const CodeGen_LLVM *codegen) { - stmt = Optimize_X86(target, codegen).mutate(stmt); +Stmt optimize_x86_instructions(const Stmt &s, const Target &target, const CodeGen_LLVM *codegen) { + Stmt stmt = Optimize_X86(target, codegen).mutate(s); + // Some of the rules above can introduce repeated sub-terms, so run CSE again. - stmt = common_subexpression_elimination(stmt); - return stmt; + if (!stmt.same_as(s)) { + stmt = common_subexpression_elimination(stmt); + return stmt; + } else { + return s; + } } #else // WITH_X86 -Stmt optimize_x86_instructions(Stmt s, const Target &t) { +Stmt optimize_x86_instructions(const Stmt &s, const Target &t) { user_error << "x86 not enabled for this build of Halide.\n"; return Stmt(); } diff --git a/src/X86Optimize.h b/src/X86Optimize.h index 9ab9d5f54269..9732a2dba545 100644 --- a/src/X86Optimize.h +++ b/src/X86Optimize.h @@ -13,7 +13,7 @@ namespace Halide { namespace Internal { /** Perform vector instruction selection, inserting VectorInstruction nodes. */ -Stmt optimize_x86_instructions(Stmt stmt, const Target &target, const CodeGen_LLVM *codegen); +Stmt optimize_x86_instructions(const Stmt &stmt, const Target &target, const CodeGen_LLVM *codegen); } // namespace Internal } // namespace Halide From b3b3551bd9bb23bf179f9daadc243cbfb20a4bcf Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Thu, 28 Jul 2022 13:52:12 -0400 Subject: [PATCH 36/55] fix case without WITH_X86 --- src/X86Optimize.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/X86Optimize.cpp b/src/X86Optimize.cpp index b5e77cf32c5b..a1f54ce3d2f3 100644 --- a/src/X86Optimize.cpp +++ b/src/X86Optimize.cpp @@ -577,7 +577,7 @@ Stmt optimize_x86_instructions(const Stmt &s, const Target &target, const CodeGe #else // WITH_X86 -Stmt optimize_x86_instructions(const Stmt &s, const Target &t) { +Stmt optimize_x86_instructions(const Stmt &s, const Target &t, const CodeGen_LLVM *codegen) { user_error << "x86 not enabled for this build of Halide.\n"; return Stmt(); } From 40f575ca93733cd18e48a56fe6ae68993405a9a8 Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Fri, 29 Jul 2022 00:29:46 -0400 Subject: [PATCH 37/55] fix x86 saturating_narrow pattern mistake --- src/X86Optimize.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/X86Optimize.cpp b/src/X86Optimize.cpp index b5e77cf32c5b..e845beade690 100644 --- a/src/X86Optimize.cpp +++ b/src/X86Optimize.cpp @@ -205,24 +205,24 @@ class Optimize_X86 : public InstructionSelector { // saturating_narrow is always supported (via SSE2) for: // int32 -> int16, int16 -> int8, int16 -> uint8 rewrite( - cast(Int(16, lanes), max(min(x, i32_i16min), i32_i16min)), + cast(Int(16, lanes), max(min(x, i32_i16max), i32_i16min)), v_instr(VectorInstruction::saturating_narrow, x), is_int(x, 32)) || rewrite( - cast(Int(8, lanes), max(min(x, i16_i8min), i16_i8min)), + cast(Int(8, lanes), max(min(x, i16_i8max), i16_i8min)), 
v_instr(VectorInstruction::saturating_narrow, x), is_int(x, 16)) || rewrite( - cast(UInt(8, lanes), max(min(x, i16_u8min), i16_u8min)), + cast(UInt(8, lanes), max(min(x, i16_u8max), i16_u8min)), v_instr(VectorInstruction::saturating_narrow, x), is_int(x, 16)) || // int32 -> uint16 is supported via SSE41 (target.has_feature(Target::SSE41) && rewrite( - cast(UInt(16, lanes), max(min(x, i32_u16min), i32_u16min)), + cast(UInt(16, lanes), max(min(x, i32_u16max), i32_u16min)), v_instr(VectorInstruction::saturating_narrow, x), is_int(x, 32))) || From 545fbe819423c24408e68367eba5f2116ec95ebe Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Tue, 2 Aug 2022 13:28:13 -0400 Subject: [PATCH 38/55] lower mod in InstructionSelector too --- src/CodeGen_LLVM.h | 10 +++++++--- src/InstructionSelector.cpp | 9 +++++++++ src/InstructionSelector.h | 5 +++-- 3 files changed, 19 insertions(+), 5 deletions(-) diff --git a/src/CodeGen_LLVM.h b/src/CodeGen_LLVM.h index 5ea221baa577..1fbb4328ac7c 100644 --- a/src/CodeGen_LLVM.h +++ b/src/CodeGen_LLVM.h @@ -45,6 +45,7 @@ class GlobalVariable; namespace Halide { struct ExternSignature; +class InstructionSelector; namespace Internal { @@ -507,9 +508,6 @@ class CodeGen_LLVM : public IRVisitor { * across backends. */ virtual void codegen_vector_reduce(const VectorReduce *op, const Expr &init); - // TODO: this probably shouldn't be public, or should be moved where the rest of - // the public methods are. -public: /** Split up a VectorReduce node if possible, or generate LLVM intrinsics for full reductions. This is used in `codegen_vector_reduce`. **/ @@ -604,6 +602,12 @@ class CodeGen_LLVM : public IRVisitor { * represents a unique struct type created by a closure or similar. */ std::map struct_type_recovery; + + /** Instruction selection uses `split_vector_reduce` and + * `upgrade_type_for_arithmetic`, so needs access to those + * methods. + */ + friend class InstructionSelector; }; } // namespace Internal diff --git a/src/InstructionSelector.cpp b/src/InstructionSelector.cpp index 35bb63a640a4..072d07aa4cf8 100644 --- a/src/InstructionSelector.cpp +++ b/src/InstructionSelector.cpp @@ -1,6 +1,7 @@ #include "InstructionSelector.h" #include "CodeGen_Internal.h" +#include "IROperator.h" namespace Halide { namespace Internal { @@ -17,6 +18,14 @@ Expr InstructionSelector::visit(const Div *op) { return IRGraphMutator::visit(op); } +Expr InstructionSelector::visit(const Mod *op) { + if (op->type.is_vector() && op->type.is_int_or_uint()) { + // Lower mod here in order to do pattern-matching on intrinsics. + return mutate(lower_int_uint_mod(op->a, op->b)); + } + return IRGraphMutator::visit(op); +} + Expr InstructionSelector::visit(const VectorReduce *op) { return mutate(codegen->split_vector_reduce(op, Expr())); } diff --git a/src/InstructionSelector.h b/src/InstructionSelector.h index d8e8d44e6761..bc7b1541374a 100644 --- a/src/InstructionSelector.h +++ b/src/InstructionSelector.h @@ -15,8 +15,8 @@ namespace Internal { /** A base class for vector instruction selection. * The default implementation lowers int and uint - * division via `lower_int_uint_div` and splits - * VectorReduce nodes via CodeGen_LLVM::split_vector_reduce(). + * div and mod, and splits VectorReduce nodes via + * CodeGen_LLVM::split_vector_reduce(). 
*/ class InstructionSelector : public IRGraphMutator { protected: @@ -25,6 +25,7 @@ class InstructionSelector : public IRGraphMutator { using IRGraphMutator::visit; Expr visit(const Div *) override; + Expr visit(const Mod *) override; Expr visit(const VectorReduce *) override; public: From cd0fe8acd5e575d90937b93c0a475be79674d2e5 Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Tue, 2 Aug 2022 13:31:19 -0400 Subject: [PATCH 39/55] clang format --- src/IRMatch.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/IRMatch.h b/src/IRMatch.h index 210688056066..34d237f0f3ea 100644 --- a/src/IRMatch.h +++ b/src/IRMatch.h @@ -1588,7 +1588,7 @@ auto bitwise_xor(A &&a, B &&b) noexcept -> Intrin -HALIDE_ALWAYS_INLINE auto operator^(A &&a, B &&b) noexcept -> auto { +HALIDE_ALWAYS_INLINE auto operator^(A &&a, B &&b) noexcept -> auto{ assert_is_lvalue_if_expr(); assert_is_lvalue_if_expr(); return bitwise_xor(a, b); @@ -1598,7 +1598,7 @@ auto bitwise_and(A &&a, B &&b) noexcept -> Intrin -HALIDE_ALWAYS_INLINE auto operator&(A &&a, B &&b) noexcept -> auto { +HALIDE_ALWAYS_INLINE auto operator&(A &&a, B &&b) noexcept -> auto{ assert_is_lvalue_if_expr(); assert_is_lvalue_if_expr(); return bitwise_and(a, b); @@ -1608,7 +1608,7 @@ auto bitwise_or(A &&a, B &&b) noexcept -> Intrin -HALIDE_ALWAYS_INLINE auto operator|(A &&a, B &&b) noexcept -> auto { +HALIDE_ALWAYS_INLINE auto operator|(A &&a, B &&b) noexcept -> auto{ assert_is_lvalue_if_expr(); assert_is_lvalue_if_expr(); return bitwise_or(a, b); From 6e67ddfe5331b3ae1c75bccf0146ede7dfe55abc Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Mon, 8 Aug 2022 15:23:52 -0400 Subject: [PATCH 40/55] implement pattern matching for SapphireRapids --- src/IRMatch.h | 52 ++++++++++++++++++++++++++++++++++++++++++--- src/X86Optimize.cpp | 50 ++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 96 insertions(+), 6 deletions(-) diff --git a/src/IRMatch.h b/src/IRMatch.h index 210688056066..183e0a1e617d 100644 --- a/src/IRMatch.h +++ b/src/IRMatch.h @@ -1588,7 +1588,7 @@ auto bitwise_xor(A &&a, B &&b) noexcept -> Intrin -HALIDE_ALWAYS_INLINE auto operator^(A &&a, B &&b) noexcept -> auto { +HALIDE_ALWAYS_INLINE auto operator^(A &&a, B &&b) noexcept -> auto{ assert_is_lvalue_if_expr(); assert_is_lvalue_if_expr(); return bitwise_xor(a, b); @@ -1598,7 +1598,7 @@ auto bitwise_and(A &&a, B &&b) noexcept -> Intrin -HALIDE_ALWAYS_INLINE auto operator&(A &&a, B &&b) noexcept -> auto { +HALIDE_ALWAYS_INLINE auto operator&(A &&a, B &&b) noexcept -> auto{ assert_is_lvalue_if_expr(); assert_is_lvalue_if_expr(); return bitwise_and(a, b); @@ -1608,7 +1608,7 @@ auto bitwise_or(A &&a, B &&b) noexcept -> Intrin -HALIDE_ALWAYS_INLINE auto operator|(A &&a, B &&b) noexcept -> auto { +HALIDE_ALWAYS_INLINE auto operator|(A &&a, B &&b) noexcept -> auto{ assert_is_lvalue_if_expr(); assert_is_lvalue_if_expr(); return bitwise_or(a, b); @@ -2501,6 +2501,52 @@ std::ostream &operator<<(std::ostream &s, const IsFloat &op) { return s; } +template +struct IsBFloat { + struct pattern_tag {}; + A a; + int bits; + int lanes; + + constexpr static uint32_t binds = bindings::mask; + + // This rule is a boolean-valued predicate. Bools have type UIntImm. 
+ constexpr static IRNodeType min_node_type = IRNodeType::UIntImm; + constexpr static IRNodeType max_node_type = IRNodeType::UIntImm; + constexpr static bool canonical = true; + + constexpr static bool foldable = true; + + HALIDE_ALWAYS_INLINE + void make_folded_const(halide_scalar_value_t &val, halide_type_t &ty, MatcherState &state) const { + // a is almost certainly a very simple pattern (e.g. a wild), so just inline the make method. + Type t = a.make(state, {}).type(); + val.u.u64 = t.is_bfloat() && (bits == 0 || t.bits() == bits) && (lanes == 0 || t.lanes() == lanes); + ty.code = halide_type_uint; + ty.bits = 1; + ty.lanes = t.lanes(); + } +}; + +template +HALIDE_ALWAYS_INLINE auto is_bfloat(A &&a, int bits = 0, int lanes = 0) noexcept -> IsBFloat { + assert_is_lvalue_if_expr(); + return {pattern_arg(a), bits, lanes}; +} + +template +std::ostream &operator<<(std::ostream &s, const IsBFloat &op) { + s << "is_bfloat(" << op.a; + if (op.bits > 0) { + s << ", " << op.bits; + } + if (op.lanes > 0) { + s << ", " << op.lanes; + } + s << ")"; + return s; +} + template struct IsInt { struct pattern_tag {}; diff --git a/src/X86Optimize.cpp b/src/X86Optimize.cpp index a1f54ce3d2f3..d81b8c66a2f7 100644 --- a/src/X86Optimize.cpp +++ b/src/X86Optimize.cpp @@ -95,8 +95,8 @@ class Optimize_X86 : public InstructionSelector { if ( // Only AVX512_SapphireRapids has accumulating dot products. target.has_feature(Target::AVX512_SapphireRapids) && - // FIXME: add the float16 -> float32 versions as well. - (op->type.element_of() == Int(32)) && + ((op->type.element_of() == Int(32)) || + (op->type.element_of() == Float(32))) && // Accumulating pmaddubsw (rewrite( @@ -130,6 +130,18 @@ class Optimize_X86 : public InstructionSelector { v_instr(VectorInstruction::dot_product, z, x, y), is_int(x, 16, lanes * 2) && is_int(y, 16, lanes * 2)) || + // Accumulating fp dot products. + // TODO(rootjalex): This would be more powerful with lossless_cast checking. + rewrite( + x + h_add(cast(Float(32, lanes * 4), y) * cast(Float(32, lanes * 4), z), lanes), + v_instr(VectorInstruction::dot_product, x, y, z), + is_bfloat(y, 16) && is_bfloat(z, 16)) || + + rewrite( + h_add(cast(Float(32, lanes * 4), x) * cast(Float(32, lanes * 4), y), lanes) + z, + v_instr(VectorInstruction::dot_product, z, x, y), + is_bfloat(x, 16) && is_bfloat(y, 16)) || + false)) { return mutate(rewrite.result); } @@ -414,7 +426,6 @@ class Optimize_X86 : public InstructionSelector { Call::widening_shift_right, Call::widening_sub, })) { - // TODO: Should we have a base-class that does this + the VectorReduce lowering needed below? return mutate(lower_intrinsic(op)); } @@ -487,6 +498,39 @@ class Optimize_X86 : public InstructionSelector { false)) || false)) || + // We can use the AVX512_SapphireRapids accumulating dot products + // on pure VectorReduce nodes with 0 as the accumulator. + ((factor == 4) && + target.has_feature(Target::AVX512_SapphireRapids) && + ((op->type.element_of() == Int(32)) || + (op->type.element_of() == Float(32))) && + + // Accumulating pmaddubsw + (rewrite( + h_add(cast(Int(32, lanes * 4), widening_mul(x, y)), lanes), + v_instr(VectorInstruction::dot_product, make_zero(Int(32, lanes)), x, y), + is_uint(x, 8) && is_int(y, 8)) || + + rewrite( + h_add(cast(Int(32, lanes * 4), widening_mul(x, y)), lanes), + v_instr(VectorInstruction::dot_product, make_zero(Int(32, lanes)), y, x), + is_int(x, 8) && is_uint(y, 8)) || + + // Accumulating pmaddwd. 
+ rewrite( + h_add(widening_mul(x, y), lanes), + v_instr(VectorInstruction::dot_product, make_zero(Int(32, lanes)), x, y), + is_int(x, 16, lanes * 2) && is_int(y, 16, lanes * 2)) || + + // Accumulating fp dot products. + // TODO(rootjalex): This would be more powerful with lossless_cast checking. + rewrite( + h_add(cast(Float(32, lanes * 4), x) * cast(Float(32, lanes * 4), y), lanes), + v_instr(VectorInstruction::dot_product, make_zero(Float(32, lanes)), x, y), + is_bfloat(x, 16) && is_bfloat(y, 16)) || + + false)) || + // psadbw is always supported via SSE2. ((factor == 8) && (rewrite( From 19b2c5efbf762328132963d7a4ed1bdad6991561 Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Mon, 8 Aug 2022 16:57:30 -0400 Subject: [PATCH 41/55] rm stray 'protected' --- src/CodeGen_LLVM.h | 1 - 1 file changed, 1 deletion(-) diff --git a/src/CodeGen_LLVM.h b/src/CodeGen_LLVM.h index 1fbb4328ac7c..14f021d83e67 100644 --- a/src/CodeGen_LLVM.h +++ b/src/CodeGen_LLVM.h @@ -513,7 +513,6 @@ class CodeGen_LLVM : public IRVisitor { `codegen_vector_reduce`. **/ virtual Expr split_vector_reduce(const VectorReduce *op, const Expr &init) const; -protected: /** Are we inside an atomic node that uses mutex locks? This is used for detecting deadlocks from nested atomics & illegal vectorization. */ bool inside_atomic_mutex_node; From a98f268c3e740d63b82dc7720fa8a12eb8e2aa37 Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Mon, 8 Aug 2022 17:34:38 -0400 Subject: [PATCH 42/55] update x86 saturating_cast rules using intrinsic --- src/X86Optimize.cpp | 48 ++++++++++++++++++++++----------------------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/src/X86Optimize.cpp b/src/X86Optimize.cpp index 6ddffc17833b..347366ea1b1f 100644 --- a/src/X86Optimize.cpp +++ b/src/X86Optimize.cpp @@ -214,30 +214,6 @@ class Optimize_X86 : public InstructionSelector { v_instr(VectorInstruction::pmulhrs, x, y), is_int(x, 16) && is_int(y, 16))) || - // saturating_narrow is always supported (via SSE2) for: - // int32 -> int16, int16 -> int8, int16 -> uint8 - rewrite( - cast(Int(16, lanes), max(min(x, i32_i16max), i32_i16min)), - v_instr(VectorInstruction::saturating_narrow, x), - is_int(x, 32)) || - - rewrite( - cast(Int(8, lanes), max(min(x, i16_i8max), i16_i8min)), - v_instr(VectorInstruction::saturating_narrow, x), - is_int(x, 16)) || - - rewrite( - cast(UInt(8, lanes), max(min(x, i16_u8max), i16_u8min)), - v_instr(VectorInstruction::saturating_narrow, x), - is_int(x, 16)) || - - // int32 -> uint16 is supported via SSE41 - (target.has_feature(Target::SSE41) && - rewrite( - cast(UInt(16, lanes), max(min(x, i32_u16max), i32_u16min)), - v_instr(VectorInstruction::saturating_narrow, x), - is_int(x, 32))) || - // f32_to_bf16 is supported only via Target::AVX512_SapphireRapids (target.has_feature(Target::AVX512_SapphireRapids) && rewrite( @@ -295,6 +271,30 @@ class Optimize_X86 : public InstructionSelector { auto y_uint = cast(unsigned_type, y); if ( + // saturating_narrow is always supported (via SSE2) for: + // int32 -> int16, int16 -> int8, int16 -> uint8 + rewrite( + saturating_cast(Int(16, lanes), x), + v_instr(VectorInstruction::saturating_narrow, x), + is_int(x, 32)) || + + rewrite( + saturating_cast(Int(8, lanes), x), + v_instr(VectorInstruction::saturating_narrow, x), + is_int(x, 16)) || + + rewrite( + saturating_cast(UInt(8, lanes), x), + v_instr(VectorInstruction::saturating_narrow, x), + is_int(x, 16)) || + + // int32 -> uint16 is supported via SSE41 + (target.has_feature(Target::SSE41) && + rewrite( + 
saturating_cast(UInt(16, lanes), x), + v_instr(VectorInstruction::saturating_narrow, x), + is_int(x, 32))) || + // We can redirect signed rounding halving add to unsigned rounding // halving add by adding 128 / 32768 to the result if the sign of the // args differs. From 22d17e7957ce0d30e1590357f56388a704ef9b27 Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Mon, 8 Aug 2022 17:55:30 -0400 Subject: [PATCH 43/55] fix namespace issue --- src/CodeGen_LLVM.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/CodeGen_LLVM.h b/src/CodeGen_LLVM.h index 14f021d83e67..9c5d767214f1 100644 --- a/src/CodeGen_LLVM.h +++ b/src/CodeGen_LLVM.h @@ -45,10 +45,11 @@ class GlobalVariable; namespace Halide { struct ExternSignature; -class InstructionSelector; namespace Internal { +class InstructionSelector; + /** A code generator abstract base class. Actual code generators * (e.g. CodeGen_X86) inherit from this. This class is responsible * for taking a Halide Stmt and producing llvm bitcode, machine From e2045bfd2d1fc59d46a00aed9669f05e68abb403 Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Mon, 22 Aug 2022 22:46:59 -0400 Subject: [PATCH 44/55] place Expr constants on the stack --- src/X86Optimize.cpp | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/src/X86Optimize.cpp b/src/X86Optimize.cpp index 347366ea1b1f..db07a59bdf1d 100644 --- a/src/X86Optimize.cpp +++ b/src/X86Optimize.cpp @@ -446,6 +446,13 @@ class Optimize_X86 : public InstructionSelector { const int factor = value_lanes / lanes; Expr value = op->value; + // Useful constants for some of the below rules. + Expr one_i16 = make_one(Int(16, value_lanes)); + Expr one_i8 = make_one(Int(8, value_lanes)); + Expr one_u8 = make_one(Int(8, value_lanes)); + Expr zero_i32 = make_zero(Int(32, lanes)); + Expr zero_f32 = make_zero(Float(32, lanes)); + switch (op->op) { case VectorReduce::Add: { auto rewrite = IRMatcher::rewriter(IRMatcher::h_add(value, lanes), op->type); @@ -462,7 +469,7 @@ class Optimize_X86 : public InstructionSelector { // Horizontal widening add via pmaddwd rewrite( h_add(cast(Int(32, value_lanes), x), lanes), - v_instr(VectorInstruction::dot_product, x, make_const(Int(16, value_lanes), 1)), + v_instr(VectorInstruction::dot_product, x, one_i16), is_int(x, 16)) || (rewrite( @@ -475,17 +482,17 @@ class Optimize_X86 : public InstructionSelector { // Horizontal widening adds using 2-way saturating dot products. (rewrite( h_add(cast(UInt(16, value_lanes), x), lanes), - cast(UInt(16, lanes), typed(Int(16, lanes), v_instr(VectorInstruction::saturating_dot_product, x, make_const(Int(8, value_lanes), 1)))), + cast(UInt(16, lanes), typed(Int(16, lanes), v_instr(VectorInstruction::saturating_dot_product, x, one_i8))), is_uint(x, 8)) || rewrite( h_add(cast(Int(16, value_lanes), x), lanes), - v_instr(VectorInstruction::saturating_dot_product, x, make_const(Int(8, value_lanes), 1)), + v_instr(VectorInstruction::saturating_dot_product, x, one_i8), is_uint(x, 8)) || rewrite( h_add(cast(Int(16, value_lanes), x), lanes), - v_instr(VectorInstruction::saturating_dot_product, make_const(UInt(8, value_lanes), 1), x), + v_instr(VectorInstruction::saturating_dot_product, one_u8, x), is_int(x, 8)) || // SSE41 and AVX2 support horizontal_add via phadd intrinsics. 
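Why this hoisting matters: IRMatcher patterns capture Exprs by reference (SpecificExpr holds a reference, and a later patch in this series adds assert_is_lvalue_if_expr checks to enforce it), so an Expr fed into a rule should be a named lvalue rather than a temporary, and naming the constant also avoids rebuilding the same broadcast node every time the rule chain is evaluated. Roughly, as a simplified before/after (not verbatim from the branch):

    // Before: constructs a fresh Expr each time the chain is evaluated,
    // and passes a temporary where the matcher wants an lvalue.
    v_instr(VectorInstruction::dot_product, x, make_const(Int(16, value_lanes), 1))

    // After: one named constant on the stack, reused by every rule.
    const Expr one_i16 = make_one(Int(16, value_lanes));
    v_instr(VectorInstruction::dot_product, x, one_i16)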
@@ -508,25 +515,25 @@ class Optimize_X86 : public InstructionSelector { // Accumulating pmaddubsw (rewrite( h_add(cast(Int(32, lanes * 4), widening_mul(x, y)), lanes), - v_instr(VectorInstruction::dot_product, make_zero(Int(32, lanes)), x, y), + v_instr(VectorInstruction::dot_product, zero_i32, x, y), is_uint(x, 8) && is_int(y, 8)) || rewrite( h_add(cast(Int(32, lanes * 4), widening_mul(x, y)), lanes), - v_instr(VectorInstruction::dot_product, make_zero(Int(32, lanes)), y, x), + v_instr(VectorInstruction::dot_product, zero_i32, y, x), is_int(x, 8) && is_uint(y, 8)) || // Accumulating pmaddwd. rewrite( h_add(widening_mul(x, y), lanes), - v_instr(VectorInstruction::dot_product, make_zero(Int(32, lanes)), x, y), + v_instr(VectorInstruction::dot_product, zero_i32, x, y), is_int(x, 16, lanes * 2) && is_int(y, 16, lanes * 2)) || // Accumulating fp dot products. // TODO(rootjalex): This would be more powerful with lossless_cast checking. rewrite( h_add(cast(Float(32, lanes * 4), x) * cast(Float(32, lanes * 4), y), lanes), - v_instr(VectorInstruction::dot_product, make_zero(Float(32, lanes)), x, y), + v_instr(VectorInstruction::dot_product, zero_f32, x, y), is_bfloat(x, 16) && is_bfloat(y, 16)) || false)) || From dc4d1f744e2607a3e0ba3e0ce3c7bbc83c9fc377 Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Mon, 22 Aug 2022 23:59:14 -0400 Subject: [PATCH 45/55] i8 -> u8 bugfix --- src/X86Optimize.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/X86Optimize.cpp b/src/X86Optimize.cpp index db07a59bdf1d..679f3b6fd72a 100644 --- a/src/X86Optimize.cpp +++ b/src/X86Optimize.cpp @@ -449,7 +449,7 @@ class Optimize_X86 : public InstructionSelector { // Useful constants for some of the below rules. Expr one_i16 = make_one(Int(16, value_lanes)); Expr one_i8 = make_one(Int(8, value_lanes)); - Expr one_u8 = make_one(Int(8, value_lanes)); + Expr one_u8 = make_one(UInt(8, value_lanes)); Expr zero_i32 = make_zero(Int(32, lanes)); Expr zero_f32 = make_zero(Float(32, lanes)); From 9a5327c6243d8c53b1214150a2583eaceba44b98 Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Wed, 24 Aug 2022 12:06:47 -0400 Subject: [PATCH 46/55] add better type checking in IRMatch for SpecificExpr cases --- src/IRMatch.h | 53 ++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 50 insertions(+), 3 deletions(-) diff --git a/src/IRMatch.h b/src/IRMatch.h index c50be51b76c9..1719628ec8a7 100644 --- a/src/IRMatch.h +++ b/src/IRMatch.h @@ -585,8 +585,13 @@ IntLiteral pattern_arg(int64_t x) { } template -HALIDE_ALWAYS_INLINE void assert_is_lvalue_if_expr() { - static_assert(!std::is_same::type, Expr>::value || std::is_lvalue_reference::value, +static constexpr bool is_lvalue_if_expr() { + return !std::is_same::type, Expr>::value || std::is_lvalue_reference::value; +} + +template +HALIDE_ALWAYS_INLINE static constexpr void assert_is_lvalue_if_expr() { + static_assert(is_lvalue_if_expr(), "Exprs are captured by reference by IRMatcher objects and so must be lvalues"); } @@ -1537,68 +1542,98 @@ HALIDE_ALWAYS_INLINE auto intrin(Call::IntrinsicOp intrinsic_op, Args... 
args) n template auto abs(A &&a) noexcept -> Intrin { + assert_is_lvalue_if_expr(); return {Call::abs, pattern_arg(a)}; } template auto absd(A &&a, B &&b) noexcept -> Intrin { + assert_is_lvalue_if_expr(); + assert_is_lvalue_if_expr(); return {Call::absd, pattern_arg(a), pattern_arg(b)}; } template auto widening_add(A &&a, B &&b) noexcept -> Intrin { + assert_is_lvalue_if_expr(); + assert_is_lvalue_if_expr(); return {Call::widening_add, pattern_arg(a), pattern_arg(b)}; } template auto widening_sub(A &&a, B &&b) noexcept -> Intrin { + assert_is_lvalue_if_expr(); + assert_is_lvalue_if_expr(); return {Call::widening_sub, pattern_arg(a), pattern_arg(b)}; } template auto widening_mul(A &&a, B &&b) noexcept -> Intrin { + assert_is_lvalue_if_expr(); + assert_is_lvalue_if_expr(); return {Call::widening_mul, pattern_arg(a), pattern_arg(b)}; } template auto saturating_add(A &&a, B &&b) noexcept -> Intrin { + assert_is_lvalue_if_expr(); + assert_is_lvalue_if_expr(); return {Call::saturating_add, pattern_arg(a), pattern_arg(b)}; } template auto saturating_sub(A &&a, B &&b) noexcept -> Intrin { + assert_is_lvalue_if_expr(); + assert_is_lvalue_if_expr(); return {Call::saturating_sub, pattern_arg(a), pattern_arg(b)}; } template auto saturating_cast(const Type &t, A &&a) noexcept -> Intrin { + assert_is_lvalue_if_expr(); Intrin p = {Call::saturating_cast, pattern_arg(a)}; p.optional_type_hint = t; return p; } template auto halving_add(A &&a, B &&b) noexcept -> Intrin { + assert_is_lvalue_if_expr(); + assert_is_lvalue_if_expr(); return {Call::halving_add, pattern_arg(a), pattern_arg(b)}; } template auto halving_sub(A &&a, B &&b) noexcept -> Intrin { + assert_is_lvalue_if_expr(); + assert_is_lvalue_if_expr(); return {Call::halving_sub, pattern_arg(a), pattern_arg(b)}; } template auto rounding_halving_add(A &&a, B &&b) noexcept -> Intrin { + assert_is_lvalue_if_expr(); + assert_is_lvalue_if_expr(); return {Call::rounding_halving_add, pattern_arg(a), pattern_arg(b)}; } template auto shift_left(A &&a, B &&b) noexcept -> Intrin { + assert_is_lvalue_if_expr(); + assert_is_lvalue_if_expr(); return {Call::shift_left, pattern_arg(a), pattern_arg(b)}; } template auto shift_right(A &&a, B &&b) noexcept -> Intrin { + assert_is_lvalue_if_expr(); + assert_is_lvalue_if_expr(); return {Call::shift_right, pattern_arg(a), pattern_arg(b)}; } template auto rounding_shift_left(A &&a, B &&b) noexcept -> Intrin { + assert_is_lvalue_if_expr(); + assert_is_lvalue_if_expr(); return {Call::rounding_shift_left, pattern_arg(a), pattern_arg(b)}; } template auto rounding_shift_right(A &&a, B &&b) noexcept -> Intrin { + assert_is_lvalue_if_expr(); + assert_is_lvalue_if_expr(); return {Call::rounding_shift_right, pattern_arg(a), pattern_arg(b)}; } template auto bitwise_xor(A &&a, B &&b) noexcept -> Intrin { + assert_is_lvalue_if_expr(); + assert_is_lvalue_if_expr(); return {Call::bitwise_xor, pattern_arg(a), pattern_arg(b)}; } template @@ -1609,6 +1644,8 @@ HALIDE_ALWAYS_INLINE auto operator^(A &&a, B &&b) noexcept -> auto{ } template auto bitwise_and(A &&a, B &&b) noexcept -> Intrin { + assert_is_lvalue_if_expr(); + assert_is_lvalue_if_expr(); return {Call::bitwise_and, pattern_arg(a), pattern_arg(b)}; } template @@ -1619,6 +1656,8 @@ HALIDE_ALWAYS_INLINE auto operator&(A &&a, B &&b) noexcept -> auto{ } template auto bitwise_or(A &&a, B &&b) noexcept -> Intrin { + assert_is_lvalue_if_expr(); + assert_is_lvalue_if_expr(); return {Call::bitwise_or, pattern_arg(a), pattern_arg(b)}; } template @@ -1629,10 +1668,16 @@ HALIDE_ALWAYS_INLINE 
auto operator|(A &&a, B &&b) noexcept -> auto{ } template auto mul_shift_right(A &&a, B &&b, C &&c) noexcept -> Intrin { + assert_is_lvalue_if_expr(); + assert_is_lvalue_if_expr(); + assert_is_lvalue_if_expr(); return {Call::mul_shift_right, pattern_arg(a), pattern_arg(b), pattern_arg(c)}; } template auto rounding_mul_shift_right(A &&a, B &&b, C &&c) noexcept -> Intrin { + assert_is_lvalue_if_expr(); + assert_is_lvalue_if_expr(); + assert_is_lvalue_if_expr(); return {Call::rounding_mul_shift_right, pattern_arg(a), pattern_arg(b), pattern_arg(c)}; } @@ -1828,6 +1873,7 @@ inline std::ostream &operator<<(std::ostream &s, const BroadcastOp &op) { template HALIDE_ALWAYS_INLINE auto broadcast(A &&a, B lanes) noexcept -> BroadcastOp { assert_is_lvalue_if_expr(); + assert_is_lvalue_if_expr(); return {pattern_arg(a), pattern_arg(lanes)}; } @@ -1981,7 +2027,7 @@ struct VectorInstructionOp { VectorInstructionOp(const VectorInstruction::InstructionOp _op, Args... args) noexcept : op(_op), args(args...) { static_assert(sizeof...(Args) > 0 && sizeof...(Args) <= 3, - "VectorInstructionOp must have non-zero arguments, and update make() if more than 3 arguments."); + "VectorInstructionOp must have non-zero arguments, and update make() if more than 3 arguments."); } }; @@ -1997,6 +2043,7 @@ std::ostream &operator<<(std::ostream &s, const VectorInstructionOp &op template HALIDE_ALWAYS_INLINE auto v_instr(const VectorInstruction::InstructionOp op, Args... args) noexcept -> VectorInstructionOp { + static_assert(and_reduce((is_lvalue_if_expr())...), "All parameters to a VectorInstructionOp must be lvalues if Exprs"); return {op, pattern_arg(args)...}; } From 292d8e582c217d9a226b061e577e0c427de667d1 Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Wed, 24 Aug 2022 12:10:53 -0400 Subject: [PATCH 47/55] clang format --- src/IRMatch.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/IRMatch.h b/src/IRMatch.h index 1719628ec8a7..8b69d84d8621 100644 --- a/src/IRMatch.h +++ b/src/IRMatch.h @@ -2027,7 +2027,7 @@ struct VectorInstructionOp { VectorInstructionOp(const VectorInstruction::InstructionOp _op, Args... args) noexcept : op(_op), args(args...) { static_assert(sizeof...(Args) > 0 && sizeof...(Args) <= 3, - "VectorInstructionOp must have non-zero arguments, and update make() if more than 3 arguments."); + "VectorInstructionOp must have non-zero arguments, and update make() if more than 3 arguments."); } }; From 3b0dc43f173c0b9daa0b41444e3584b545ad64e1 Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Wed, 24 Aug 2022 12:35:35 -0400 Subject: [PATCH 48/55] missing && --- src/IRMatch.h | 2 +- src/X86Optimize.cpp | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/IRMatch.h b/src/IRMatch.h index 8b69d84d8621..3574aa432416 100644 --- a/src/IRMatch.h +++ b/src/IRMatch.h @@ -2042,7 +2042,7 @@ std::ostream &operator<<(std::ostream &s, const VectorInstructionOp &op } template -HALIDE_ALWAYS_INLINE auto v_instr(const VectorInstruction::InstructionOp op, Args... args) noexcept -> VectorInstructionOp { +HALIDE_ALWAYS_INLINE auto v_instr(const VectorInstruction::InstructionOp op, Args&&... 
args) noexcept -> VectorInstructionOp { static_assert(and_reduce((is_lvalue_if_expr())...), "All parameters to a VectorInstructionOp must be lvalues if Exprs"); return {op, pattern_arg(args)...}; } diff --git a/src/X86Optimize.cpp b/src/X86Optimize.cpp index 679f3b6fd72a..f4bd53c2e868 100644 --- a/src/X86Optimize.cpp +++ b/src/X86Optimize.cpp @@ -447,11 +447,11 @@ class Optimize_X86 : public InstructionSelector { Expr value = op->value; // Useful constants for some of the below rules. - Expr one_i16 = make_one(Int(16, value_lanes)); - Expr one_i8 = make_one(Int(8, value_lanes)); - Expr one_u8 = make_one(UInt(8, value_lanes)); - Expr zero_i32 = make_zero(Int(32, lanes)); - Expr zero_f32 = make_zero(Float(32, lanes)); + const Expr one_i16 = make_one(Int(16, value_lanes)); + const Expr one_i8 = make_one(Int(8, value_lanes)); + const Expr one_u8 = make_one(UInt(8, value_lanes)); + const Expr zero_i32 = make_zero(Int(32, lanes)); + const Expr zero_f32 = make_zero(Float(32, lanes)); switch (op->op) { case VectorReduce::Add: { From 1eb0e94fac9c502fd443ff1f35e788dfb1dfd69a Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Wed, 24 Aug 2022 12:49:48 -0400 Subject: [PATCH 49/55] clang format --- src/IRMatch.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/IRMatch.h b/src/IRMatch.h index 3574aa432416..876fbb0aa1b4 100644 --- a/src/IRMatch.h +++ b/src/IRMatch.h @@ -2042,7 +2042,7 @@ std::ostream &operator<<(std::ostream &s, const VectorInstructionOp &op } template -HALIDE_ALWAYS_INLINE auto v_instr(const VectorInstruction::InstructionOp op, Args&&... args) noexcept -> VectorInstructionOp { +HALIDE_ALWAYS_INLINE auto v_instr(const VectorInstruction::InstructionOp op, Args &&...args) noexcept -> VectorInstructionOp { static_assert(and_reduce((is_lvalue_if_expr())...), "All parameters to a VectorInstructionOp must be lvalues if Exprs"); return {op, pattern_arg(args)...}; } From f6eb2bf1d133a28d75a96516d66ae4d9bb0a3aa4 Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Wed, 24 Aug 2022 13:02:38 -0400 Subject: [PATCH 50/55] update SpecificExpr comment + remove dangling TODO comments --- src/IRMatch.h | 2 ++ src/IRMutator.cpp | 2 -- src/IRVisitor.cpp | 2 -- src/X86Optimize.cpp | 16 +--------------- 4 files changed, 3 insertions(+), 19 deletions(-) diff --git a/src/IRMatch.h b/src/IRMatch.h index 876fbb0aa1b4..097aecb89bd2 100644 --- a/src/IRMatch.h +++ b/src/IRMatch.h @@ -211,6 +211,8 @@ struct SpecificExpr { constexpr static IRNodeType max_node_type = IRNodeType::Shuffle; constexpr static bool canonical = true; + // Having SpecificExpr hold an Expr instead of a BaseExprNode reference + // is catastrophic for performance and stack space usage. 
const BaseExprNode &expr; template diff --git a/src/IRMutator.cpp b/src/IRMutator.cpp index 7b05b46ac22f..b1703a6cccd1 100644 --- a/src/IRMutator.cpp +++ b/src/IRMutator.cpp @@ -328,13 +328,11 @@ Expr IRMutator::visit(const Shuffle *op) { } Expr IRMutator::visit(const VectorInstruction *op) { - // internal_error << "Always implement VectorInstruction visitor for IRMutator subclass\n"; auto [new_args, changed] = mutate_with_changes(op->args); if (!changed) { return op; } return VectorInstruction::make(op->type, op->op, new_args); - // return Expr(); } Expr IRMutator::visit(const VectorReduce *op) { diff --git a/src/IRVisitor.cpp b/src/IRVisitor.cpp index 2332d7bcc2e3..97c55d8075ac 100644 --- a/src/IRVisitor.cpp +++ b/src/IRVisitor.cpp @@ -258,7 +258,6 @@ void IRVisitor::visit(const Shuffle *op) { } void IRVisitor::visit(const VectorInstruction *op) { - // internal_error << "Always implement VectorInstruction visitor for IRVisitor subclass\n"; for (const auto &arg : op->args) { arg.accept(this); } @@ -523,7 +522,6 @@ void IRGraphVisitor::visit(const Shuffle *op) { } void IRGraphVisitor::visit(const VectorInstruction *op) { - // internal_error << "Always implement VectorInstruction visitor for IRGraphVisitor subclass\n"; for (const auto &arg : op->args) { include(arg); } diff --git a/src/X86Optimize.cpp b/src/X86Optimize.cpp index f4bd53c2e868..7a27ac9a92e6 100644 --- a/src/X86Optimize.cpp +++ b/src/X86Optimize.cpp @@ -196,16 +196,6 @@ class Optimize_X86 : public InstructionSelector { auto rewrite = IRMatcher::rewriter(IRMatcher::cast(op->type, op->value), op->type); - // TODO: saturating casts should be intrinsics, and supported in IRMatch.h. - const Expr i32_i16max = cast(Int(32, lanes), Int(16).max()); - const Expr i32_i16min = cast(Int(32, lanes), Int(16).min()); - const Expr i16_i8max = cast(Int(16, lanes), Int(8).max()); - const Expr i16_i8min = cast(Int(16, lanes), Int(8).min()); - const Expr i16_u8max = cast(Int(16, lanes), UInt(8).max()); - const Expr i16_u8min = cast(Int(16, lanes), UInt(8).min()); - const Expr i32_u16max = cast(Int(32, lanes), UInt(16).max()); - const Expr i32_u16min = cast(Int(32, lanes), UInt(16).min()); - if ( // pmulhrs is supported via AVX2 and SSE41, so SSE41 is the LCD. (target.has_feature(Target::SSE41) && @@ -235,7 +225,7 @@ class Optimize_X86 : public InstructionSelector { return IRGraphMutator::visit(op); } - // TODO: This optimization is hard to do via a rewrite-rule because of lossless_cast. + // TODO(rootjalex): This optimization is hard to do via a rewrite-rule because of lossless_cast. // A 16-bit mul-shift-right of less than 16 can sometimes be rounded up to a // full 16 to use pmulh(u)w by left-shifting one of the operands. This is @@ -337,10 +327,6 @@ class Optimize_X86 : public InstructionSelector { typed(Int(16, lanes), 32767), v_instr(VectorInstruction::pmulhrs, x, y)))) || - // TODO(rootjalex): The following intrinsics are - // simply one-to-one mappings, should they even - // be handled here? - // int(8 | 16 | 32) -> uint is supported via SSE41 // float32 is always supported (via SSE2). 
(((target.has_feature(Target::SSE41) && op->type.is_int() && bits <= 32) || From 95e5070742e7da2246681bfd23e4123afd6394fa Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Mon, 12 Sep 2022 16:57:20 -0700 Subject: [PATCH 51/55] fix signed absd lowering on x86 --- src/X86Optimize.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/X86Optimize.cpp b/src/X86Optimize.cpp index 7a27ac9a92e6..192cb48879e5 100644 --- a/src/X86Optimize.cpp +++ b/src/X86Optimize.cpp @@ -308,7 +308,8 @@ class Optimize_X86 : public InstructionSelector { // Current best way to lower absd on x86. rewrite( absd(x, y), - max(x, y) - min(x, y), + // Cast is a no-op reinterpret. + cast(op->type, max(x, y) - min(x, y)), is_int(x) && is_int(y)) || // pmulh is always supported (via SSE2). From 7aaf9a7802aad3b155efef813f89a01a8a351ef8 Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Mon, 12 Sep 2022 17:00:40 -0700 Subject: [PATCH 52/55] add type assertion to Optimize_X86::mutate --- src/X86Optimize.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/X86Optimize.cpp b/src/X86Optimize.cpp index 192cb48879e5..ae5a4424679b 100644 --- a/src/X86Optimize.cpp +++ b/src/X86Optimize.cpp @@ -68,6 +68,13 @@ class Optimize_X86 : public InstructionSelector { : InstructionSelector(target, codegen) { } + using IRGraphMutator::mutate; + Expr mutate(const Expr &e) override { + Expr expr = IRGraphMutator::mutate(e); + internal_assert(expr.type() == e.type()) << "(X86Optimize) Found type mismatch: " << e << " -> " << expr << "\n"; + return expr; + } + protected: bool should_peephole_optimize(const Type &type) { // We only have peephole optimizations for vectors here. From b4e2e42158f8a3427e45d101e8320b9feae30096 Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Mon, 12 Sep 2022 17:21:37 -0700 Subject: [PATCH 53/55] use shuffle for deinterleave on VectorInstruction --- src/Deinterleave.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/Deinterleave.cpp b/src/Deinterleave.cpp index 1ad291736ff7..c8ea9c4d1831 100644 --- a/src/Deinterleave.cpp +++ b/src/Deinterleave.cpp @@ -196,9 +196,8 @@ class Deinterleaver : public IRGraphMutator { using IRMutator::visit; Expr visit(const VectorInstruction *op) override { - internal_error << "Deinterleaver should never receive VectorInstruction node, received:\n" - << Expr(op) << "\n"; - return Expr(); + // We can't do anything special here. + return give_up_and_shuffle(op); } Expr visit(const VectorReduce *op) override { From 15e1a8c9cedcff9756fda8cf26eeb09090025566 Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Mon, 12 Sep 2022 22:52:40 -0700 Subject: [PATCH 54/55] do not try to extract when a vector is a simple extract_element --- src/Simplify_Shuffle.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/Simplify_Shuffle.cpp b/src/Simplify_Shuffle.cpp index 35622aee9c4e..e8a8aae634ce 100644 --- a/src/Simplify_Shuffle.cpp +++ b/src/Simplify_Shuffle.cpp @@ -9,6 +9,10 @@ using std::vector; Expr Simplify::visit(const Shuffle *op, ExprInfo *bounds) { if (op->is_extract_element()) { + if (op->vectors.size() == 1) { + // We cannot simplify this further. 
+ return op; + } int index = op->indices[0]; internal_assert(index >= 0); for (const Expr &vector : op->vectors) { From cc2fac7e21ac67cf4115112d8e1eeec99b4ccb1c Mon Sep 17 00:00:00 2001 From: Alexander Root Date: Tue, 13 Sep 2022 11:11:04 -0700 Subject: [PATCH 55/55] don't call 'simplify' in deinterleave on extract_lane --- src/Deinterleave.cpp | 6 ++++++ src/Simplify_Shuffle.cpp | 4 ---- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/Deinterleave.cpp b/src/Deinterleave.cpp index c8ea9c4d1831..3f380be535c3 100644 --- a/src/Deinterleave.cpp +++ b/src/Deinterleave.cpp @@ -407,6 +407,12 @@ Expr deinterleave(Expr e, int starting_lane, int lane_stride, int new_lanes, con Deinterleaver d(starting_lane, lane_stride, new_lanes, lets); e = d.mutate(e); e = common_subexpression_elimination(e); + if (const Shuffle *shuffle = e.as()) { + if (shuffle->is_extract_element() && shuffle->vectors.size() == 1) { + // calling `simplify` here will produce an infinite recursive loop. + return e; + } + } return simplify(e); } } // namespace diff --git a/src/Simplify_Shuffle.cpp b/src/Simplify_Shuffle.cpp index e8a8aae634ce..35622aee9c4e 100644 --- a/src/Simplify_Shuffle.cpp +++ b/src/Simplify_Shuffle.cpp @@ -9,10 +9,6 @@ using std::vector; Expr Simplify::visit(const Shuffle *op, ExprInfo *bounds) { if (op->is_extract_element()) { - if (op->vectors.size() == 1) { - // We cannot simplify this further. - return op; - } int index = op->indices[0]; internal_assert(index >= 0); for (const Expr &vector : op->vectors) {
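Taken as a whole, the series turns x86 instruction selection into an IR-to-IR pass: optimize_x86_instructions() runs the Optimize_X86 rewriter (followed by CSE when anything changed), and codegen then only has to translate the surviving VectorInstruction nodes. The backend visitor itself is not part of these patches, so the following is a hypothetical sketch of the consuming side, not code from the branch; call_overloaded_intrin is an existing CodeGen_LLVM helper, but the intrinsic name and the case list here are assumptions:

    // Hypothetical: how CodeGen_X86 might lower a VectorInstruction node.
    void CodeGen_X86::visit(const VectorInstruction *op) {
        switch (op->op) {
        case VectorInstruction::pmulhrs:
            // The pattern predicates (e.g. is_int(x, 16)) already vetted the
            // argument types at selection time, so this maps one-to-one.
            value = call_overloaded_intrin(op->type, "pmulhr", op->args);
            break;
        default:
            internal_error << "Unhandled VectorInstruction: " << Expr(op) << "\n";
        }
    }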