diff --git a/apps/simd_op_check/driver.cpp b/apps/simd_op_check/driver.cpp index 1eba3d9e903b..fc29c0488c4d 100644 --- a/apps/simd_op_check/driver.cpp +++ b/apps/simd_op_check/driver.cpp @@ -11,6 +11,8 @@ struct filter { const char *name; int (*fn)(halide_buffer_t *, // float32 halide_buffer_t *, // float64 + halide_buffer_t *, // float16 + halide_buffer_t *, // bfloat16 halide_buffer_t *, // int8 halide_buffer_t *, // uint8 halide_buffer_t *, // int16 @@ -33,7 +35,7 @@ extern "C" void halide_print(void *, const char *msg) { } template -halide_buffer_t make_buffer(int w, int h) { +halide_buffer_t make_buffer(int w, int h, halide_type_t halide_type) { T *mem = NULL; #ifdef __APPLE__ // memalign() isn't present on OSX, but posix_memalign is @@ -53,7 +55,7 @@ halide_buffer_t make_buffer(int w, int h) { buf.host = (uint8_t *)mem; buf.dim[0].extent = w; buf.dim[1].extent = h; - buf.type = halide_type_of(); + buf.type = halide_type; buf.dim[0].stride = 1; buf.dim[1].stride = w; buf.dim[0].min = -128; @@ -73,18 +75,20 @@ int main(int argc, char **argv) { bool error = false; // Make some input buffers halide_buffer_t bufs[] = { - make_buffer(W, H), - make_buffer(W, H), - make_buffer(W, H), - make_buffer(W, H), - make_buffer(W, H), - make_buffer(W, H), - make_buffer(W, H), - make_buffer(W, H), - make_buffer(W, H), - make_buffer(W, H)}; + make_buffer(W, H, halide_type_of()), + make_buffer(W, H, halide_type_of()), + make_buffer(W, H, halide_type_t(halide_type_float, 16)), + make_buffer(W, H, halide_type_t(halide_type_bfloat, 16)), + make_buffer(W, H, halide_type_of()), + make_buffer(W, H, halide_type_of()), + make_buffer(W, H, halide_type_of()), + make_buffer(W, H, halide_type_of()), + make_buffer(W, H, halide_type_of()), + make_buffer(W, H, halide_type_of()), + make_buffer(W, H, halide_type_of()), + make_buffer(W, H, halide_type_of())}; - halide_buffer_t out = make_buffer(1, 1); + halide_buffer_t out = make_buffer(1, 1, halide_type_of()); double *out_value = (double *)(out.host); @@ -101,6 +105,8 @@ int main(int argc, char **argv) { bufs + 7, bufs + 8, bufs + 9, + bufs + 10, + bufs + 11, &out); if (*out_value) { printf("Error: %f\n", *out_value); diff --git a/python_bindings/src/halide/halide_/PyEnums.cpp b/python_bindings/src/halide/halide_/PyEnums.cpp index 79a68e37db06..1913b204fbd4 100644 --- a/python_bindings/src/halide/halide_/PyEnums.cpp +++ b/python_bindings/src/halide/halide_/PyEnums.cpp @@ -152,6 +152,7 @@ void define_enums(py::module &m) { .value("AVX512_KNL", Target::Feature::AVX512_KNL) .value("AVX512_Skylake", Target::Feature::AVX512_Skylake) .value("AVX512_Cannonlake", Target::Feature::AVX512_Cannonlake) + .value("AVX512_Zen4", Target::Feature::AVX512_Zen4) .value("AVX512_SapphireRapids", Target::Feature::AVX512_SapphireRapids) .value("TraceLoads", Target::Feature::TraceLoads) .value("TraceStores", Target::Feature::TraceStores) diff --git a/src/CSE.cpp b/src/CSE.cpp index 7c3a182e86a9..7d39fcc90dc5 100644 --- a/src/CSE.cpp +++ b/src/CSE.cpp @@ -283,6 +283,39 @@ Expr common_subexpression_elimination(const Expr &e_in, bool lift_all) { debug(4) << "After removing lets: " << e << "\n"; + // CSE is run on unsanitized Exprs from the user, and may contain Vars with + // the same name as the temporaries we intend to introduce. Find any such + // Vars so that we know not to use those names. + class UniqueNameProvider : public IRGraphVisitor { + using IRGraphVisitor::visit; + + const char prefix = 't'; // Annoyingly, this can't be static because this is a local class. 
+ + void visit(const Variable *op) override { + // It would be legal to just add all names found to the tracked set, + // but because we know the form of the new names we're going to + // introduce, we can save some time by only adding names that could + // plausibly collide. In the vast majority of cases, this check will + // result in the set being empty. + if (op->name.size() > 1 && + op->name[0] == prefix && + isdigit(op->name[1])) { + vars.insert(op->name); + } + } + std::set vars; + + public: + string make_unique_name() { + string name; + do { + name = unique_name(prefix); + } while (vars.count(name)); + return name; + } + } namer; + e.accept(&namer); + GVN gvn; e = gvn.mutate(e); @@ -298,7 +331,7 @@ Expr common_subexpression_elimination(const Expr &e_in, bool lift_all) { for (size_t i = 0; i < gvn.entries.size(); i++) { const auto &e = gvn.entries[i]; if (e->use_count > 1) { - string name = unique_name('t'); + string name = namer.make_unique_name(); lets.emplace_back(name, e->expr); // Point references to this expr to the variable instead. replacements[e->expr] = Variable::make(e->expr.type(), name); diff --git a/src/CodeGen_X86.cpp b/src/CodeGen_X86.cpp index 45d4e224e277..c961c2b64bd6 100644 --- a/src/CodeGen_X86.cpp +++ b/src/CodeGen_X86.cpp @@ -29,6 +29,9 @@ namespace { // oldest feature flag that supports an instruction. Target complete_x86_target(Target t) { if (t.has_feature(Target::AVX512_SapphireRapids)) { + t.set_feature(Target::AVX512_Zen4); + } + if (t.has_feature(Target::AVX512_Zen4)) { t.set_feature(Target::AVX512_Cannonlake); } if (t.has_feature(Target::AVX512_Cannonlake)) { @@ -67,8 +70,6 @@ class CodeGen_X86 : public CodeGen_Posix { int vector_lanes_for_slice(const Type &t) const; - llvm::Type *llvm_type_of(const Type &t) const override; - using CodeGen_Posix::visit; void init_module() override; @@ -210,12 +211,19 @@ const x86Intrinsic intrinsic_defs[] = { {"llvm.x86.sse2.pmulhu.w", UInt(16, 8), "pmulh", {UInt(16, 8), UInt(16, 8)}}, {"llvm.x86.ssse3.pmul.hr.sw.128", Int(16, 8), "pmulhrs", {Int(16, 8), Int(16, 8)}, Target::SSE41}, + // As of LLVM main September 5 2023, LLVM only has partial handling of + // bfloat16. The below rules will match fine for simple examples, but bfloat + // conversion will get folded through any nearby shuffles and cause + // unimplemented errors in llvm's x86 instruction selection for the shuffle + // node. Disabling them for now. See https://github.com/halide/Halide/issues/7219 + /* // Convert FP32 to BF16 - {"vcvtne2ps2bf16x32", BFloat(16, 32), "f32_to_bf16", {Float(32, 32)}, Target::AVX512_SapphireRapids}, - {"llvm.x86.avx512bf16.cvtneps2bf16.512", BFloat(16, 16), "f32_to_bf16", {Float(32, 16)}, Target::AVX512_SapphireRapids}, - {"llvm.x86.avx512bf16.cvtneps2bf16.256", BFloat(16, 8), "f32_to_bf16", {Float(32, 8)}, Target::AVX512_SapphireRapids}, + {"vcvtne2ps2bf16x32", BFloat(16, 32), "f32_to_bf16", {Float(32, 32)}, Target::AVX512_Zen4}, + {"llvm.x86.avx512bf16.cvtneps2bf16.512", BFloat(16, 16), "f32_to_bf16", {Float(32, 16)}, Target::AVX512_Zen4}, + {"llvm.x86.avx512bf16.cvtneps2bf16.256", BFloat(16, 8), "f32_to_bf16", {Float(32, 8)}, Target::AVX512_Zen4}, // LLVM does not provide an unmasked 128bit cvtneps2bf16 intrinsic, so provide a wrapper around the masked version. 
- {"vcvtneps2bf16x4", BFloat(16, 4), "f32_to_bf16", {Float(32, 4)}, Target::AVX512_SapphireRapids}, + {"vcvtneps2bf16x4", BFloat(16, 4), "f32_to_bf16", {Float(32, 4)}, Target::AVX512_Zen4}, + */ // 2-way dot products {"llvm.x86.avx2.pmadd.ub.sw", Int(16, 16), "saturating_dot_product", {UInt(8, 32), Int(8, 32)}, Target::AVX2}, @@ -242,23 +250,23 @@ const x86Intrinsic intrinsic_defs[] = { // 4-way dot product vector reduction // The LLVM intrinsics combine the bf16 pairs into i32, so provide a wrapper to correctly call the intrinsic. - {"dpbf16psx16", Float(32, 16), "dot_product", {Float(32, 16), BFloat(16, 32), BFloat(16, 32)}, Target::AVX512_SapphireRapids}, + {"dpbf16psx16", Float(32, 16), "dot_product", {Float(32, 16), BFloat(16, 32), BFloat(16, 32)}, Target::AVX512_Zen4}, {"dpbf16psx8", Float(32, 8), "dot_product", {Float(32, 8), BFloat(16, 16), BFloat(16, 16)}, Target::AVX512_SapphireRapids}, {"dpbf16psx4", Float(32, 4), "dot_product", {Float(32, 4), BFloat(16, 8), BFloat(16, 8)}, Target::AVX512_SapphireRapids}, - {"dpbusdx16", Int(32, 16), "dot_product", {Int(32, 16), UInt(8, 64), Int(8, 64)}, Target::AVX512_SapphireRapids}, + {"dpbusdx16", Int(32, 16), "dot_product", {Int(32, 16), UInt(8, 64), Int(8, 64)}, Target::AVX512_Zen4}, {"dpbusdx8", Int(32, 8), "dot_product", {Int(32, 8), UInt(8, 32), Int(8, 32)}, Target::AVX512_SapphireRapids}, {"dpbusdx4", Int(32, 4), "dot_product", {Int(32, 4), UInt(8, 16), Int(8, 16)}, Target::AVX512_SapphireRapids}, - {"dpwssdx16", Int(32, 16), "dot_product", {Int(32, 16), Int(16, 32), Int(16, 32)}, Target::AVX512_SapphireRapids}, + {"dpwssdx16", Int(32, 16), "dot_product", {Int(32, 16), Int(16, 32), Int(16, 32)}, Target::AVX512_Zen4}, {"dpwssdx8", Int(32, 8), "dot_product", {Int(32, 8), Int(16, 16), Int(16, 16)}, Target::AVX512_SapphireRapids}, {"dpwssdx4", Int(32, 4), "dot_product", {Int(32, 4), Int(16, 8), Int(16, 8)}, Target::AVX512_SapphireRapids}, - {"dpbusdsx16", Int(32, 16), "saturating_dot_product", {Int(32, 16), UInt(8, 64), Int(8, 64)}, Target::AVX512_SapphireRapids}, + {"dpbusdsx16", Int(32, 16), "saturating_dot_product", {Int(32, 16), UInt(8, 64), Int(8, 64)}, Target::AVX512_Zen4}, {"dpbusdsx8", Int(32, 8), "saturating_dot_product", {Int(32, 8), UInt(8, 32), Int(8, 32)}, Target::AVX512_SapphireRapids}, {"dpbusdsx4", Int(32, 4), "saturating_dot_product", {Int(32, 4), UInt(8, 16), Int(8, 16)}, Target::AVX512_SapphireRapids}, - {"dpwssdsx16", Int(32, 16), "saturating_dot_product", {Int(32, 16), Int(16, 32), Int(16, 32)}, Target::AVX512_SapphireRapids}, + {"dpwssdsx16", Int(32, 16), "saturating_dot_product", {Int(32, 16), Int(16, 32), Int(16, 32)}, Target::AVX512_Zen4}, {"dpwssdsx8", Int(32, 8), "saturating_dot_product", {Int(32, 8), Int(16, 16), Int(16, 16)}, Target::AVX512_SapphireRapids}, {"dpwssdsx4", Int(32, 4), "saturating_dot_product", {Int(32, 4), Int(16, 8), Int(16, 8)}, Target::AVX512_SapphireRapids}, @@ -488,9 +496,25 @@ void CodeGen_X86::visit(const Select *op) { } void CodeGen_X86::visit(const Cast *op) { + Type src = op->value.type(); + Type dst = op->type; + + if (target.has_feature(Target::F16C) && + dst.code() == Type::Float && + src.code() == Type::Float && + (dst.bits() == 16 || src.bits() == 16)) { + // Node we use code() == Type::Float instead of is_float(), because we + // don't want to catch bfloat casts. + + // This target doesn't support full float16 arithmetic, but it *does* + // support float16 casts, so we emit a vanilla LLVM cast node. 
+ value = codegen(op->value); + value = builder->CreateFPCast(value, llvm_type_of(dst)); + return; + } - if (!op->type.is_vector()) { - // We only have peephole optimizations for vectors in here. + if (!dst.is_vector()) { + // We only have peephole optimizations for vectors after this point. CodeGen_Posix::visit(op); return; } @@ -513,7 +537,7 @@ void CodeGen_X86::visit(const Cast *op) { vector matches; for (const Pattern &p : patterns) { if (expr_match(p.pattern, op, matches)) { - value = call_overloaded_intrin(op->type, p.intrin, matches); + value = call_overloaded_intrin(dst, p.intrin, matches); if (value) { return; } @@ -521,12 +545,12 @@ void CodeGen_X86::visit(const Cast *op) { } if (const Call *mul = Call::as_intrinsic(op->value, {Call::widening_mul})) { - if (op->value.type().bits() < op->type.bits() && op->type.bits() <= 32) { + if (src.bits() < dst.bits() && dst.bits() <= 32) { // LLVM/x86 really doesn't like 8 -> 16 bit multiplication. If we're // widening to 32-bits after a widening multiply, LLVM prefers to see a // widening multiply directly to 32-bits. This may result in extra // casts, so simplify to remove them. - value = codegen(simplify(Mul::make(Cast::make(op->type, mul->args[0]), Cast::make(op->type, mul->args[1])))); + value = codegen(simplify(Mul::make(Cast::make(dst, mul->args[0]), Cast::make(dst, mul->args[1])))); return; } } @@ -871,6 +895,8 @@ string CodeGen_X86::mcpu_target() const { // The CPU choice here *WILL* affect -mattrs! if (target.has_feature(Target::AVX512_SapphireRapids)) { return "sapphirerapids"; + } else if (target.has_feature(Target::AVX512_Zen4)) { + return "znver4"; } else if (target.has_feature(Target::AVX512_Cannonlake)) { return "cannonlake"; } else if (target.has_feature(Target::AVX512_Skylake)) { @@ -917,6 +943,8 @@ string CodeGen_X86::mcpu_tune() const { return "znver2"; case Target::Processor::ZnVer3: return "znver3"; + case Target::Processor::ZnVer4: + return "znver4"; case Target::Processor::ProcessorGeneric: break; @@ -958,8 +986,11 @@ string CodeGen_X86::mattrs() const { if (target.has_feature(Target::AVX512_Cannonlake)) { features += ",+avx512ifma,+avx512vbmi"; } + if (target.has_feature(Target::AVX512_Zen4)) { + features += ",+avx512bf16,+avx512vnni,+avx512bitalg,+avx512vbmi2"; + } if (target.has_feature(Target::AVX512_SapphireRapids)) { - features += ",+avx512bf16,+avx512vnni,+amx-int8,+amx-bf16"; + features += ",+avxvnni,+amx-int8,+amx-bf16"; } } return features; @@ -997,21 +1028,6 @@ int CodeGen_X86::vector_lanes_for_slice(const Type &t) const { return slice_bits / t.bits(); } -llvm::Type *CodeGen_X86::llvm_type_of(const Type &t) const { - if (t.is_float() && t.bits() < 32) { - // LLVM as of August 2019 has all sorts of issues in the x86 - // backend for half types. It injects expensive calls to - // convert between float and half for seemingly no reason - // (e.g. to do a select), and bitcasting to int16 doesn't - // help, because it simplifies away the bitcast for you. - // See: https://bugs.llvm.org/show_bug.cgi?id=43065 - // and: https://github.com/halide/Halide/issues/4166 - return llvm_type_of(t.with_code(halide_type_uint)); - } else { - return CodeGen_Posix::llvm_type_of(t); - } -} - } // namespace std::unique_ptr new_CodeGen_X86(const Target &target) { diff --git a/src/Definition.h b/src/Definition.h index 11c9012c2f7a..890d16d673e1 100644 --- a/src/Definition.h +++ b/src/Definition.h @@ -76,13 +76,23 @@ class Definition { * definition. 
*/ void mutate(IRMutator *); - /** Get the default (no-specialization) arguments (left-hand-side) of the definition */ + /** Get the default (no-specialization) arguments (left-hand-side) of the definition. + * + * Warning: Any Vars in the Exprs are not qualified with the Func name, so + * the Exprs may contain names which collide with names provided by + * unique_name. + */ // @{ const std::vector &args() const; std::vector &args(); // @} - /** Get the default (no-specialization) right-hand-side of the definition */ + /** Get the default (no-specialization) right-hand-side of the definition. + * + * Warning: Any Vars in the Exprs are not qualified with the Func name, so + * the Exprs may contain names which collide with names provided by + * unique_name. + */ // @{ const std::vector &values() const; std::vector &values(); diff --git a/src/Func.cpp b/src/Func.cpp index 884d39b97050..121233aed7a5 100644 --- a/src/Func.cpp +++ b/src/Func.cpp @@ -1090,6 +1090,34 @@ void Stage::split(const string &old, const string &outer, const string &inner, c << "Use TailStrategy::GuardWithIf instead."; } + bool predicate_loads_ok = !exact; + if (predicate_loads_ok && tail == TailStrategy::PredicateLoads) { + // If it's the outermost split in this dimension, PredicateLoads + // is OK. Otherwise we can't prove it's safe. + std::set inner_vars; + for (const Split &s : definition.schedule().splits()) { + if (s.is_split()) { + inner_vars.insert(s.inner); + if (inner_vars.count(s.old_var)) { + inner_vars.insert(s.outer); + } + } else if (s.is_rename() || s.is_purify()) { + if (inner_vars.count(s.old_var)) { + inner_vars.insert(s.outer); + } + } else if (s.is_fuse()) { + if (inner_vars.count(s.inner) || inner_vars.count(s.outer)) { + inner_vars.insert(s.old_var); + } + } + } + predicate_loads_ok = !inner_vars.count(old_name); + user_assert(predicate_loads_ok || tail != TailStrategy::PredicateLoads) + << "Can't use TailStrategy::PredicateLoads for splitting " << old_name + << " in the definition of " << name() << ". " + << "PredicateLoads may not be used to split a Var stemming from the inner Var of a prior split."; + } + if (tail == TailStrategy::Auto) { // Select a tail strategy if (exact) { diff --git a/src/Function.h b/src/Function.h index 5e1734b2d1bb..66b62a01f66b 100644 --- a/src/Function.h +++ b/src/Function.h @@ -167,7 +167,12 @@ class Function { int required_dimensions() const; /** Get the right-hand-side of the pure definition. Returns an - * empty vector if there is no pure definition. */ + * empty vector if there is no pure definition. + * + * Warning: Any Vars in the Exprs are not qualified with the Func name, so + * the Exprs may contain names which collide with names provided by + * unique_name. + */ const std::vector &values() const; /** Does this function have a pure definition? 
*/ diff --git a/src/Simplify_Stmts.cpp b/src/Simplify_Stmts.cpp index b1bac9f361d3..5be05e42e6c6 100644 --- a/src/Simplify_Stmts.cpp +++ b/src/Simplify_Stmts.cpp @@ -63,7 +63,11 @@ Stmt Simplify::visit(const IfThenElse *op) { if (else_unreachable) { return then_case; } else if (then_unreachable) { - return else_case; + if (else_case.defined()) { + return else_case; + } else { + return Evaluate::make(0); + } } if (is_no_op(else_case)) { diff --git a/src/Target.cpp b/src/Target.cpp index 60c8dbd9cfcd..597d5bf5367d 100644 --- a/src/Target.cpp +++ b/src/Target.cpp @@ -128,8 +128,10 @@ Target::Processor get_amd_processor(unsigned family, unsigned model, bool have_s } break; case 0x19: // AMD Family 19h - if (model <= 0x0f || model == 0x21) { + if ((model & 0xf0) == 0 || model == 0x21) { return Target::Processor::ZnVer3; // 00h-0Fh, 21h: Zen3 + } else if (model == 0x61) { + return Target::Processor::ZnVer4; // 61h: Zen4 } break; default: @@ -215,8 +217,22 @@ Target calculate_host_target() { if (vendor_signature == VendorSignatures::AuthenticAMD) { processor = get_amd_processor(family, model, have_sse3); + + if (processor == Target::Processor::ZnVer4) { + Target t{os, arch, bits, processor, initial_features, vector_bits}; + t.set_features({Target::SSE41, Target::AVX, + Target::F16C, Target::FMA, + Target::AVX2, Target::AVX512, + Target::AVX512_Skylake, Target::AVX512_Cannonlake, + Target::AVX512_Zen4}); + return t; + } } + // Processors not specifically detected by model number above use the cpuid + // feature bits to determine what flags are supported. For future models, + // detect them explicitly above rather than extending the code below. + if (have_sse41) { initial_features.push_back(Target::SSE41); } @@ -265,12 +281,12 @@ Target calculate_host_target() { if ((info2[1] & avx512_cannonlake) == avx512_cannonlake) { initial_features.push_back(Target::AVX512_Cannonlake); - const uint32_t avx512vnni = 1U << 11; // vnni result in ecx - const uint32_t avx512bf16 = 1U << 5; // bf16 result in eax, with cpuid(eax=7, ecx=1) + const uint32_t avxvnni = 1U << 4; // avxvnni (note, not avx512vnni) result in eax + const uint32_t avx512bf16 = 1U << 5; // bf16 result in eax, with cpuid(eax=7, ecx=1) int info3[4]; cpuid(info3, 7, 1); // TODO: port to family/model -based detection. 
- if ((info2[2] & avx512vnni) == avx512vnni && + if ((info3[0] & avxvnni) == avxvnni && (info3[0] & avx512bf16) == avx512bf16) { initial_features.push_back(Target::AVX512_SapphireRapids); } @@ -441,6 +457,7 @@ const std::map processor_name_map = { {"tune_znver1", Target::Processor::ZnVer1}, {"tune_znver2", Target::Processor::ZnVer2}, {"tune_znver3", Target::Processor::ZnVer3}, + {"tune_znver4", Target::Processor::ZnVer4}, }; bool lookup_processor(const std::string &tok, Target::Processor &result) { @@ -502,6 +519,7 @@ const std::map feature_name_map = { {"avx512_skylake", Target::AVX512_Skylake}, {"avx512_cannonlake", Target::AVX512_Cannonlake}, {"avx512_sapphirerapids", Target::AVX512_SapphireRapids}, + {"avx512_zen4", Target::AVX512_Zen4}, {"trace_loads", Target::TraceLoads}, {"trace_stores", Target::TraceStores}, {"trace_realizations", Target::TraceRealizations}, @@ -1258,7 +1276,7 @@ bool Target::get_runtime_compatible_target(const Target &other, Target &result) // clang-format on // clang-format off - const std::array intersection_features = {{ + const std::array intersection_features = {{ ARMv7s, ARMv81a, AVX, @@ -1268,6 +1286,7 @@ bool Target::get_runtime_compatible_target(const Target &other, Target &result) AVX512_KNL, AVX512_SapphireRapids, AVX512_Skylake, + AVX512_Zen4, F16C, FMA, FMA4, diff --git a/src/Target.h b/src/Target.h index 783eb1c61c6b..76b06aed6b8e 100644 --- a/src/Target.h +++ b/src/Target.h @@ -73,6 +73,7 @@ struct Target { ZnVer1, /// Tune for AMD Zen CPU (AMD Family 17h, launched 2017). ZnVer2, /// Tune for AMD Zen 2 CPU (AMD Family 17h, launched 2019). ZnVer3, /// Tune for AMD Zen 3 CPU (AMD Family 19h, launched 2020). + ZnVer4, /// Tune for AMD Zen 4 CPU (AMD Family 19h, launched 2022). } processor_tune = ProcessorGeneric; /** Optional features a target can have. @@ -130,6 +131,7 @@ struct Target { AVX512_Skylake = halide_target_feature_avx512_skylake, AVX512_Cannonlake = halide_target_feature_avx512_cannonlake, AVX512_SapphireRapids = halide_target_feature_avx512_sapphirerapids, + AVX512_Zen4 = halide_target_feature_avx512_zen4, TraceLoads = halide_target_feature_trace_loads, TraceStores = halide_target_feature_trace_stores, TraceRealizations = halide_target_feature_trace_realizations, diff --git a/src/VectorizeLoops.cpp b/src/VectorizeLoops.cpp index f5ad0d573b9a..46a8553e91d2 100644 --- a/src/VectorizeLoops.cpp +++ b/src/VectorizeLoops.cpp @@ -960,19 +960,27 @@ class VectorSubs : public IRMutator { vectorized_vars.push_back({op->name, min, (int)extent_int->value}); update_replacements(); - // Go over lets which were vectorized and update them according to the current - // loop level. - for (auto it = scope.cbegin(); it != scope.cend(); ++it) { - string vectorized_name = get_widened_var_name(it.name()); - Expr vectorized_value = mutate(it.value()); + // Go over lets which were vectorized in the order of their occurrence and update + // them according to the current loop level. + for (auto let = containing_lets.begin(); let != containing_lets.end(); let++) { + // Skip if this var wasn't vectorized. + if (!scope.contains(let->first)) { + continue; + } + string vectorized_name = get_widened_var_name(let->first); + Expr vectorized_value = mutate(scope.get(let->first)); vector_scope.push(vectorized_name, vectorized_value); } body = mutate(body); // Append vectorized lets for this loop level. 
-        for (auto it = scope.cbegin(); it != scope.cend(); ++it) {
-            string vectorized_name = get_widened_var_name(it.name());
+        for (auto let = containing_lets.rbegin(); let != containing_lets.rend(); let++) {
+            // Skip if this var wasn't vectorized.
+            if (!scope.contains(let->first)) {
+                continue;
+            }
+            string vectorized_name = get_widened_var_name(let->first);
             Expr vectorized_value = vector_scope.get(vectorized_name);
             vector_scope.pop(vectorized_name);
             InterleavedRamp ir;
diff --git a/src/runtime/HalideRuntime.h b/src/runtime/HalideRuntime.h
index 522b9ff608d2..81088971418c 100644
--- a/src/runtime/HalideRuntime.h
+++ b/src/runtime/HalideRuntime.h
@@ -1360,7 +1360,8 @@ typedef enum halide_target_feature_t {
     halide_target_feature_avx512_knl,  ///< Enable the AVX512 features supported by Knight's Landing chips, such as the Xeon Phi x200. This includes the base AVX512 set, and also AVX512-CD and AVX512-ER.
     halide_target_feature_avx512_skylake,  ///< Enable the AVX512 features supported by Skylake Xeon server processors. This adds AVX512-VL, AVX512-BW, and AVX512-DQ to the base set. The main difference from the base AVX512 set is better support for small integer ops. Note that this does not include the Knight's Landing features. Note also that these features are not available on Skylake desktop and mobile processors.
     halide_target_feature_avx512_cannonlake,  ///< Enable the AVX512 features expected to be supported by future Cannonlake processors. This includes all of the Skylake features, plus AVX512-IFMA and AVX512-VBMI.
-    halide_target_feature_avx512_sapphirerapids,  ///< Enable the AVX512 features supported by Sapphire Rapids processors. This include all of the Cannonlake features, plus AVX512-VNNI and AVX512-BF16.
+    halide_target_feature_avx512_zen4,  ///< Enable the AVX512 features supported by Zen4 processors. This includes all of the Cannonlake features, plus AVX512-VNNI, AVX512-BF16, and more.
+    halide_target_feature_avx512_sapphirerapids,  ///< Enable the AVX512 features supported by Sapphire Rapids processors. This includes all of the Zen4 features, plus AVX-VNNI and AMX instructions.
     halide_target_feature_trace_loads,  ///< Trace all loads done by the pipeline. Equivalent to calling Func::trace_loads on every non-inlined Func.
     halide_target_feature_trace_stores,  ///< Trace all stores done by the pipeline. Equivalent to calling Func::trace_stores on every non-inlined Func.
     halide_target_feature_trace_realizations,  ///< Trace all realizations done by the pipeline. Equivalent to calling Func::trace_realizations on every non-inlined Func.
diff --git a/src/runtime/x86_cpu_features.cpp b/src/runtime/x86_cpu_features.cpp
index 1fb65bcf6ef9..8e5fabe9cbd2 100644
--- a/src/runtime/x86_cpu_features.cpp
+++ b/src/runtime/x86_cpu_features.cpp
@@ -37,9 +37,44 @@ WEAK CpuFeatures halide_get_cpu_features() {
     features.set_known(halide_target_feature_avx512_cannonlake);
     features.set_known(halide_target_feature_avx512_sapphirerapids);
+    // Detect CPU features by specific microarchitecture.
+    int32_t vendor[4];
+    cpuid(vendor, 0);
     int32_t info[4];
     cpuid(info, 1);
+    uint32_t family = (info[0] >> 8) & 0xF;  // Bits 8..11
+    uint32_t model = (info[0] >> 4) & 0xF;   // Bits 4..7
+    if (family == 0x6 || family == 0xF) {
+        if (family == 0xF) {
+            // Examine extended family ID if family ID is 0xF.
+            family += (info[0] >> 20) & 0xFf;  // Bits 20..27
+        }
+        // Examine extended model ID if family ID is 0x6 or 0xF.
+ model += ((info[0] >> 16) & 0xF) << 4; // Bits 16..19 + } + + if (vendor[1] == 0x68747541 && vendor[3] == 0x69746e65 && vendor[2] == 0x444d4163) { + // AMD + if (family == 0x19 && model == 0x61) { + // Zen4 + features.set_available(halide_target_feature_sse41); + features.set_available(halide_target_feature_avx); + features.set_available(halide_target_feature_f16c); + features.set_available(halide_target_feature_fma); + features.set_available(halide_target_feature_avx2); + features.set_available(halide_target_feature_avx512); + features.set_available(halide_target_feature_avx512_skylake); + features.set_available(halide_target_feature_avx512_cannonlake); + features.set_available(halide_target_feature_avx512_zen4); + return features; + } + } + + // Legacy code to detect CPU by feature bits instead. Handle new + // microarchitectures above rather than making the code below more + // complicated. + const bool have_sse41 = (info[2] & (1 << 19)) != 0; const bool have_avx = (info[2] & (1 << 28)) != 0; const bool have_f16c = (info[2] & (1 << 29)) != 0; @@ -70,8 +105,8 @@ WEAK CpuFeatures halide_get_cpu_features() { constexpr uint32_t avx512bw = 1U << 30; constexpr uint32_t avx512vl = 1U << 31; constexpr uint32_t avx512ifma = 1U << 21; - constexpr uint32_t avx512vnni = 1U << 11; // vnni result in ecx - constexpr uint32_t avx512bf16 = 1U << 5; // bf16 result in eax, cpuid(eax=7, ecx=1) + constexpr uint32_t avxvnni = 1U << 4; + constexpr uint32_t avx512bf16 = 1U << 5; // bf16 result in eax, cpuid(eax=7, ecx=1) constexpr uint32_t avx512 = avx512f | avx512cd; constexpr uint32_t avx512_knl = avx512 | avx512pf | avx512er; constexpr uint32_t avx512_skylake = avx512 | avx512vl | avx512bw | avx512dq; @@ -92,7 +127,7 @@ WEAK CpuFeatures halide_get_cpu_features() { int32_t info3[4]; cpuid(info3, 7, 1); - if ((info2[2] & avx512vnni) == avx512vnni && + if ((info3[0] & avxvnni) == avxvnni && (info3[0] & avx512bf16) == avx512bf16) { features.set_available(halide_target_feature_avx512_sapphirerapids); } diff --git a/test/correctness/CMakeLists.txt b/test/correctness/CMakeLists.txt index a1d6f7ce726d..a4a25eeae87f 100644 --- a/test/correctness/CMakeLists.txt +++ b/test/correctness/CMakeLists.txt @@ -59,6 +59,7 @@ tests(GROUPS correctness constraints.cpp convolution_multiple_kernels.cpp cross_compilation.cpp + cse_name_collision.cpp cse_nan.cpp cuda_8_bit_dot_product.cpp custom_allocator.cpp diff --git a/test/correctness/cse_name_collision.cpp b/test/correctness/cse_name_collision.cpp new file mode 100644 index 000000000000..c370337dac00 --- /dev/null +++ b/test/correctness/cse_name_collision.cpp @@ -0,0 +1,50 @@ +#include "Halide.h" + +using namespace Halide; + +int main(int argc, char **argv) { + Var t0("t0"), t1("t1"), t2("t2"); + + // Construct a Func RHS that uses Vars with names that collide with those + // that will be generated by CSE. + Expr e = cast(t0 + t1); + + // Add a bunch of reuse of subexpressions so that CSE introduces lets. + e = e * e; + e = e * e; + e = e * e; + + // Add additional uses of t0, t1 that will appear inside the innermost let + // body, where they're guaranteed to collide with enclosing lets. + e += cast(t0) + cast(t1); + + // CSE should know to not introduce uses of t0, t1 because those already + // occur in e. It may introduce uses of t2, which is unseen, but that + // shouldn't confuse things because it's bound by an enclosing let so it + // should be clear to compiler passes that it's distinct from the Var t2 on + // the LHS. 
+ Func f; + f(t0, t1, t2) = e; + + Buffer buf = f.realize({32, 32, 32}); + + for (int i = 0; i < 32; i++) { + for (int j = 0; j < 32; j++) { + for (int k = 0; k < 32; k++) { + uint32_t correct = i + j; + correct *= correct; + correct *= correct; + correct *= correct; + correct += i + j; + if (buf(i, j, k) != correct) { + printf("buf(%d, %d, %d) = %d instead of %d\n", + i, j, k, buf(i, j, k), correct); + return 1; + } + } + } + } + + printf("Success!\n"); + return 0; +} diff --git a/test/correctness/simd_op_check_x86.cpp b/test/correctness/simd_op_check_x86.cpp index f86134d37630..51d4a0b18ccb 100644 --- a/test/correctness/simd_op_check_x86.cpp +++ b/test/correctness/simd_op_check_x86.cpp @@ -33,6 +33,9 @@ class SimdOpCheckX86 : public SimdOpCheckTest { // There's no separate target for SSS4.2; we currently assume that // it should be used iff AVX is being used. use_sse42 = use_avx; + + use_avx512_vnni = target.has_feature(Target::AVX512_Zen4); + use_avx_vnni = target.has_feature(Target::AVX512_SapphireRapids); } void add_tests() override { @@ -45,6 +48,7 @@ class SimdOpCheckX86 : public SimdOpCheckTest { void check_sse_and_avx() { Expr f64_1 = in_f64(x), f64_2 = in_f64(x + 16), f64_3 = in_f64(x + 32); Expr f32_1 = in_f32(x), f32_2 = in_f32(x + 16), f32_3 = in_f32(x + 32); + Expr f16_1 = in_f16(x), f16_2 = in_f16(x + 16), f16_3 = in_f16(x + 32); Expr i8_1 = in_i8(x), i8_2 = in_i8(x + 16), i8_3 = in_i8(x + 32); Expr u8_1 = in_u8(x), u8_2 = in_u8(x + 16), u8_3 = in_u8(x + 32); Expr i16_1 = in_i16(x), i16_2 = in_i16(x + 16), i16_3 = in_i16(x + 32); @@ -496,6 +500,11 @@ class SimdOpCheckX86 : public SimdOpCheckTest { check_x86_fixed_point("zmm", 2); } + if (target.has_feature(Target::F16C)) { + check("vcvtps2ph", 8, cast(Float(16), f32_1)); + check("vcvtph2ps", 8, cast(Float(32), f16_1)); + } + check(use_avx512 ? "vpaddq*zmm" : "vpaddq*ymm", 8, i64_1 + i64_2); check(use_avx512 ? "vpsubq*zmm" : "vpsubq*ymm", 8, i64_1 - i64_2); check(use_avx512 ? 
"vpmullq" : "vpmuludq*ymm", 8, u64_1 * u64_2); @@ -574,49 +583,60 @@ class SimdOpCheckX86 : public SimdOpCheckTest { check("vpmaxsq", 8, max(i64_1, i64_2)); check("vpminsq", 8, min(i64_1, i64_2)); } - if (use_avx512 && target.has_feature(Target::AVX512_SapphireRapids)) { - // TODO: broken, see https://github.com/halide/Halide/issues/7219 - // check("vcvtne2ps2bf16*zmm", 32, cast(BFloat(16), f32_1)); - // check("vcvtneps2bf16*ymm", 16, cast(BFloat(16), f32_1)); - // check("vcvtneps2bf16*xmm", 8, cast(BFloat(16), f32_1)); - // check("vcvtneps2bf16*xmm", 4, cast(BFloat(16), f32_1)); + if (use_avx512_vnni) { + // For our targets, avx512_vnni implies avx512_bf16 + // Disabled due to https://github.com/halide/Halide/issues/7219 + /* + check("vcvtne2ps2bf16*zmm", 32, cast(BFloat(16), f32_1)); + check("vcvtneps2bf16*ymm", 16, cast(BFloat(16), f32_1)); + check("vcvtneps2bf16*xmm", 8, cast(BFloat(16), f32_1)); + check("vcvtneps2bf16*xmm", 4, cast(BFloat(16), f32_1)); + */ { // 16 bit, 2 element dot product RDom r(0, 2); check("vdpbf16ps*zmm", 16, sum(f32(in_bf16(2 * x + r)) * in_bf16(2 * x + r + 32))); - check("vdpbf16ps*ymm", 8, sum(f32(in_bf16(2 * x + r)) * in_bf16(2 * x + r + 32))); - check("vdpbf16ps*xmm", 4, sum(f32(in_bf16(2 * x + r)) * in_bf16(2 * x + r + 32))); check("vpdpwssd*zmm", 16, sum(i32(in_i16(2 * x + r)) * in_i16(2 * x + r + 32))); - check("vpdpwssd*ymm", 8, sum(i32(in_i16(2 * x + r)) * in_i16(2 * x + r + 32))); - check("vpdpwssd*xmm", 4, sum(i32(in_i16(2 * x + r)) * in_i16(2 * x + r + 32))); + if (use_avx_vnni) { + check("vdpbf16ps*ymm", 8, sum(f32(in_bf16(2 * x + r)) * in_bf16(2 * x + r + 32))); + check("vdpbf16ps*xmm", 4, sum(f32(in_bf16(2 * x + r)) * in_bf16(2 * x + r + 32))); + check("vpdpwssd*ymm", 8, sum(i32(in_i16(2 * x + r)) * in_i16(2 * x + r + 32))); + check("vpdpwssd*xmm", 4, sum(i32(in_i16(2 * x + r)) * in_i16(2 * x + r + 32))); + } } { // 8 bit, 4 element dot product RDom r(0, 4); check("vpdpbusd*zmm", 16, sum(i32(in_u8(4 * x + r)) * in_i8(4 * x + r + 32))); check("vpdpbusd*zmm", 16, sum(i32(in_i8(4 * x + r)) * in_u8(4 * x + r + 32))); - check("vpdpbusd*ymm", 8, sum(i32(in_u8(4 * x + r)) * in_i8(4 * x + r + 32))); - check("vpdpbusd*ymm", 8, sum(i32(in_i8(4 * x + r)) * in_u8(4 * x + r + 32))); - check("vpdpbusd*xmm", 4, sum(i32(in_u8(4 * x + r)) * in_i8(4 * x + r + 32))); - check("vpdpbusd*xmm", 4, sum(i32(in_i8(4 * x + r)) * in_u8(4 * x + r + 32))); + if (use_avx_vnni) { + check("vpdpbusd*ymm", 8, sum(i32(in_u8(4 * x + r)) * in_i8(4 * x + r + 32))); + check("vpdpbusd*ymm", 8, sum(i32(in_i8(4 * x + r)) * in_u8(4 * x + r + 32))); + check("vpdpbusd*xmm", 4, sum(i32(in_u8(4 * x + r)) * in_i8(4 * x + r + 32))); + check("vpdpbusd*xmm", 4, sum(i32(in_i8(4 * x + r)) * in_u8(4 * x + r + 32))); + } } { // 16 bit, 2 element saturaing dot product RDom r(0, 2); check("vpdpwssds*zmm", 16, saturating_sum(i32(in_i16(2 * x + r)) * in_i16(2 * x + r + 32))); - check("vpdpwssds*ymm", 8, saturating_sum(i32(in_i16(2 * x + r)) * in_i16(2 * x + r + 32))); - check("vpdpwssds*xmm", 4, saturating_sum(i32(in_i16(2 * x + r)) * in_i16(2 * x + r + 32))); + if (use_avx_vnni) { + check("vpdpwssds*ymm", 8, saturating_sum(i32(in_i16(2 * x + r)) * in_i16(2 * x + r + 32))); + check("vpdpwssds*xmm", 4, saturating_sum(i32(in_i16(2 * x + r)) * in_i16(2 * x + r + 32))); + } } { // 8 bit, 4 element saturating dot product RDom r(0, 4); check("vpdpbusds*zmm", 16, saturating_sum(i32(in_u8(4 * x + r)) * in_i8(4 * x + r + 32))); check("vpdpbusds*zmm", 16, saturating_sum(i32(in_i8(4 * x + r)) * in_u8(4 * x + r + 
32))); - check("vpdpbusds*ymm", 8, saturating_sum(i32(in_u8(4 * x + r)) * in_i8(4 * x + r + 32))); - check("vpdpbusds*ymm", 8, saturating_sum(i32(in_i8(4 * x + r)) * in_u8(4 * x + r + 32))); - check("vpdpbusds*xmm", 4, saturating_sum(i32(in_u8(4 * x + r)) * in_i8(4 * x + r + 32))); - check("vpdpbusds*xmm", 4, saturating_sum(i32(in_i8(4 * x + r)) * in_u8(4 * x + r + 32))); + if (use_avx_vnni) { + check("vpdpbusds*ymm", 8, saturating_sum(i32(in_u8(4 * x + r)) * in_i8(4 * x + r + 32))); + check("vpdpbusds*ymm", 8, saturating_sum(i32(in_i8(4 * x + r)) * in_u8(4 * x + r + 32))); + check("vpdpbusds*xmm", 4, saturating_sum(i32(in_u8(4 * x + r)) * in_i8(4 * x + r + 32))); + check("vpdpbusds*xmm", 4, saturating_sum(i32(in_i8(4 * x + r)) * in_u8(4 * x + r + 32))); + } } } } @@ -624,6 +644,8 @@ class SimdOpCheckX86 : public SimdOpCheckTest { private: bool use_avx2{false}; bool use_avx512{false}; + bool use_avx512_vnni{false}; + bool use_avx_vnni{false}; bool use_avx{false}; bool use_sse41{false}; bool use_sse42{false}; @@ -638,14 +660,18 @@ int main(int argc, char **argv) { { Target("x86-32-linux"), Target("x86-32-linux-sse41"), - Target("x86-64-linux-sse41-avx"), - Target("x86-64-linux-sse41-avx-avx2"), + // Always turn on f16c when using avx. Sandy Bridge had avx without + // f16c, but f16c is orthogonal to everything else, so there's no + // real reason to test avx without it. + Target("x86-64-linux-sse41-avx-f16c"), + Target("x86-64-linux-sse41-avx-f16c-avx2"), // See above: don't test avx512 without extra features, the test // isn't yet set up to test it properly. // Target("x86-64-linux-sse41-avx-avx2-avx512"), // Target("x86-64-linux-sse41-avx-avx2-avx512-avx512_knl"), - Target("x86-64-linux-sse41-avx-avx2-avx512-avx512_skylake"), - Target("x86-64-linux-sse41-avx-avx2-avx512-avx512_skylake-avx512_cannonlake"), - Target("x86-64-linux-sse41-avx-avx2-avx512-avx512_skylake-avx512_cannonlake-avx512_sapphirerapids"), + Target("x86-64-linux-sse41-avx-f16c-avx2-avx512-avx512_skylake"), + Target("x86-64-linux-sse41-avx-f16c-avx2-avx512-avx512_skylake-avx512_cannonlake"), + Target("x86-64-linux-sse41-avx-f16c-avx2-avx512-avx512_skylake-avx512_cannonlake-avx512_zen4"), + Target("x86-64-linux-sse41-avx-f16c-avx2-avx512-avx512_skylake-avx512_cannonlake-avx512_zen4-avx512_sapphirerapids"), }); } diff --git a/test/correctness/vectorize_nested.cpp b/test/correctness/vectorize_nested.cpp index dd4cfd3905c0..e9800ef8f034 100644 --- a/test/correctness/vectorize_nested.cpp +++ b/test/correctness/vectorize_nested.cpp @@ -192,6 +192,29 @@ int vectorize_all_d() { return 0; } +int vectorize_lets_order() { + const int width = 128; + const int height = 128; + + Var x("x"), y("y"), yo("yo"), yi("yi"), yoi("yoi"), yoio("yoio"), yoii("yoii"); + Func f("f"); + f(x, y) = x + y; + f.split(y, yo, yi, 8, TailStrategy::Auto) + .split(yo, yo, yoi, 4, TailStrategy::RoundUp) + .vectorize(yoi) + .vectorize(yi) + .split(yoi, yoio, yoii, 2, TailStrategy::Auto); + Buffer result = f.realize({width, height}); + + auto cmp_func = [](int x, int y) { + return x + y; + }; + if (check_image(result, cmp_func)) { + return 1; + } + + return 0; +} int vectorize_inner_of_scalarization() { ImageParam in(UInt(8), 2); @@ -289,6 +312,11 @@ int main(int argc, char **argv) { return 1; } + if (vectorize_lets_order()) { + printf("vectorize_lets_order failed\n"); + return 1; + } + if (vectorize_inner_of_scalarization()) { printf("vectorize_inner_of_scalarization failed\n"); return 1; diff --git a/test/error/CMakeLists.txt b/test/error/CMakeLists.txt index 
f217d19534f0..6e69490657f5 100644
--- a/test/error/CMakeLists.txt
+++ b/test/error/CMakeLists.txt
@@ -77,6 +77,7 @@ tests(GROUPS error
     overflow_during_constant_folding.cpp
     pointer_arithmetic.cpp
     race_condition.cpp
+    predicate_loads_used_in_inner_splits.cpp
    rdom_undefined.cpp
    rdom_where_races.cpp
    realization_with_too_many_outputs.cpp
diff --git a/test/error/predicate_loads_used_in_inner_splits.cpp b/test/error/predicate_loads_used_in_inner_splits.cpp
new file mode 100644
index 000000000000..f0e2d658dcac
--- /dev/null
+++ b/test/error/predicate_loads_used_in_inner_splits.cpp
@@ -0,0 +1,15 @@
+#include "Halide.h"
+
+using namespace Halide;
+
+int main(int argc, char **argv) {
+    Func f;
+    Var x, xo, xi, xio, xii;
+    f(x) = x;
+    f.split(x, xo, xi, 2, TailStrategy::Auto)
+        .split(xi, xio, xii, 4, TailStrategy::PredicateLoads)
+        .reorder(xo, xio, xii);
+
+    printf("Success!\n");
+    return 0;
+}
diff --git a/tutorial/CMakeLists.txt b/tutorial/CMakeLists.txt
index 19b72d58c09c..2221e61d74ec 100644
--- a/tutorial/CMakeLists.txt
+++ b/tutorial/CMakeLists.txt
@@ -209,3 +209,6 @@ if (TARGET Halide::Mullapudi2016)
     add_test(NAME tutorial_lesson_21_auto_scheduler_run COMMAND lesson_21_auto_scheduler_run)
     set_tests_properties(tutorial_lesson_21_auto_scheduler_run PROPERTIES LABELS "tutorial;multithreaded")
 endif ()
+
+# Lesson 22
+add_tutorial(lesson_22_jit_performance.cpp)
diff --git a/tutorial/lesson_22_jit_performance.cpp b/tutorial/lesson_22_jit_performance.cpp
new file mode 100644
index 000000000000..e1178b95bbcd
--- /dev/null
+++ b/tutorial/lesson_22_jit_performance.cpp
@@ -0,0 +1,259 @@
+// Halide tutorial lesson 22: JIT compilation performance
+
+// This lesson demonstrates the performance implications of the various
+// Halide methods of doing "Just-In-Time" compilation.
+
+// On linux, you can compile and run it like so:
+// g++ lesson_22*.cpp -g -I -I -L -lHalide -lpthread -ldl -o lesson_22 -std=c++17
+// LD_LIBRARY_PATH= ./lesson_22
+
+// On os x:
+// g++ lesson_22*.cpp -g -I -I -L -lHalide -o lesson_22 -std=c++17
+// DYLD_LIBRARY_PATH= ./lesson_22
+
+// If you have the entire Halide source tree, you can also build it by
+// running:
+// make tutorial_lesson_22_jit_performance
+// in a shell at the top of the halide source tree.
+
+#include "Halide.h"
+#include "halide_benchmark.h"
+#include 
+
+using namespace Halide;
+using namespace Halide::Tools;  // for benchmark()
+
+// Let's define a helper function to construct a simple pipeline that we'll use for our performance tests.
+Pipeline make_pipeline() {
+    // We'll start with a simple transpose operation...
+    Func input("input"), output("output");
+    Var x("x"), y("y");
+
+    // Fill the input with a linear combination of the coordinate values...
+    input(x, y) = cast(x + y);
+    input.compute_root();
+
+    // Transpose the rows and cols
+    output(x, y) = input(y, x);
+
+    // Schedule it ... there are a number of possibilities here to do an efficient block-wise transpose.
+    Var xi("xi"), yi("yi");
+
+    // Let's focus on 8x8 subtiles, and then vectorize across X, and unroll across Y.
+    output.tile(x, y, xi, yi, 8, 8).vectorize(xi).unroll(yi);
+
+    // For more advanced scheduling:
+    //
+    // We can improve this even more by using the .in() directive (see Tutorial 19),
+    // which allows us to interpose new Funcs in between input and output.
+    //
+    // Here we can inject a block_transpose function to allow us to do 8 vectorized loads from the input.
+ Func block_transpose("block_transpose"), block("block"); + block_transpose = input.in(output).compute_at(output, x).vectorize(x).unroll(y); + // + // And now Let's reorder and vectorize in X across the block. + block = block_transpose.in(output).reorder_storage(y, x).compute_at(output, x).vectorize(x).unroll(y); + + // Return the constructed pipeline + return Pipeline(output); +} + +int main(int argc, char **argv) { + // Since we'll be using the same sample and iteration counts for our benchmarking, + // let's define them here in the outermost scope. + constexpr int samples = 100; + constexpr int iterations = 1; + + // Now, let's measure the performance of constructing and executing a simple pipeline from scratch... + { + size_t count = 0; + double t = benchmark(samples, iterations, [&]() { + + // First, create an output buffer to hold the results. + Buffer result(1024, 1024); + + // Now, construct our pipeline from scratch. + Pipeline pipeline = make_pipeline(); + + // And then call realize to execute the pipeline. + pipeline.realize(result); + ++count; + }); + + // On a MacBook Pro M1, we should get around ~1800 times/sec. + std::cout << "Compile & Execute Pipeline (from scratch): " << int(count / t) << " times/sec\n"; + } + + // This time, let's create the pipeline outside the timing loop and re-use it for each execution... + { + // Create our pipeline, and re-use it in the loop below + Pipeline pipeline = make_pipeline(); + + size_t count = 0; + double t = benchmark(samples, iterations, [&]() { + + // Create our output buffer + Buffer result(1024, 1024); + + // Now, call realize + pipeline.realize(result); + ++count; + }); + + // On a MacBook Pro M1, we should get around ~175000 times/sec (almost 95-100x times faster!). + std::cout << "Compile & Execute Pipeline (re-use pipeline): " << int(count / t) << " times/sec\n"; + } + + // Let's do the same thing as before, but explicitly JIT compile before we realize... + { + Pipeline pipeline = make_pipeline(); + + // Let's JIT compile for our target before we realize, and see what happens... + const Target target = get_jit_target_from_environment(); + pipeline.compile_jit(target); + + size_t count = 0; + double t = benchmark(samples, iterations, [&]() { + Buffer result(1024, 1024); + pipeline.realize(result); + ++count; + }); + + // On a MacBook Pro M1, this should be about the same as the previous run (about ~175000 times/sec) + // + // This may seem somewhat surprising, since compiling before realizing doesn't seem to make + // much of a difference to the previous case. However, the first call to realize() will implicitly + // JIT-compile and cache the generated code associated with the Pipeline object, which is basically + // what we've done here. Each subsequent call to realize uses the cached version of the native code, + // so there's no additional overhead, and the cost is amortized as we re-use the pipeline. + std::cout << "Execute Pipeline (compile before realize): " << int(count / t) << " times/sec\n"; + + // Another subtlety is the creation of the result buffer ... the declaration implicitly + // allocates memory which will add overhead to each loop iteration. This time, let's try + // using the realize({1024, 1024}) call which will use the buffer managed by the pipeline + // object for the outputs... + count = 0; + t = benchmark(samples, iterations, [&]() { + Buffer result = pipeline.realize({1024, 1024}); + ++count; + }); + + // On a MacBook Pro M1, this should be about the same as the previous run (about ~175000 times/sec). 
+ std::cout << "Execute Pipeline (same but with realize({})): " << int(count / t) << " times/sec\n"; + + // Or ... we could move the declaration of the result buffer outside the timing loop, and + // re-use the allocation (with the caveat that we will be stomping over its contents on each + // execution). + Buffer result(1024, 1024); + + count = 0; + t = benchmark(samples, iterations, [&]() { + pipeline.realize(result); + ++count; + }); + + // On a MacBook Pro M1, this should be much more efficient ... ~200000 times/sec (or 10-12% faster). + std::cout << "Execute Pipeline (re-use buffer with realize): " << int(count / t) << " times/sec\n"; + } + + // Alternatively, we could compile to a Callable object... + { + Pipeline pipeline = make_pipeline(); + const Target target = get_jit_target_from_environment(); + + // Here, we can ask the pipeline for its argument list (these are either Params, + // ImageParams, or Buffers) so that we can construct a Callable object with the same + // calling convention. + auto arguments = pipeline.infer_arguments(); + + // The Callable object acts as a convenient way of invoking the compiled code like + // a function call, using an argv-like syntax for the argument list. It also caches + // the JIT compiled code, so there's no code generation overhead when invoking the + // callable object and executing the pipeline. + Callable callable = pipeline.compile_to_callable(arguments, target); + + // Again, we'll pre-allocate and re-use the result buffer. + Buffer result(1024, 1024); + + size_t count = 0; + double t = benchmark(samples, iterations, [&]() { + callable(result); + ++count; + }); + + // This should be about the same as the previous run (about ~200000 times/sec). + std::cout << "Execute Pipeline (compile to callable): " << int(count / t) << " times/sec\n"; + + // Perhaps even more convient, we can create a std::function object from the callable, + // which allows cleaner type checking for the parameters, and slightly less overhead + // for invoking the function. The list used for the template parameters needs to match + // the list for the parameters of the pipeline. Here, we have a single result buffer, + // so we specify Buffer in our call to .make_std_function<>. If we had other + // scalar parameters, input buffers or output buffers, we'd pass them in the template + // parameter list too. + auto function = callable.make_std_function>(); + + count = 0; + t = benchmark(samples, iterations, [&]() { + function(result); + ++count; + }); + + // On a MacBook Pro M1, this should be slightly more efficient than the callable (~1% faster). + std::cout << "Execute Pipeline (compile to std::function): " << int(count / t) << " times/sec\n"; + } + + // Let's see how much time is spent on just compiling... + { + Pipeline pipeline = make_pipeline(); + + // Only the first call to compile_jit() is expensive ... after the code is generated, + // it gets stored in a cache for later re-use, so repeatedly calling compile_jit has + // very little overhead after its been cached. + + size_t count = 0; + double t = benchmark(samples, iterations, [&]() { + pipeline.compile_jit(); + ++count; + }); + + // Only the first call does any work and the rest are essentially free. + // On a MacBook Pro M1, we should expect ~2 billion times/sec. + std::cout << "Compile JIT (using cache): " << int(count / t) << " times/sec\n"; + + // You can invalidate the cache manually, which will destroy all the compiled state. 
+ count = 0; + t = benchmark(samples, iterations, [&]() { + pipeline.invalidate_cache(); + pipeline.compile_jit(); + ++count; + }); + + // This is an intentionally expensive loop, and very slow! + // On a MacBook Pro M1, we should see only ~2000 times/sec. + std::cout << "Compile JIT (from scratch): " << int(count / t) << " times/sec\n"; + } + + // Alternatively we could compile to a Module... + { + Pipeline pipeline = make_pipeline(); + auto args = pipeline.infer_arguments(); + + // Compiling to a module generates a self-contained Module containing an internal-representation + // of the lowered code suitable for further compilation. So, it's not directly + // runnable, but it can be used to link/combine Modules and generate object files, + // static libs, bitcode, etc. + + size_t count = 0; + double t = benchmark(samples, iterations, [&]() { + Module m = pipeline.compile_to_module(args, "transpose"); + ++count; + }); + + // On a MacBook Pro M1, this should be around ~10000 times/sec + std::cout << "Compile to Module: " << int(count / t) << " times/sec\n"; + } + + printf("DONE!\n"); + return 0; +}
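For reference, the new feature is selected the same way as the existing AVX-512 flags. Below is a minimal, illustrative sketch (not part of the patch) that ahead-of-time compiles a pipeline for the Zen4 feature set, using the same target string the updated simd_op_check_x86 test list exercises; the Func, file, and function names are placeholders.

    #include "Halide.h"
    using namespace Halide;

    int main() {
        // Feature string matching the new entry in the simd_op_check_x86 test list;
        // "avx512_zen4" is the flag this patch registers in feature_name_map.
        Target t("x86-64-linux-sse41-avx-f16c-avx2-avx512-avx512_skylake-avx512_cannonlake-avx512_zen4");

        Func f("f");
        Var x("x");
        f(x) = cast<float>(x) * 2.0f;
        f.vectorize(x, 16);

        // Ahead-of-time compile for the Zen4 feature set.
        f.compile_to_static_library("f_zen4", {}, "f_zen4", t);
        return 0;
    }

From Python, the corresponding enum value is exposed as hl.TargetFeature.AVX512_Zen4 per the PyEnums.cpp change above.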
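The new user_assert in Stage::split() (exercised by test/error/predicate_loads_used_in_inner_splits.cpp above) only rejects TailStrategy::PredicateLoads on a Var that stems from the inner Var of a prior split. A hedged sketch of a schedule that remains legal under the new check; the split factors and variable names are illustrative, not taken from the patch.

    #include "Halide.h"
    using namespace Halide;

    int main() {
        Func f("f");
        Var x("x"), xo("xo"), xi("xi"), xio("xio"), xii("xii");
        f(x) = x;

        // PredicateLoads is applied to the outermost split of x, which the new
        // check still allows; only the subsequent split of the inner var xi
        // needs a different tail strategy.
        f.split(x, xo, xi, 8, TailStrategy::PredicateLoads)
            .split(xi, xio, xii, 4, TailStrategy::GuardWithIf);

        // 128 is a multiple of the split factors, so no tail case triggers.
        f.realize({128});
        return 0;
    }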
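The F16C path added to CodeGen_X86::visit(const Cast *) means plain float16 <-> float32 casts no longer require full float16 arithmetic support. A small sketch of assumed usage (not from the patch) that should exercise the vcvtps2ph/vcvtph2ps conversions the updated simd_op_check test looks for; the ImageParam, Func, and output filename are arbitrary.

    #include "Halide.h"
    using namespace Halide;

    int main() {
        // Same f16c-bearing target string used in the updated test list.
        Target t("x86-64-linux-sse41-avx-f16c");

        ImageParam in(Float(32), 1, "in");
        Func roundtrip("roundtrip");
        Var x("x");

        // Narrow to float16 and widen back; with F16C these casts are emitted
        // as plain LLVM fpcasts rather than via the old uint16 workaround.
        roundtrip(x) = cast<float>(cast<float16_t>(in(x)));
        roundtrip.vectorize(x, 8);

        // Emit LLVM assembly to inspect the generated conversions.
        roundtrip.compile_to_llvm_assembly("roundtrip.ll", {in}, "roundtrip", t);
        return 0;
    }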